regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105 #ifndef MIN
 106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 107 #endif
 108
 109 /* this is a chain of data about sub patterns we are processing that
 110    need to be handled separately/specially in study_chunk. Its so
 111    we can simulate recursion without losing state.  */
 112 struct scan_frame;
 113 typedef struct scan_frame {
 114     regnode *last_regnode;      /* last node to process in this frame */
 115     regnode *next_regnode;      /* next node to process when last is reached */
 116     U32 prev_recursed_depth;
 117     I32 stopparen;              /* what stopparen do we use */
 118     U32 is_top_frame;           /* what flags do we use? */
 119
 120     struct scan_frame *this_prev_frame; /* this previous frame */
 121     struct scan_frame *prev_frame;      /* previous frame */
 122     struct scan_frame *next_frame;      /* next frame */
 123 } scan_frame;
 124
 125 struct RExC_state_t {
 126     U32         flags;                  /* RXf_* are we folding, multilining? */
 127     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 128     char        *precomp;               /* uncompiled string. */
 129     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 130     regexp      *rx;                    /* perl core regexp structure */
 131     regexp_internal     *rxi;           /* internal data for regexp object
 132                                            pprivate field */
 133     char        *start;                 /* Start of input for compile */
 134     char        *end;                   /* End of input for compile */
 135     char        *parse;                 /* Input-scan pointer. */
 136     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 137     regnode     *emit_start;            /* Start of emitted-code area */
 138     regnode     *emit_bound;            /* First regnode outside of the
 139                                            allocated space */
 140     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 141                                            implies compiling, so don't emit */
 142     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 143                                            large enough for the largest
 144                                            non-EXACTish node, so can use it as
 145                                            scratch in pass1 */
 146     I32         naughty;                /* How bad is this pattern? */
 147     I32         sawback;                /* Did we see \1, ...? */
 148     U32         seen;
 149     SSize_t     size;                   /* Code size. */
 150     I32                npar;            /* Capture buffer count, (OPEN) plus
 151                                            one. ("par" 0 is the whole
 152                                            pattern)*/
 153     I32         nestroot;               /* root parens we are in - used by
 154                                            accept */
 155     I32         extralen;
 156     I32         seen_zerolen;
 157     regnode     **open_parens;          /* pointers to open parens */
 158     regnode     **close_parens;         /* pointers to close parens */
 159     regnode     *opend;                 /* END node in program */
 160     I32         utf8;           /* whether the pattern is utf8 or not */
 161     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 162                                 /* XXX use this for future optimisation of case
 163                                  * where pattern must be upgraded to utf8. */
 164     I32         uni_semantics;  /* If a d charset modifier should use unicode
 165                                    rules, even if the pattern is not in
 166                                    utf8 */
 167     HV          *paren_names;           /* Paren names */
 168
 169     regnode     **recurse;              /* Recurse regops */
 170     I32         recurse_count;          /* Number of recurse regops */
 171     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 172                                            through */
 173     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 174     I32         in_lookbehind;
 175     I32         contains_locale;
 176     I32         contains_i;
 177     I32         override_recoding;
 178     I32         in_multi_char_class;
 179     struct reg_code_block *code_blocks; /* positions of literal (?{})
 180                                             within pattern */
 181     int         num_code_blocks;        /* size of code_blocks[] */
 182     int         code_index;             /* next code_blocks[] slot */
 183     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 184     scan_frame *frame_head;
 185     scan_frame *frame_last;
 186     U32         frame_count;
 187 #ifdef ADD_TO_REGEXEC
 188     char        *starttry;              /* -Dr: where regtry was called. */
 189 #define RExC_starttry   (pRExC_state->starttry)
 190 #endif
 191     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 192 #ifdef DEBUGGING
 193     const char  *lastparse;
 194     I32         lastnum;
 195     AV          *paren_name_list;       /* idx -> name */
 196     U32         study_chunk_recursed_count;
 197     SV          *mysv1;
 198     SV          *mysv2;
 199 #define RExC_lastparse  (pRExC_state->lastparse)
 200 #define RExC_lastnum    (pRExC_state->lastnum)
 201 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 202 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 203 #define RExC_mysv       (pRExC_state->mysv1)
 204 #define RExC_mysv1      (pRExC_state->mysv1)
 205 #define RExC_mysv2      (pRExC_state->mysv2)
 206
 207 #endif
 208 };
 209
 210 #define RExC_flags      (pRExC_state->flags)
 211 #define RExC_pm_flags   (pRExC_state->pm_flags)
 212 #define RExC_precomp    (pRExC_state->precomp)
 213 #define RExC_rx_sv      (pRExC_state->rx_sv)
 214 #define RExC_rx         (pRExC_state->rx)
 215 #define RExC_rxi        (pRExC_state->rxi)
 216 #define RExC_start      (pRExC_state->start)
 217 #define RExC_end        (pRExC_state->end)
 218 #define RExC_parse      (pRExC_state->parse)
 219 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 220 #ifdef RE_TRACK_PATTERN_OFFSETS
 221 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 222                                                          others */
 223 #endif
 224 #define RExC_emit       (pRExC_state->emit)
 225 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 226 #define RExC_emit_start (pRExC_state->emit_start)
 227 #define RExC_emit_bound (pRExC_state->emit_bound)
 228 #define RExC_sawback    (pRExC_state->sawback)
 229 #define RExC_seen       (pRExC_state->seen)
 230 #define RExC_size       (pRExC_state->size)
 231 #define RExC_maxlen        (pRExC_state->maxlen)
 232 #define RExC_npar       (pRExC_state->npar)
 233 #define RExC_nestroot   (pRExC_state->nestroot)
 234 #define RExC_extralen   (pRExC_state->extralen)
 235 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 236 #define RExC_utf8       (pRExC_state->utf8)
 237 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 238 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 239 #define RExC_open_parens        (pRExC_state->open_parens)
 240 #define RExC_close_parens       (pRExC_state->close_parens)
 241 #define RExC_opend      (pRExC_state->opend)
 242 #define RExC_paren_names        (pRExC_state->paren_names)
 243 #define RExC_recurse    (pRExC_state->recurse)
 244 #define RExC_recurse_count      (pRExC_state->recurse_count)
 245 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 246 #define RExC_study_chunk_recursed_bytes  \
 247                                    (pRExC_state->study_chunk_recursed_bytes)
 248 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 249 #define RExC_contains_locale    (pRExC_state->contains_locale)
 250 #define RExC_contains_i (pRExC_state->contains_i)
 251 #define RExC_override_recoding (pRExC_state->override_recoding)
 252 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 253 #define RExC_frame_head (pRExC_state->frame_head)
 254 #define RExC_frame_last (pRExC_state->frame_last)
 255 #define RExC_frame_count (pRExC_state->frame_count)
 256
 257 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 258  * a flag to disable back-off on the fixed/floating substrings - if it's
 259  * a high complexity pattern we assume the benefit of avoiding a full match
 260  * is worth the cost of checking for the substrings even if they rarely help.
 261  */
 262 #define RExC_naughty    (pRExC_state->naughty)
 263 #define TOO_NAUGHTY (10)
 264 #define MARK_NAUGHTY(add) \
 265     if (RExC_naughty < TOO_NAUGHTY) \
 266         RExC_naughty += (add)
 267 #define MARK_NAUGHTY_EXP(exp, add) \
 268     if (RExC_naughty < TOO_NAUGHTY) \
 269         RExC_naughty += RExC_naughty / (exp) + (add)
 270
 271 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 272 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 273         ((*s) == '{' && regcurly(s)))
 274
 275 /*
 276  * Flags to be passed up and down.
 277  */
 278 #define WORST           0       /* Worst case. */
 279 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 280
 281 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 282  * character.  (There needs to be a case: in the switch statement in regexec.c
 283  * for any node marked SIMPLE.)  Note that this is not the same thing as
 284  * REGNODE_SIMPLE */
 285 #define SIMPLE          0x02
 286 #define SPSTART         0x04    /* Starts with * or + */
 287 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 288 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 289 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 290
 291 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 292
 293 /* whether trie related optimizations are enabled */
 294 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 295 #define TRIE_STUDY_OPT
 296 #define FULL_TRIE_STUDY
 297 #define TRIE_STCLASS
 298 #endif
 299
 300
 301
 302 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 303 #define PBITVAL(paren) (1 << ((paren) & 7))
 304 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 305 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 306 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 307
 308 #define REQUIRE_UTF8    STMT_START {                                       \
 309                                      if (!UTF) {                           \
 310                                          *flagp = RESTART_UTF8;            \
 311                                          return NULL;                      \
 312                                      }                                     \
 313                         } STMT_END
 314
 315 /* This converts the named class defined in regcomp.h to its equivalent class
 316  * number defined in handy.h. */
 317 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 318 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 319
 320 #define _invlist_union_complement_2nd(a, b, output) \
 321                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 322 #define _invlist_intersection_complement_2nd(a, b, output) \
 323                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 324
 325 /* About scan_data_t.
 326
 327   During optimisation we recurse through the regexp program performing
 328   various inplace (keyhole style) optimisations. In addition study_chunk
 329   and scan_commit populate this data structure with information about
 330   what strings MUST appear in the pattern. We look for the longest
 331   string that must appear at a fixed location, and we look for the
 332   longest string that may appear at a floating location. So for instance
 333   in the pattern:
 334
 335     /FOO[xX]A.*B[xX]BAR/
 336
 337   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 338   strings (because they follow a .* construct). study_chunk will identify
 339   both FOO and BAR as being the longest fixed and floating strings respectively.
 340
 341   The strings can be composites, for instance
 342
 343      /(f)(o)(o)/
 344
 345   will result in a composite fixed substring 'foo'.
 346
 347   For each string some basic information is maintained:
 348
 349   - offset or min_offset
 350     This is the position the string must appear at, or not before.
 351     It also implicitly (when combined with minlenp) tells us how many
 352     characters must match before the string we are searching for.
 353     Likewise when combined with minlenp and the length of the string it
 354     tells us how many characters must appear after the string we have
 355     found.
 356
 357   - max_offset
 358     Only used for floating strings. This is the rightmost point that
 359     the string can appear at. If set to SSize_t_MAX it indicates that the
 360     string can occur infinitely far to the right.
 361
 362   - minlenp
 363     A pointer to the minimum number of characters of the pattern that the
 364     string was found inside. This is important as in the case of positive
 365     lookahead or positive lookbehind we can have multiple patterns
 366     involved. Consider
 367
 368     /(?=FOO).*F/
 369
 370     The minimum length of the pattern overall is 3, the minimum length
 371     of the lookahead part is 3, but the minimum length of the part that
 372     will actually match is 1. So 'FOO's minimum length is 3, but the
 373     minimum length for the F is 1. This is important as the minimum length
 374     is used to determine offsets in front of and behind the string being
 375     looked for.  Since strings can be composites this is the length of the
 376     pattern at the time it was committed with a scan_commit. Note that
 377     the length is calculated by study_chunk, so that the minimum lengths
 378     are not known until the full pattern has been compiled, thus the
 379     pointer to the value.
 380
 381   - lookbehind
 382
 383     In the case of lookbehind the string being searched for can be
 384     offset past the start point of the final matching string.
 385     If this value was just blithely removed from the min_offset it would
 386     invalidate some of the calculations for how many chars must match
 387     before or after (as they are derived from min_offset and minlen and
 388     the length of the string being searched for).
 389     When the final pattern is compiled and the data is moved from the
 390     scan_data_t structure into the regexp structure the information
 391     about lookbehind is factored in, with the information that would
 392     have been lost precalculated in the end_shift field for the
 393     associated string.
 394
 395   The fields pos_min and pos_delta are used to store the minimum offset
 396   and the delta to the maximum offset at the current point in the pattern.
 397
 398 */
 399
 400 typedef struct scan_data_t {
 401     /*I32 len_min;      unused */
 402     /*I32 len_delta;    unused */
 403     SSize_t pos_min;
 404     SSize_t pos_delta;
 405     SV *last_found;
 406     SSize_t last_end;       /* min value, <0 unless valid. */
 407     SSize_t last_start_min;
 408     SSize_t last_start_max;
 409     SV **longest;           /* Either &l_fixed, or &l_float. */
 410     SV *longest_fixed;      /* longest fixed string found in pattern */
 411     SSize_t offset_fixed;   /* offset where it starts */
 412     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 413     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 414     SV *longest_float;      /* longest floating string found in pattern */
 415     SSize_t offset_float_min; /* earliest point in string it can appear */
 416     SSize_t offset_float_max; /* latest point in string it can appear */
 417     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 418     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 419     I32 flags;
 420     I32 whilem_c;
 421     SSize_t *last_closep;
 422     regnode_ssc *start_class;
 423 } scan_data_t;
 424
 425 /*
 426  * Forward declarations for pregcomp()'s friends.
 427  */
 428
 429 static const scan_data_t zero_scan_data =
 430   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 431
 432 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 433 #define SF_BEFORE_SEOL          0x0001
 434 #define SF_BEFORE_MEOL          0x0002
 435 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 436 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 437
 438 #define SF_FIX_SHIFT_EOL        (+2)
 439 #define SF_FL_SHIFT_EOL         (+4)
 440
 441 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 442 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 443
 444 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 445 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 446 #define SF_IS_INF               0x0040
 447 #define SF_HAS_PAR              0x0080
 448 #define SF_IN_PAR               0x0100
 449 #define SF_HAS_EVAL             0x0200
 450 #define SCF_DO_SUBSTR           0x0400
 451 #define SCF_DO_STCLASS_AND      0x0800
 452 #define SCF_DO_STCLASS_OR       0x1000
 453 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 454 #define SCF_WHILEM_VISITED_POS  0x2000
 455
 456 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 457 #define SCF_SEEN_ACCEPT         0x8000
 458 #define SCF_TRIE_DOING_RESTUDY 0x10000
 459 #define SCF_IN_DEFINE          0x20000
 460
 461
 462
 463
 464 #define UTF cBOOL(RExC_utf8)
 465
 466 /* The enums for all these are ordered so things work out correctly */
 467 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 468 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 469                                                      == REGEX_DEPENDS_CHARSET)
 470 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 471 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 472                                                      >= REGEX_UNICODE_CHARSET)
 473 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 474                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 475 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 476                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 477 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 478                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 479
 480 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 481
 482 /* For programs that want to be strictly Unicode compatible by dying if any
 483  * attempt is made to match a non-Unicode code point against a Unicode
 484  * property.  */
 485 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 486
 487 #define OOB_NAMEDCLASS          -1
 488
 489 /* There is no code point that is out-of-bounds, so this is problematic.  But
 490  * its only current use is to initialize a variable that is always set before
 491  * looked at. */
 492 #define OOB_UNICODE             0xDEADBEEF
 493
 494 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 495 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 496
 497
 498 /* length of regex to show in messages that don't mark a position within */
 499 #define RegexLengthToShowInErrorMessages 127
 500
 501 /*
 502  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 503  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 504  * op/pragma/warn/regcomp.
 505  */
 506 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 507 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 508
 509 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 510                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 511
 512 #define REPORT_LOCATION_ARGS(offset)            \
 513                 UTF8fARG(UTF, offset, RExC_precomp), \
 514                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 515
 516 /*
 517  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 518  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 519  * "...".
 520  */
 521 #define _FAIL(code) STMT_START {                                        \
 522     const char *ellipses = "";                                          \
 523     IV len = RExC_end - RExC_precomp;                                   \
 524                                                                         \
 525     if (!SIZE_ONLY)                                                     \
 526         SAVEFREESV(RExC_rx_sv);                                         \
 527     if (len > RegexLengthToShowInErrorMessages) {                       \
 528         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 529         len = RegexLengthToShowInErrorMessages - 10;                    \
 530         ellipses = "...";                                               \
 531     }                                                                   \
 532     code;                                                               \
 533 } STMT_END
 534
 535 #define FAIL(msg) _FAIL(                            \
 536     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 537             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 538
 539 #define FAIL2(msg,arg) _FAIL(                       \
 540     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 541             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 542
 543 /*
 544  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 545  */
 546 #define Simple_vFAIL(m) STMT_START {                                    \
 547     const IV offset =                                                   \
 548         (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
 549     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 550             m, REPORT_LOCATION_ARGS(offset));   \
 551 } STMT_END
 552
 553 /*
 554  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 555  */
 556 #define vFAIL(m) STMT_START {                           \
 557     if (!SIZE_ONLY)                                     \
 558         SAVEFREESV(RExC_rx_sv);                         \
 559     Simple_vFAIL(m);                                    \
 560 } STMT_END
 561
 562 /*
 563  * Like Simple_vFAIL(), but accepts two arguments.
 564  */
 565 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 566     const IV offset = RExC_parse - RExC_precomp;                        \
 567     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 568                       REPORT_LOCATION_ARGS(offset));    \
 569 } STMT_END
 570
 571 /*
 572  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 573  */
 574 #define vFAIL2(m,a1) STMT_START {                       \
 575     if (!SIZE_ONLY)                                     \
 576         SAVEFREESV(RExC_rx_sv);                         \
 577     Simple_vFAIL2(m, a1);                               \
 578 } STMT_END
 579
 580
 581 /*
 582  * Like Simple_vFAIL(), but accepts three arguments.
 583  */
 584 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 585     const IV offset = RExC_parse - RExC_precomp;                \
 586     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 587             REPORT_LOCATION_ARGS(offset));      \
 588 } STMT_END
 589
 590 /*
 591  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 592  */
 593 #define vFAIL3(m,a1,a2) STMT_START {                    \
 594     if (!SIZE_ONLY)                                     \
 595         SAVEFREESV(RExC_rx_sv);                         \
 596     Simple_vFAIL3(m, a1, a2);                           \
 597 } STMT_END
 598
 599 /*
 600  * Like Simple_vFAIL(), but accepts four arguments.
 601  */
 602 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 603     const IV offset = RExC_parse - RExC_precomp;                \
 604     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 605             REPORT_LOCATION_ARGS(offset));      \
 606 } STMT_END
 607
 608 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 609     if (!SIZE_ONLY)                                     \
 610         SAVEFREESV(RExC_rx_sv);                         \
 611     Simple_vFAIL4(m, a1, a2, a3);                       \
 612 } STMT_END
 613
 614 /* A specialized version of vFAIL2 that works with UTF8f */
 615 #define vFAIL2utf8f(m, a1) STMT_START { \
 616     const IV offset = RExC_parse - RExC_precomp;   \
 617     if (!SIZE_ONLY)                                \
 618         SAVEFREESV(RExC_rx_sv);                    \
 619     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 620             REPORT_LOCATION_ARGS(offset));         \
 621 } STMT_END
 622
 623 /* These have asserts in them because of [perl #122671] Many warnings in
 624  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 625  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 626  * would get output again.  So they should be output in pass2, and these
 627  * asserts make sure new warnings follow that paradigm. */
 628
 629 /* m is not necessarily a "literal string", in this macro */
 630 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 631     const IV offset = loc - RExC_precomp;                               \
 632     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 633             m, REPORT_LOCATION_ARGS(offset));       \
 634 } STMT_END
 635
 636 #define ckWARNreg(loc,m) STMT_START {                                   \
 637     const IV offset = loc - RExC_precomp;                               \
 638     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 639             REPORT_LOCATION_ARGS(offset));              \
 640 } STMT_END
 641
 642 #define vWARN_dep(loc, m) STMT_START {                                  \
 643     const IV offset = loc - RExC_precomp;                               \
 644     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,    \
 645             REPORT_LOCATION_ARGS(offset));              \
 646 } STMT_END
 647
 648 #define ckWARNdep(loc,m) STMT_START {                                   \
 649     const IV offset = loc - RExC_precomp;                               \
 650     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                  \
 651             m REPORT_LOCATION,                                          \
 652             REPORT_LOCATION_ARGS(offset));              \
 653 } STMT_END
 654
 655 #define ckWARNregdep(loc,m) STMT_START {                                \
 656     const IV offset = loc - RExC_precomp;                               \
 657     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),    \
 658             m REPORT_LOCATION,                                          \
 659             REPORT_LOCATION_ARGS(offset));              \
 660 } STMT_END
 661
 662 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 663     const IV offset = loc - RExC_precomp;                               \
 664     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                      \
 665             m REPORT_LOCATION,                                          \
 666             a1, REPORT_LOCATION_ARGS(offset));  \
 667 } STMT_END
 668
 669 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 670     const IV offset = loc - RExC_precomp;                               \
 671     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 672             a1, REPORT_LOCATION_ARGS(offset));  \
 673 } STMT_END
 674
 675 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 676     const IV offset = loc - RExC_precomp;                               \
 677     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 678             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 679 } STMT_END
 680
 681 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 682     const IV offset = loc - RExC_precomp;                               \
 683     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 684             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 685 } STMT_END
 686
 687 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 688     const IV offset = loc - RExC_precomp;                               \
 689     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 690             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 691 } STMT_END
 692
 693 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 694     const IV offset = loc - RExC_precomp;                               \
 695     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 696             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 697 } STMT_END
 698
 699 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 700     const IV offset = loc - RExC_precomp;                               \
 701     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 702             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 703 } STMT_END
 704
 705 /* Macros for recording node offsets.   20001227 mjd@plover.com
 706  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 707  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 708  * Element 0 holds the number n.
 709  * Position is 1 indexed.
 710  */
 711 #ifndef RE_TRACK_PATTERN_OFFSETS
 712 #define Set_Node_Offset_To_R(node,byte)
 713 #define Set_Node_Offset(node,byte)
 714 #define Set_Cur_Node_Offset
 715 #define Set_Node_Length_To_R(node,len)
 716 #define Set_Node_Length(node,len)
 717 #define Set_Node_Cur_Length(node,start)
 718 #define Node_Offset(n)
 719 #define Node_Length(n)
 720 #define Set_Node_Offset_Length(node,offset,len)
 721 #define ProgLen(ri) ri->u.proglen
 722 #define SetProgLen(ri,x) ri->u.proglen = x
 723 #else
 724 #define ProgLen(ri) ri->u.offsets[0]
 725 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 726 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 727     if (! SIZE_ONLY) {                                                  \
 728         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 729                     __LINE__, (int)(node), (int)(byte)));               \
 730         if((node) < 0) {                                                \
 731             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 732                                          (int)(node));                  \
 733         } else {                                                        \
 734             RExC_offsets[2*(node)-1] = (byte);                          \
 735         }                                                               \
 736     }                                                                   \
 737 } STMT_END
 738
 739 #define Set_Node_Offset(node,byte) \
 740     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 741 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 742
 743 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 744     if (! SIZE_ONLY) {                                                  \
 745         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 746                 __LINE__, (int)(node), (int)(len)));                    \
 747         if((node) < 0) {                                                \
 748             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 749                                          (int)(node));                  \
 750         } else {                                                        \
 751             RExC_offsets[2*(node)] = (len);                             \
 752         }                                                               \
 753     }                                                                   \
 754 } STMT_END
 755
 756 #define Set_Node_Length(node,len) \
 757     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 758 #define Set_Node_Cur_Length(node, start)                \
 759     Set_Node_Length(node, RExC_parse - start)
 760
 761 /* Get offsets and lengths */
 762 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 763 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 764
 765 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 766     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 767     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 768 } STMT_END
 769 #endif
 770
 771 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 772 #define EXPERIMENTAL_INPLACESCAN
 773 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 774
 775 #define DEBUG_RExC_seen() \
 776         DEBUG_OPTIMISE_MORE_r({                                             \
 777             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 778                                                                             \
 779             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 780                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 781                                                                             \
 782             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 783                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 784                                                                             \
 785             if (RExC_seen & REG_GPOS_SEEN)                                  \
 786                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 787                                                                             \
 788             if (RExC_seen & REG_CANY_SEEN)                                  \
 789                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 790                                                                             \
 791             if (RExC_seen & REG_RECURSE_SEEN)                               \
 792                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 793                                                                             \
 794             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 795                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 796                                                                             \
 797             if (RExC_seen & REG_VERBARG_SEEN)                               \
 798                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 799                                                                             \
 800             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 801                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 802                                                                             \
 803             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 804                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 805                                                                             \
 806             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 807                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 808                                                                             \
 809             if (RExC_seen & REG_GOSTART_SEEN)                               \
 810                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 811                                                                             \
 812             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 813                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 814                                                                             \
 815             PerlIO_printf(Perl_debug_log,"\n");                             \
 816         });
 817
 818 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 819   if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
 820
 821 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str)                    \
 822     if ( ( flags ) ) {                                                      \
 823         PerlIO_printf(Perl_debug_log, "%s", open_str);                      \
 824         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL);                     \
 825         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL);                     \
 826         DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF);                             \
 827         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR);                            \
 828         DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR);                             \
 829         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL);                           \
 830         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR);                         \
 831         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND);                    \
 832         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR);                     \
 833         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS);                        \
 834         DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS);                \
 835         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY);                      \
 836         DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT);                       \
 837         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY);                \
 838         DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE);                         \
 839         PerlIO_printf(Perl_debug_log, "%s", close_str);                     \
 840     }
 841
 842
 843 #define DEBUG_STUDYDATA(str,data,depth)                              \
 844 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 845     PerlIO_printf(Perl_debug_log,                                    \
 846         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 847         " Flags: 0x%"UVXf,                                           \
 848         (int)(depth)*2, "",                                          \
 849         (IV)((data)->pos_min),                                       \
 850         (IV)((data)->pos_delta),                                     \
 851         (UV)((data)->flags)                                          \
 852     );                                                               \
 853     DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]");                 \
 854     PerlIO_printf(Perl_debug_log,                                    \
 855         " Whilem_c: %"IVdf" Lcp: %"IVdf" %s",                        \
 856         (IV)((data)->whilem_c),                                      \
 857         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 858         is_inf ? "INF " : ""                                         \
 859     );                                                               \
 860     if ((data)->last_found)                                          \
 861         PerlIO_printf(Perl_debug_log,                                \
 862             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 863             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 864             SvPVX_const((data)->last_found),                         \
 865             (IV)((data)->last_end),                                  \
 866             (IV)((data)->last_start_min),                            \
 867             (IV)((data)->last_start_max),                            \
 868             ((data)->longest &&                                      \
 869              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 870             SvPVX_const((data)->longest_fixed),                      \
 871             (IV)((data)->offset_fixed),                              \
 872             ((data)->longest &&                                      \
 873              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 874             SvPVX_const((data)->longest_float),                      \
 875             (IV)((data)->offset_float_min),                          \
 876             (IV)((data)->offset_float_max)                           \
 877         );                                                           \
 878     PerlIO_printf(Perl_debug_log,"\n");                              \
 879 });
 880
 881 #ifdef DEBUGGING
 882
 883 /* is c a control character for which we have a mnemonic? */
 884 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 885
 886 STATIC const char *
 887 S_cntrl_to_mnemonic(const U8 c)
 888 {
 889     /* Returns the mnemonic string that represents character 'c', if one
 890      * exists; NULL otherwise.  The only ones that exist for the purposes of
 891      * this routine are a few control characters */
 892
 893     switch (c) {
 894         case '\a':       return "\\a";
 895         case '\b':       return "\\b";
 896         case ESC_NATIVE: return "\\e";
 897         case '\f':       return "\\f";
 898         case '\n':       return "\\n";
 899         case '\r':       return "\\r";
 900         case '\t':       return "\\t";
 901     }
 902
 903     return NULL;
 904 }
 905
 906 #endif
 907
 908 /* Mark that we cannot extend a found fixed substring at this point.
 909    Update the longest found anchored substring and the longest found
 910    floating substrings if needed. */
 911
 912 STATIC void
 913 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 914                     SSize_t *minlenp, int is_inf)
 915 {
 916     const STRLEN l = CHR_SVLEN(data->last_found);
 917     const STRLEN old_l = CHR_SVLEN(*data->longest);
 918     GET_RE_DEBUG_FLAGS_DECL;
 919
 920     PERL_ARGS_ASSERT_SCAN_COMMIT;
 921
 922     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 923         SvSetMagicSV(*data->longest, data->last_found);
 924         if (*data->longest == data->longest_fixed) {
 925             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 926             if (data->flags & SF_BEFORE_EOL)
 927                 data->flags
 928                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 929             else
 930                 data->flags &= ~SF_FIX_BEFORE_EOL;
 931             data->minlen_fixed=minlenp;
 932             data->lookbehind_fixed=0;
 933         }
 934         else { /* *data->longest == data->longest_float */
 935             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 936             data->offset_float_max = (l
 937                           ? data->last_start_max
 938                           : (data->pos_delta > SSize_t_MAX - data->pos_min
 939                                          ? SSize_t_MAX
 940                                          : data->pos_min + data->pos_delta));
 941             if (is_inf
 942                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 943                 data->offset_float_max = SSize_t_MAX;
 944             if (data->flags & SF_BEFORE_EOL)
 945                 data->flags
 946                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 947             else
 948                 data->flags &= ~SF_FL_BEFORE_EOL;
 949             data->minlen_float=minlenp;
 950             data->lookbehind_float=0;
 951         }
 952     }
 953     SvCUR_set(data->last_found, 0);
 954     {
 955         SV * const sv = data->last_found;
 956         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 957             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 958             if (mg)
 959                 mg->mg_len = 0;
 960         }
 961     }
 962     data->last_end = -1;
 963     data->flags &= ~SF_BEFORE_EOL;
 964     DEBUG_STUDYDATA("commit: ",data,0);
 965 }
 966
 967 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 968  * list that describes which code points it matches */
 969
 970 STATIC void
 971 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 972 {
 973     /* Set the SSC 'ssc' to match an empty string or any code point */
 974
 975     PERL_ARGS_ASSERT_SSC_ANYTHING;
 976
 977     assert(is_ANYOF_SYNTHETIC(ssc));
 978
 979     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 980     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 981     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
 982 }
 983
 984 STATIC int
 985 S_ssc_is_anything(const regnode_ssc *ssc)
 986 {
 987     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 988      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 989      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 990      * in any way, so there's no point in using it */
 991
 992     UV start, end;
 993     bool ret;
 994
 995     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 996
 997     assert(is_ANYOF_SYNTHETIC(ssc));
 998
 999     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1000         return FALSE;
1001     }
1002
1003     /* See if the list consists solely of the range 0 - Infinity */
1004     invlist_iterinit(ssc->invlist);
1005     ret = invlist_iternext(ssc->invlist, &start, &end)
1006           && start == 0
1007           && end == UV_MAX;
1008
1009     invlist_iterfinish(ssc->invlist);
1010
1011     if (ret) {
1012         return TRUE;
1013     }
1014
1015     /* If e.g., both \w and \W are set, matches everything */
1016     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1017         int i;
1018         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1019             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1020                 return TRUE;
1021             }
1022         }
1023     }
1024
1025     return FALSE;
1026 }
1027
1028 STATIC void
1029 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1030 {
1031     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1032      * string, any code point, or any posix class under locale */
1033
1034     PERL_ARGS_ASSERT_SSC_INIT;
1035
1036     Zero(ssc, 1, regnode_ssc);
1037     set_ANYOF_SYNTHETIC(ssc);
1038     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1039     ssc_anything(ssc);
1040
1041     /* If any portion of the regex is to operate under locale rules that aren't
1042      * fully known at compile time, initialization includes it.  The reason
1043      * this isn't done for all regexes is that the optimizer was written under
1044      * the assumption that locale was all-or-nothing.  Given the complexity and
1045      * lack of documentation in the optimizer, and that there are inadequate
1046      * test cases for locale, many parts of it may not work properly, it is
1047      * safest to avoid locale unless necessary. */
1048     if (RExC_contains_locale) {
1049         ANYOF_POSIXL_SETALL(ssc);
1050     }
1051     else {
1052         ANYOF_POSIXL_ZERO(ssc);
1053     }
1054 }
1055
1056 STATIC int
1057 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1058                         const regnode_ssc *ssc)
1059 {
1060     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1061      * to the list of code points matched, and locale posix classes; hence does
1062      * not check its flags) */
1063
1064     UV start, end;
1065     bool ret;
1066
1067     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1068
1069     assert(is_ANYOF_SYNTHETIC(ssc));
1070
1071     invlist_iterinit(ssc->invlist);
1072     ret = invlist_iternext(ssc->invlist, &start, &end)
1073           && start == 0
1074           && end == UV_MAX;
1075
1076     invlist_iterfinish(ssc->invlist);
1077
1078     if (! ret) {
1079         return FALSE;
1080     }
1081
1082     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1083         return FALSE;
1084     }
1085
1086     return TRUE;
1087 }
1088
1089 STATIC SV*
1090 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1091                                const regnode_charclass* const node)
1092 {
1093     /* Returns a mortal inversion list defining which code points are matched
1094      * by 'node', which is of type ANYOF.  Handles complementing the result if
1095      * appropriate.  If some code points aren't knowable at this time, the
1096      * returned list must, and will, contain every code point that is a
1097      * possibility. */
1098
1099     SV* invlist = sv_2mortal(_new_invlist(0));
1100     SV* only_utf8_locale_invlist = NULL;
1101     unsigned int i;
1102     const U32 n = ARG(node);
1103     bool new_node_has_latin1 = FALSE;
1104
1105     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1106
1107     /* Look at the data structure created by S_set_ANYOF_arg() */
1108     if (n != ANYOF_ONLY_HAS_BITMAP) {
1109         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1110         AV * const av = MUTABLE_AV(SvRV(rv));
1111         SV **const ary = AvARRAY(av);
1112         assert(RExC_rxi->data->what[n] == 's');
1113
1114         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1115             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1116         }
1117         else if (ary[0] && ary[0] != &PL_sv_undef) {
1118
1119             /* Here, no compile-time swash, and there are things that won't be
1120              * known until runtime -- we have to assume it could be anything */
1121             return _add_range_to_invlist(invlist, 0, UV_MAX);
1122         }
1123         else if (ary[3] && ary[3] != &PL_sv_undef) {
1124
1125             /* Here no compile-time swash, and no run-time only data.  Use the
1126              * node's inversion list */
1127             invlist = sv_2mortal(invlist_clone(ary[3]));
1128         }
1129
1130         /* Get the code points valid only under UTF-8 locales */
1131         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1132             && ary[2] && ary[2] != &PL_sv_undef)
1133         {
1134             only_utf8_locale_invlist = ary[2];
1135         }
1136     }
1137
1138     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1139      * code points, and an inversion list for the others, but if there are code
1140      * points that should match only conditionally on the target string being
1141      * UTF-8, those are placed in the inversion list, and not the bitmap.
1142      * Since there are circumstances under which they could match, they are
1143      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1144      * to exclude them here, so that when we invert below, the end result
1145      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1146      * have to do this here before we add the unconditionally matched code
1147      * points */
1148     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1149         _invlist_intersection_complement_2nd(invlist,
1150                                              PL_UpperLatin1,
1151                                              &invlist);
1152     }
1153
1154     /* Add in the points from the bit map */
1155     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1156         if (ANYOF_BITMAP_TEST(node, i)) {
1157             invlist = add_cp_to_invlist(invlist, i);
1158             new_node_has_latin1 = TRUE;
1159         }
1160     }
1161
1162     /* If this can match all upper Latin1 code points, have to add them
1163      * as well */
1164     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1165         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1166     }
1167
1168     /* Similarly for these */
1169     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1170         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1171     }
1172
1173     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1174         _invlist_invert(invlist);
1175     }
1176     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1177
1178         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1179          * locale.  We can skip this if there are no 0-255 at all. */
1180         _invlist_union(invlist, PL_Latin1, &invlist);
1181     }
1182
1183     /* Similarly add the UTF-8 locale possible matches.  These have to be
1184      * deferred until after the non-UTF-8 locale ones are taken care of just
1185      * above, or it leads to wrong results under ANYOF_INVERT */
1186     if (only_utf8_locale_invlist) {
1187         _invlist_union_maybe_complement_2nd(invlist,
1188                                             only_utf8_locale_invlist,
1189                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1190                                             &invlist);
1191     }
1192
1193     return invlist;
1194 }
1195
1196 /* These two functions currently do the exact same thing */
1197 #define ssc_init_zero           ssc_init
1198
1199 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1200 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1201
1202 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1203  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1204  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1205
1206 STATIC void
1207 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1208                 const regnode_charclass *and_with)
1209 {
1210     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1211      * another SSC or a regular ANYOF class.  Can create false positives. */
1212
1213     SV* anded_cp_list;
1214     U8  anded_flags;
1215
1216     PERL_ARGS_ASSERT_SSC_AND;
1217
1218     assert(is_ANYOF_SYNTHETIC(ssc));
1219
1220     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1221      * the code point inversion list and just the relevant flags */
1222     if (is_ANYOF_SYNTHETIC(and_with)) {
1223         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1224         anded_flags = ANYOF_FLAGS(and_with);
1225
1226         /* XXX This is a kludge around what appears to be deficiencies in the
1227          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1228          * there are paths through the optimizer where it doesn't get weeded
1229          * out when it should.  And if we don't make some extra provision for
1230          * it like the code just below, it doesn't get added when it should.
1231          * This solution is to add it only when AND'ing, which is here, and
1232          * only when what is being AND'ed is the pristine, original node
1233          * matching anything.  Thus it is like adding it to ssc_anything() but
1234          * only when the result is to be AND'ed.  Probably the same solution
1235          * could be adopted for the same problem we have with /l matching,
1236          * which is solved differently in S_ssc_init(), and that would lead to
1237          * fewer false positives than that solution has.  But if this solution
1238          * creates bugs, the consequences are only that a warning isn't raised
1239          * that should be; while the consequences for having /l bugs is
1240          * incorrect matches */
1241         if (ssc_is_anything((regnode_ssc *)and_with)) {
1242             anded_flags |= ANYOF_WARN_SUPER;
1243         }
1244     }
1245     else {
1246         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1247         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1248     }
1249
1250     ANYOF_FLAGS(ssc) &= anded_flags;
1251
1252     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1253      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1254      * 'and_with' may be inverted.  When not inverted, we have the situation of
1255      * computing:
1256      *  (C1 | P1) & (C2 | P2)
1257      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1258      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1259      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1260      *                    <=  ((C1 & C2) | P1 | P2)
1261      * Alternatively, the last few steps could be:
1262      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1263      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1264      *                    <=  (C1 | C2 | (P1 & P2))
1265      * We favor the second approach if either P1 or P2 is non-empty.  This is
1266      * because these components are a barrier to doing optimizations, as what
1267      * they match cannot be known until the moment of matching as they are
1268      * dependent on the current locale, 'AND"ing them likely will reduce or
1269      * eliminate them.
1270      * But we can do better if we know that C1,P1 are in their initial state (a
1271      * frequent occurrence), each matching everything:
1272      *  (<everything>) & (C2 | P2) =  C2 | P2
1273      * Similarly, if C2,P2 are in their initial state (again a frequent
1274      * occurrence), the result is a no-op
1275      *  (C1 | P1) & (<everything>) =  C1 | P1
1276      *
1277      * Inverted, we have
1278      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1279      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1280      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1281      * */
1282
1283     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1284         && ! is_ANYOF_SYNTHETIC(and_with))
1285     {
1286         unsigned int i;
1287
1288         ssc_intersection(ssc,
1289                          anded_cp_list,
1290                          FALSE /* Has already been inverted */
1291                          );
1292
1293         /* If either P1 or P2 is empty, the intersection will be also; can skip
1294          * the loop */
1295         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1296             ANYOF_POSIXL_ZERO(ssc);
1297         }
1298         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1299
1300             /* Note that the Posix class component P from 'and_with' actually
1301              * looks like:
1302              *      P = Pa | Pb | ... | Pn
1303              * where each component is one posix class, such as in [\w\s].
1304              * Thus
1305              *      ~P = ~(Pa | Pb | ... | Pn)
1306              *         = ~Pa & ~Pb & ... & ~Pn
1307              *        <= ~Pa | ~Pb | ... | ~Pn
1308              * The last is something we can easily calculate, but unfortunately
1309              * is likely to have many false positives.  We could do better
1310              * in some (but certainly not all) instances if two classes in
1311              * P have known relationships.  For example
1312              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1313              * So
1314              *      :lower: & :print: = :lower:
1315              * And similarly for classes that must be disjoint.  For example,
1316              * since \s and \w can have no elements in common based on rules in
1317              * the POSIX standard,
1318              *      \w & ^\S = nothing
1319              * Unfortunately, some vendor locales do not meet the Posix
1320              * standard, in particular almost everything by Microsoft.
1321              * The loop below just changes e.g., \w into \W and vice versa */
1322
1323             regnode_charclass_posixl temp;
1324             int add = 1;    /* To calculate the index of the complement */
1325
1326             ANYOF_POSIXL_ZERO(&temp);
1327             for (i = 0; i < ANYOF_MAX; i++) {
1328                 assert(i % 2 != 0
1329                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1330                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1331
1332                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1333                     ANYOF_POSIXL_SET(&temp, i + add);
1334                 }
1335                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1336             }
1337             ANYOF_POSIXL_AND(&temp, ssc);
1338
1339         } /* else ssc already has no posixes */
1340     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1341          in its initial state */
1342     else if (! is_ANYOF_SYNTHETIC(and_with)
1343              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1344     {
1345         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1346          * copy it over 'ssc' */
1347         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1348             if (is_ANYOF_SYNTHETIC(and_with)) {
1349                 StructCopy(and_with, ssc, regnode_ssc);
1350             }
1351             else {
1352                 ssc->invlist = anded_cp_list;
1353                 ANYOF_POSIXL_ZERO(ssc);
1354                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1355                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1356                 }
1357             }
1358         }
1359         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1360                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1361         {
1362             /* One or the other of P1, P2 is non-empty. */
1363             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1364                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1365             }
1366             ssc_union(ssc, anded_cp_list, FALSE);
1367         }
1368         else { /* P1 = P2 = empty */
1369             ssc_intersection(ssc, anded_cp_list, FALSE);
1370         }
1371     }
1372 }
1373
1374 STATIC void
1375 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1376                const regnode_charclass *or_with)
1377 {
1378     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1379      * another SSC or a regular ANYOF class.  Can create false positives if
1380      * 'or_with' is to be inverted. */
1381
1382     SV* ored_cp_list;
1383     U8 ored_flags;
1384
1385     PERL_ARGS_ASSERT_SSC_OR;
1386
1387     assert(is_ANYOF_SYNTHETIC(ssc));
1388
1389     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1390      * the code point inversion list and just the relevant flags */
1391     if (is_ANYOF_SYNTHETIC(or_with)) {
1392         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1393         ored_flags = ANYOF_FLAGS(or_with);
1394     }
1395     else {
1396         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1397         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1398     }
1399
1400     ANYOF_FLAGS(ssc) |= ored_flags;
1401
1402     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1403      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1404      * 'or_with' may be inverted.  When not inverted, we have the simple
1405      * situation of computing:
1406      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1407      * If P1|P2 yields a situation with both a class and its complement are
1408      * set, like having both \w and \W, this matches all code points, and we
1409      * can delete these from the P component of the ssc going forward.  XXX We
1410      * might be able to delete all the P components, but I (khw) am not certain
1411      * about this, and it is better to be safe.
1412      *
1413      * Inverted, we have
1414      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1415      *                         <=  (C1 | P1) | ~C2
1416      *                         <=  (C1 | ~C2) | P1
1417      * (which results in actually simpler code than the non-inverted case)
1418      * */
1419
1420     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1421         && ! is_ANYOF_SYNTHETIC(or_with))
1422     {
1423         /* We ignore P2, leaving P1 going forward */
1424     }   /* else  Not inverted */
1425     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1426         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1427         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1428             unsigned int i;
1429             for (i = 0; i < ANYOF_MAX; i += 2) {
1430                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1431                 {
1432                     ssc_match_all_cp(ssc);
1433                     ANYOF_POSIXL_CLEAR(ssc, i);
1434                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1435                 }
1436             }
1437         }
1438     }
1439
1440     ssc_union(ssc,
1441               ored_cp_list,
1442               FALSE /* Already has been inverted */
1443               );
1444 }
1445
1446 PERL_STATIC_INLINE void
1447 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1448 {
1449     PERL_ARGS_ASSERT_SSC_UNION;
1450
1451     assert(is_ANYOF_SYNTHETIC(ssc));
1452
1453     _invlist_union_maybe_complement_2nd(ssc->invlist,
1454                                         invlist,
1455                                         invert2nd,
1456                                         &ssc->invlist);
1457 }
1458
1459 PERL_STATIC_INLINE void
1460 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1461                          SV* const invlist,
1462                          const bool invert2nd)
1463 {
1464     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1465
1466     assert(is_ANYOF_SYNTHETIC(ssc));
1467
1468     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1469                                                invlist,
1470                                                invert2nd,
1471                                                &ssc->invlist);
1472 }
1473
1474 PERL_STATIC_INLINE void
1475 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1476 {
1477     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1478
1479     assert(is_ANYOF_SYNTHETIC(ssc));
1480
1481     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1482 }
1483
1484 PERL_STATIC_INLINE void
1485 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1486 {
1487     /* AND just the single code point 'cp' into the SSC 'ssc' */
1488
1489     SV* cp_list = _new_invlist(2);
1490
1491     PERL_ARGS_ASSERT_SSC_CP_AND;
1492
1493     assert(is_ANYOF_SYNTHETIC(ssc));
1494
1495     cp_list = add_cp_to_invlist(cp_list, cp);
1496     ssc_intersection(ssc, cp_list,
1497                      FALSE /* Not inverted */
1498                      );
1499     SvREFCNT_dec_NN(cp_list);
1500 }
1501
1502 PERL_STATIC_INLINE void
1503 S_ssc_clear_locale(regnode_ssc *ssc)
1504 {
1505     /* Set the SSC 'ssc' to not match any locale things */
1506     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1507
1508     assert(is_ANYOF_SYNTHETIC(ssc));
1509
1510     ANYOF_POSIXL_ZERO(ssc);
1511     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1512 }
1513
1514 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1515
1516 STATIC bool
1517 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1518 {
1519     /* The synthetic start class is used to hopefully quickly winnow down
1520      * places where a pattern could start a match in the target string.  If it
1521      * doesn't really narrow things down that much, there isn't much point to
1522      * having the overhead of using it.  This function uses some very crude
1523      * heuristics to decide if to use the ssc or not.
1524      *
1525      * It returns TRUE if 'ssc' rules out more than half what it considers to
1526      * be the "likely" possible matches, but of course it doesn't know what the
1527      * actual things being matched are going to be; these are only guesses
1528      *
1529      * For /l matches, it assumes that the only likely matches are going to be
1530      *      in the 0-255 range, uniformly distributed, so half of that is 127
1531      * For /a and /d matches, it assumes that the likely matches will be just
1532      *      the ASCII range, so half of that is 63
1533      * For /u and there isn't anything matching above the Latin1 range, it
1534      *      assumes that that is the only range likely to be matched, and uses
1535      *      half that as the cut-off: 127.  If anything matches above Latin1,
1536      *      it assumes that all of Unicode could match (uniformly), except for
1537      *      non-Unicode code points and things in the General Category "Other"
1538      *      (unassigned, private use, surrogates, controls and formats).  This
1539      *      is a much large number. */
1540
1541     const U32 max_match = (LOC)
1542                           ? 127
1543                           : (! UNI_SEMANTICS)
1544                             ? 63
1545                             : (invlist_highest(ssc->invlist) < 256)
1546                               ? 127
1547                               : ((NON_OTHER_COUNT + 1) / 2) - 1;
1548     U32 count = 0;      /* Running total of number of code points matched by
1549                            'ssc' */
1550     UV start, end;      /* Start and end points of current range in inversion
1551                            list */
1552
1553     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1554
1555     invlist_iterinit(ssc->invlist);
1556     while (invlist_iternext(ssc->invlist, &start, &end)) {
1557
1558         /* /u is the only thing that we expect to match above 255; so if not /u
1559          * and even if there are matches above 255, ignore them.  This catches
1560          * things like \d under /d which does match the digits above 255, but
1561          * since the pattern is /d, it is not likely to be expecting them */
1562         if (! UNI_SEMANTICS) {
1563             if (start > 255) {
1564                 break;
1565             }
1566             end = MIN(end, 255);
1567         }
1568         count += end - start + 1;
1569         if (count > max_match) {
1570             invlist_iterfinish(ssc->invlist);
1571             return FALSE;
1572         }
1573     }
1574
1575     return TRUE;
1576 }
1577
1578
1579 STATIC void
1580 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1581 {
1582     /* The inversion list in the SSC is marked mortal; now we need a more
1583      * permanent copy, which is stored the same way that is done in a regular
1584      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1585      * map */
1586
1587     SV* invlist = invlist_clone(ssc->invlist);
1588
1589     PERL_ARGS_ASSERT_SSC_FINALIZE;
1590
1591     assert(is_ANYOF_SYNTHETIC(ssc));
1592
1593     /* The code in this file assumes that all but these flags aren't relevant
1594      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1595      * by the time we reach here */
1596     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1597
1598     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1599
1600     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1601                                 NULL, NULL, NULL, FALSE);
1602
1603     /* Make sure is clone-safe */
1604     ssc->invlist = NULL;
1605
1606     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1607         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1608     }
1609
1610     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1611 }
1612
1613 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1614 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1615 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1616 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1617                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1618                                : 0 )
1619
1620
1621 #ifdef DEBUGGING
1622 /*
1623    dump_trie(trie,widecharmap,revcharmap)
1624    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1625    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1626
1627    These routines dump out a trie in a somewhat readable format.
1628    The _interim_ variants are used for debugging the interim
1629    tables that are used to generate the final compressed
1630    representation which is what dump_trie expects.
1631
1632    Part of the reason for their existence is to provide a form
1633    of documentation as to how the different representations function.
1634
1635 */
1636
1637 /*
1638   Dumps the final compressed table form of the trie to Perl_debug_log.
1639   Used for debugging make_trie().
1640 */
1641
1642 STATIC void
1643 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1644             AV *revcharmap, U32 depth)
1645 {
1646     U32 state;
1647     SV *sv=sv_newmortal();
1648     int colwidth= widecharmap ? 6 : 4;
1649     U16 word;
1650     GET_RE_DEBUG_FLAGS_DECL;
1651
1652     PERL_ARGS_ASSERT_DUMP_TRIE;
1653
1654     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1655         (int)depth * 2 + 2,"",
1656         "Match","Base","Ofs" );
1657
1658     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1659         SV ** const tmp = av_fetch( revcharmap, state, 0);
1660         if ( tmp ) {
1661             PerlIO_printf( Perl_debug_log, "%*s",
1662                 colwidth,
1663                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1664                             PL_colors[0], PL_colors[1],
1665                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1666                             PERL_PV_ESCAPE_FIRSTCHAR
1667                 )
1668             );
1669         }
1670     }
1671     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1672         (int)depth * 2 + 2,"");
1673
1674     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1675         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1676     PerlIO_printf( Perl_debug_log, "\n");
1677
1678     for( state = 1 ; state < trie->statecount ; state++ ) {
1679         const U32 base = trie->states[ state ].trans.base;
1680
1681         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1682                                        (int)depth * 2 + 2,"", (UV)state);
1683
1684         if ( trie->states[ state ].wordnum ) {
1685             PerlIO_printf( Perl_debug_log, " W%4X",
1686                                            trie->states[ state ].wordnum );
1687         } else {
1688             PerlIO_printf( Perl_debug_log, "%6s", "" );
1689         }
1690
1691         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1692
1693         if ( base ) {
1694             U32 ofs = 0;
1695
1696             while( ( base + ofs  < trie->uniquecharcount ) ||
1697                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1698                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1699                                                                     != state))
1700                     ofs++;
1701
1702             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1703
1704             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1705                 if ( ( base + ofs >= trie->uniquecharcount )
1706                         && ( base + ofs - trie->uniquecharcount
1707                                                         < trie->lasttrans )
1708                         && trie->trans[ base + ofs
1709                                     - trie->uniquecharcount ].check == state )
1710                 {
1711                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1712                     colwidth,
1713                     (UV)trie->trans[ base + ofs
1714                                              - trie->uniquecharcount ].next );
1715                 } else {
1716                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1717                 }
1718             }
1719
1720             PerlIO_printf( Perl_debug_log, "]");
1721
1722         }
1723         PerlIO_printf( Perl_debug_log, "\n" );
1724     }
1725     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1726                                 (int)depth*2, "");
1727     for (word=1; word <= trie->wordcount; word++) {
1728         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1729             (int)word, (int)(trie->wordinfo[word].prev),
1730             (int)(trie->wordinfo[word].len));
1731     }
1732     PerlIO_printf(Perl_debug_log, "\n" );
1733 }
1734 /*
1735   Dumps a fully constructed but uncompressed trie in list form.
1736   List tries normally only are used for construction when the number of
1737   possible chars (trie->uniquecharcount) is very high.
1738   Used for debugging make_trie().
1739 */
1740 STATIC void
1741 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1742                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1743                          U32 depth)
1744 {
1745     U32 state;
1746     SV *sv=sv_newmortal();
1747     int colwidth= widecharmap ? 6 : 4;
1748     GET_RE_DEBUG_FLAGS_DECL;
1749
1750     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1751
1752     /* print out the table precompression.  */
1753     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1754         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1755         "------:-----+-----------------\n" );
1756
1757     for( state=1 ; state < next_alloc ; state ++ ) {
1758         U16 charid;
1759
1760         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1761             (int)depth * 2 + 2,"", (UV)state  );
1762         if ( ! trie->states[ state ].wordnum ) {
1763             PerlIO_printf( Perl_debug_log, "%5s| ","");
1764         } else {
1765             PerlIO_printf( Perl_debug_log, "W%4x| ",
1766                 trie->states[ state ].wordnum
1767             );
1768         }
1769         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1770             SV ** const tmp = av_fetch( revcharmap,
1771                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1772             if ( tmp ) {
1773                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1774                     colwidth,
1775                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1776                               colwidth,
1777                               PL_colors[0], PL_colors[1],
1778                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1779                               | PERL_PV_ESCAPE_FIRSTCHAR
1780                     ) ,
1781                     TRIE_LIST_ITEM(state,charid).forid,
1782                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1783                 );
1784                 if (!(charid % 10))
1785                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1786                         (int)((depth * 2) + 14), "");
1787             }
1788         }
1789         PerlIO_printf( Perl_debug_log, "\n");
1790     }
1791 }
1792
1793 /*
1794   Dumps a fully constructed but uncompressed trie in table form.
1795   This is the normal DFA style state transition table, with a few
1796   twists to facilitate compression later.
1797   Used for debugging make_trie().
1798 */
1799 STATIC void
1800 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1801                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1802                           U32 depth)
1803 {
1804     U32 state;
1805     U16 charid;
1806     SV *sv=sv_newmortal();
1807     int colwidth= widecharmap ? 6 : 4;
1808     GET_RE_DEBUG_FLAGS_DECL;
1809
1810     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1811
1812     /*
1813        print out the table precompression so that we can do a visual check
1814        that they are identical.
1815      */
1816
1817     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1818
1819     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1820         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1821         if ( tmp ) {
1822             PerlIO_printf( Perl_debug_log, "%*s",
1823                 colwidth,
1824                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1825                             PL_colors[0], PL_colors[1],
1826                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1827                             PERL_PV_ESCAPE_FIRSTCHAR
1828                 )
1829             );
1830         }
1831     }
1832
1833     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1834
1835     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1836         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1837     }
1838
1839     PerlIO_printf( Perl_debug_log, "\n" );
1840
1841     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1842
1843         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1844             (int)depth * 2 + 2,"",
1845             (UV)TRIE_NODENUM( state ) );
1846
1847         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1848             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1849             if (v)
1850                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1851             else
1852                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1853         }
1854         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1855             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1856                                             (UV)trie->trans[ state ].check );
1857         } else {
1858             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1859                                             (UV)trie->trans[ state ].check,
1860             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1861         }
1862     }
1863 }
1864
1865 #endif
1866
1867
1868 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1869   startbranch: the first branch in the whole branch sequence
1870   first      : start branch of sequence of branch-exact nodes.
1871                May be the same as startbranch
1872   last       : Thing following the last branch.
1873                May be the same as tail.
1874   tail       : item following the branch sequence
1875   count      : words in the sequence
1876   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
1877   depth      : indent depth
1878
1879 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1880
1881 A trie is an N'ary tree where the branches are determined by digital
1882 decomposition of the key. IE, at the root node you look up the 1st character and
1883 follow that branch repeat until you find the end of the branches. Nodes can be
1884 marked as "accepting" meaning they represent a complete word. Eg:
1885
1886   /he|she|his|hers/
1887
1888 would convert into the following structure. Numbers represent states, letters
1889 following numbers represent valid transitions on the letter from that state, if
1890 the number is in square brackets it represents an accepting state, otherwise it
1891 will be in parenthesis.
1892
1893       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1894       |    |
1895       |   (2)
1896       |    |
1897      (1)   +-i->(6)-+-s->[7]
1898       |
1899       +-s->(3)-+-h->(4)-+-e->[5]
1900
1901       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1902
1903 This shows that when matching against the string 'hers' we will begin at state 1
1904 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1905 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1906 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1907 single traverse. We store a mapping from accepting to state to which word was
1908 matched, and then when we have multiple possibilities we try to complete the
1909 rest of the regex in the order in which they occured in the alternation.
1910
1911 The only prior NFA like behaviour that would be changed by the TRIE support is
1912 the silent ignoring of duplicate alternations which are of the form:
1913
1914  / (DUPE|DUPE) X? (?{ ... }) Y /x
1915
1916 Thus EVAL blocks following a trie may be called a different number of times with
1917 and without the optimisation. With the optimisations dupes will be silently
1918 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1919 the following demonstrates:
1920
1921  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1922
1923 which prints out 'word' three times, but
1924
1925  'words'=~/(word|word|word)(?{ print $1 })S/
1926
1927 which doesnt print it out at all. This is due to other optimisations kicking in.
1928
1929 Example of what happens on a structural level:
1930
1931 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1932
1933    1: CURLYM[1] {1,32767}(18)
1934    5:   BRANCH(8)
1935    6:     EXACT <ac>(16)
1936    8:   BRANCH(11)
1937    9:     EXACT <ad>(16)
1938   11:   BRANCH(14)
1939   12:     EXACT <ab>(16)
1940   16:   SUCCEED(0)
1941   17:   NOTHING(18)
1942   18: END(0)
1943
1944 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1945 and should turn into:
1946
1947    1: CURLYM[1] {1,32767}(18)
1948    5:   TRIE(16)
1949         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1950           <ac>
1951           <ad>
1952           <ab>
1953   16:   SUCCEED(0)
1954   17:   NOTHING(18)
1955   18: END(0)
1956
1957 Cases where tail != last would be like /(?foo|bar)baz/:
1958
1959    1: BRANCH(4)
1960    2:   EXACT <foo>(8)
1961    4: BRANCH(7)
1962    5:   EXACT <bar>(8)
1963    7: TAIL(8)
1964    8: EXACT <baz>(10)
1965   10: END(0)
1966
1967 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1968 and would end up looking like:
1969
1970     1: TRIE(8)
1971       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1972         <foo>
1973         <bar>
1974    7: TAIL(8)
1975    8: EXACT <baz>(10)
1976   10: END(0)
1977
1978     d = uvchr_to_utf8_flags(d, uv, 0);
1979
1980 is the recommended Unicode-aware way of saying
1981
1982     *(d++) = uv;
1983 */
1984
1985 #define TRIE_STORE_REVCHAR(val)                                            \
1986     STMT_START {                                                           \
1987         if (UTF) {                                                         \
1988             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1989             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1990             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1991             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1992             SvPOK_on(zlopp);                                               \
1993             SvUTF8_on(zlopp);                                              \
1994             av_push(revcharmap, zlopp);                                    \
1995         } else {                                                           \
1996             char ooooff = (char)val;                                           \
1997             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1998         }                                                                  \
1999         } STMT_END
2000
2001 /* This gets the next character from the input, folding it if not already
2002  * folded. */
2003 #define TRIE_READ_CHAR STMT_START {                                           \
2004     wordlen++;                                                                \
2005     if ( UTF ) {                                                              \
2006         /* if it is UTF then it is either already folded, or does not need    \
2007          * folding */                                                         \
2008         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2009     }                                                                         \
2010     else if (folder == PL_fold_latin1) {                                      \
2011         /* This folder implies Unicode rules, which in the range expressible  \
2012          *  by not UTF is the lower case, with the two exceptions, one of     \
2013          *  which should have been taken care of before calling this */       \
2014         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2015         uvc = toLOWER_L1(*uc);                                                \
2016         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2017         len = 1;                                                              \
2018     } else {                                                                  \
2019         /* raw data, will be folded later if needed */                        \
2020         uvc = (U32)*uc;                                                       \
2021         len = 1;                                                              \
2022     }                                                                         \
2023 } STMT_END
2024
2025
2026
2027 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2028     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2029         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
2030         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2031     }                                                           \
2032     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2033     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2034     TRIE_LIST_CUR( state )++;                                   \
2035 } STMT_END
2036
2037 #define TRIE_LIST_NEW(state) STMT_START {                       \
2038     Newxz( trie->states[ state ].trans.list,               \
2039         4, reg_trie_trans_le );                                 \
2040      TRIE_LIST_CUR( state ) = 1;                                \
2041      TRIE_LIST_LEN( state ) = 4;                                \
2042 } STMT_END
2043
2044 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2045     U16 dupe= trie->states[ state ].wordnum;                    \
2046     regnode * const noper_next = regnext( noper );              \
2047                                                                 \
2048     DEBUG_r({                                                   \
2049         /* store the word for dumping */                        \
2050         SV* tmp;                                                \
2051         if (OP(noper) != NOTHING)                               \
2052             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2053         else                                                    \
2054             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2055         av_push( trie_words, tmp );                             \
2056     });                                                         \
2057                                                                 \
2058     curword++;                                                  \
2059     trie->wordinfo[curword].prev   = 0;                         \
2060     trie->wordinfo[curword].len    = wordlen;                   \
2061     trie->wordinfo[curword].accept = state;                     \
2062                                                                 \
2063     if ( noper_next < tail ) {                                  \
2064         if (!trie->jump)                                        \
2065             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2066                                                  sizeof(U16) ); \
2067         trie->jump[curword] = (U16)(noper_next - convert);      \
2068         if (!jumper)                                            \
2069             jumper = noper_next;                                \
2070         if (!nextbranch)                                        \
2071             nextbranch= regnext(cur);                           \
2072     }                                                           \
2073                                                                 \
2074     if ( dupe ) {                                               \
2075         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2076         /* chain, so that when the bits of chain are later    */\
2077         /* linked together, the dups appear in the chain      */\
2078         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2079         trie->wordinfo[dupe].prev = curword;                    \
2080     } else {                                                    \
2081         /* we haven't inserted this word yet.                */ \
2082         trie->states[ state ].wordnum = curword;                \
2083     }                                                           \
2084 } STMT_END
2085
2086
2087 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2088      ( ( base + charid >=  ucharcount                                   \
2089          && base + charid < ubound                                      \
2090          && state == trie->trans[ base - ucharcount + charid ].check    \
2091          && trie->trans[ base - ucharcount + charid ].next )            \
2092            ? trie->trans[ base - ucharcount + charid ].next             \
2093            : ( state==1 ? special : 0 )                                 \
2094       )
2095
2096 #define MADE_TRIE       1
2097 #define MADE_JUMP_TRIE  2
2098 #define MADE_EXACT_TRIE 4
2099
2100 STATIC I32
2101 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2102                   regnode *first, regnode *last, regnode *tail,
2103                   U32 word_count, U32 flags, U32 depth)
2104 {
2105     /* first pass, loop through and scan words */
2106     reg_trie_data *trie;
2107     HV *widecharmap = NULL;
2108     AV *revcharmap = newAV();
2109     regnode *cur;
2110     STRLEN len = 0;
2111     UV uvc = 0;
2112     U16 curword = 0;
2113     U32 next_alloc = 0;
2114     regnode *jumper = NULL;
2115     regnode *nextbranch = NULL;
2116     regnode *convert = NULL;
2117     U32 *prev_states; /* temp array mapping each state to previous one */
2118     /* we just use folder as a flag in utf8 */
2119     const U8 * folder = NULL;
2120
2121 #ifdef DEBUGGING
2122     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2123     AV *trie_words = NULL;
2124     /* along with revcharmap, this only used during construction but both are
2125      * useful during debugging so we store them in the struct when debugging.
2126      */
2127 #else
2128     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2129     STRLEN trie_charcount=0;
2130 #endif
2131     SV *re_trie_maxbuff;
2132     GET_RE_DEBUG_FLAGS_DECL;
2133
2134     PERL_ARGS_ASSERT_MAKE_TRIE;
2135 #ifndef DEBUGGING
2136     PERL_UNUSED_ARG(depth);
2137 #endif
2138
2139     switch (flags) {
2140         case EXACT: case EXACTL: break;
2141         case EXACTFA:
2142         case EXACTFU_SS:
2143         case EXACTFU:
2144         case EXACTFLU8: folder = PL_fold_latin1; break;
2145         case EXACTF:  folder = PL_fold; break;
2146         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2147     }
2148
2149     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2150     trie->refcount = 1;
2151     trie->startstate = 1;
2152     trie->wordcount = word_count;
2153     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2154     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2155     if (flags == EXACT || flags == EXACTL)
2156         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2157     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2158                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2159
2160     DEBUG_r({
2161         trie_words = newAV();
2162     });
2163
2164     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2165     assert(re_trie_maxbuff);
2166     if (!SvIOK(re_trie_maxbuff)) {
2167         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2168     }
2169     DEBUG_TRIE_COMPILE_r({
2170         PerlIO_printf( Perl_debug_log,
2171           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2172           (int)depth * 2 + 2, "",
2173           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2174           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2175     });
2176
2177    /* Find the node we are going to overwrite */
2178     if ( first == startbranch && OP( last ) != BRANCH ) {
2179         /* whole branch chain */
2180         convert = first;
2181     } else {
2182         /* branch sub-chain */
2183         convert = NEXTOPER( first );
2184     }
2185
2186     /*  -- First loop and Setup --
2187
2188        We first traverse the branches and scan each word to determine if it
2189        contains widechars, and how many unique chars there are, this is
2190        important as we have to build a table with at least as many columns as we
2191        have unique chars.
2192
2193        We use an array of integers to represent the character codes 0..255
2194        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2195        the native representation of the character value as the key and IV's for
2196        the coded index.
2197
2198        *TODO* If we keep track of how many times each character is used we can
2199        remap the columns so that the table compression later on is more
2200        efficient in terms of memory by ensuring the most common value is in the
2201        middle and the least common are on the outside.  IMO this would be better
2202        than a most to least common mapping as theres a decent chance the most
2203        common letter will share a node with the least common, meaning the node
2204        will not be compressible. With a middle is most common approach the worst
2205        case is when we have the least common nodes twice.
2206
2207      */
2208
2209     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2210         regnode *noper = NEXTOPER( cur );
2211         const U8 *uc = (U8*)STRING( noper );
2212         const U8 *e  = uc + STR_LEN( noper );
2213         int foldlen = 0;
2214         U32 wordlen      = 0;         /* required init */
2215         STRLEN minchars = 0;
2216         STRLEN maxchars = 0;
2217         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2218                                                bitmap?*/
2219
2220         if (OP(noper) == NOTHING) {
2221             regnode *noper_next= regnext(noper);
2222             if (noper_next != tail && OP(noper_next) == flags) {
2223                 noper = noper_next;
2224                 uc= (U8*)STRING(noper);
2225                 e= uc + STR_LEN(noper);
2226                 trie->minlen= STR_LEN(noper);
2227             } else {
2228                 trie->minlen= 0;
2229                 continue;
2230             }
2231         }
2232
2233         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2234             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2235                                           regardless of encoding */
2236             if (OP( noper ) == EXACTFU_SS) {
2237                 /* false positives are ok, so just set this */
2238                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2239             }
2240         }
2241         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2242                                            branch */
2243             TRIE_CHARCOUNT(trie)++;
2244             TRIE_READ_CHAR;
2245
2246             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2247              * is in effect.  Under /i, this character can match itself, or
2248              * anything that folds to it.  If not under /i, it can match just
2249              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2250              * all fold to k, and all are single characters.   But some folds
2251              * expand to more than one character, so for example LATIN SMALL
2252              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2253              * the string beginning at 'uc' is 'ffi', it could be matched by
2254              * three characters, or just by the one ligature character. (It
2255              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2256              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2257              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2258              * match.)  The trie needs to know the minimum and maximum number
2259              * of characters that could match so that it can use size alone to
2260              * quickly reject many match attempts.  The max is simple: it is
2261              * the number of folded characters in this branch (since a fold is
2262              * never shorter than what folds to it. */
2263
2264             maxchars++;
2265
2266             /* And the min is equal to the max if not under /i (indicated by
2267              * 'folder' being NULL), or there are no multi-character folds.  If
2268              * there is a multi-character fold, the min is incremented just
2269              * once, for the character that folds to the sequence.  Each
2270              * character in the sequence needs to be added to the list below of
2271              * characters in the trie, but we count only the first towards the
2272              * min number of characters needed.  This is done through the
2273              * variable 'foldlen', which is returned by the macros that look
2274              * for these sequences as the number of bytes the sequence
2275              * occupies.  Each time through the loop, we decrement 'foldlen' by
2276              * how many bytes the current char occupies.  Only when it reaches
2277              * 0 do we increment 'minchars' or look for another multi-character
2278              * sequence. */
2279             if (folder == NULL) {
2280                 minchars++;
2281             }
2282             else if (foldlen > 0) {
2283                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2284             }
2285             else {
2286                 minchars++;
2287
2288                 /* See if *uc is the beginning of a multi-character fold.  If
2289                  * so, we decrement the length remaining to look at, to account
2290                  * for the current character this iteration.  (We can use 'uc'
2291                  * instead of the fold returned by TRIE_READ_CHAR because for
2292                  * non-UTF, the latin1_safe macro is smart enough to account
2293                  * for all the unfolded characters, and because for UTF, the
2294                  * string will already have been folded earlier in the
2295                  * compilation process */
2296                 if (UTF) {
2297                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2298                         foldlen -= UTF8SKIP(uc);
2299                     }
2300                 }
2301                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2302                     foldlen--;
2303                 }
2304             }
2305
2306             /* The current character (and any potential folds) should be added
2307              * to the possible matching characters for this position in this
2308              * branch */
2309             if ( uvc < 256 ) {
2310                 if ( folder ) {
2311                     U8 folded= folder[ (U8) uvc ];
2312                     if ( !trie->charmap[ folded ] ) {
2313                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2314                         TRIE_STORE_REVCHAR( folded );
2315                     }
2316                 }
2317                 if ( !trie->charmap[ uvc ] ) {
2318                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2319                     TRIE_STORE_REVCHAR( uvc );
2320                 }
2321                 if ( set_bit ) {
2322                     /* store the codepoint in the bitmap, and its folded
2323                      * equivalent. */
2324                     TRIE_BITMAP_SET(trie, uvc);
2325
2326                     /* store the folded codepoint */
2327                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2328
2329                     if ( !UTF ) {
2330                         /* store first byte of utf8 representation of
2331                            variant codepoints */
2332                         if (! UVCHR_IS_INVARIANT(uvc)) {
2333                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2334                         }
2335                     }
2336                     set_bit = 0; /* We've done our bit :-) */
2337                 }
2338             } else {
2339
2340                 /* XXX We could come up with the list of code points that fold
2341                  * to this using PL_utf8_foldclosures, except not for
2342                  * multi-char folds, as there may be multiple combinations
2343                  * there that could work, which needs to wait until runtime to
2344                  * resolve (The comment about LIGATURE FFI above is such an
2345                  * example */
2346
2347                 SV** svpp;
2348                 if ( !widecharmap )
2349                     widecharmap = newHV();
2350
2351                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2352
2353                 if ( !svpp )
2354                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2355
2356                 if ( !SvTRUE( *svpp ) ) {
2357                     sv_setiv( *svpp, ++trie->uniquecharcount );
2358                     TRIE_STORE_REVCHAR(uvc);
2359                 }
2360             }
2361         } /* end loop through characters in this branch of the trie */
2362
2363         /* We take the min and max for this branch and combine to find the min
2364          * and max for all branches processed so far */
2365         if( cur == first ) {
2366             trie->minlen = minchars;
2367             trie->maxlen = maxchars;
2368         } else if (minchars < trie->minlen) {
2369             trie->minlen = minchars;
2370         } else if (maxchars > trie->maxlen) {
2371             trie->maxlen = maxchars;
2372         }
2373     } /* end first pass */
2374     DEBUG_TRIE_COMPILE_r(
2375         PerlIO_printf( Perl_debug_log,
2376                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2377                 (int)depth * 2 + 2,"",
2378                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2379                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2380                 (int)trie->minlen, (int)trie->maxlen )
2381     );
2382
2383     /*
2384         We now know what we are dealing with in terms of unique chars and
2385         string sizes so we can calculate how much memory a naive
2386         representation using a flat table  will take. If it's over a reasonable
2387         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2388         conservative but potentially much slower representation using an array
2389         of lists.
2390
2391         At the end we convert both representations into the same compressed
2392         form that will be used in regexec.c for matching with. The latter
2393         is a form that cannot be used to construct with but has memory
2394         properties similar to the list form and access properties similar
2395         to the table form making it both suitable for fast searches and
2396         small enough that its feasable to store for the duration of a program.
2397
2398         See the comment in the code where the compressed table is produced
2399         inplace from the flat tabe representation for an explanation of how
2400         the compression works.
2401
2402     */
2403
2404
2405     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2406     prev_states[1] = 0;
2407
2408     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2409                                                     > SvIV(re_trie_maxbuff) )
2410     {
2411         /*
2412             Second Pass -- Array Of Lists Representation
2413
2414             Each state will be represented by a list of charid:state records
2415             (reg_trie_trans_le) the first such element holds the CUR and LEN
2416             points of the allocated array. (See defines above).
2417
2418             We build the initial structure using the lists, and then convert
2419             it into the compressed table form which allows faster lookups
2420             (but cant be modified once converted).
2421         */
2422
2423         STRLEN transcount = 1;
2424
2425         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2426             "%*sCompiling trie using list compiler\n",
2427             (int)depth * 2 + 2, ""));
2428
2429         trie->states = (reg_trie_state *)
2430             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2431                                   sizeof(reg_trie_state) );
2432         TRIE_LIST_NEW(1);
2433         next_alloc = 2;
2434
2435         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2436
2437             regnode *noper   = NEXTOPER( cur );
2438             U8 *uc           = (U8*)STRING( noper );
2439             const U8 *e      = uc + STR_LEN( noper );
2440             U32 state        = 1;         /* required init */
2441             U16 charid       = 0;         /* sanity init */
2442             U32 wordlen      = 0;         /* required init */
2443
2444             if (OP(noper) == NOTHING) {
2445                 regnode *noper_next= regnext(noper);
2446                 if (noper_next != tail && OP(noper_next) == flags) {
2447                     noper = noper_next;
2448                     uc= (U8*)STRING(noper);
2449                     e= uc + STR_LEN(noper);
2450                 }
2451             }
2452
2453             if (OP(noper) != NOTHING) {
2454                 for ( ; uc < e ; uc += len ) {
2455
2456                     TRIE_READ_CHAR;
2457
2458                     if ( uvc < 256 ) {
2459                         charid = trie->charmap[ uvc ];
2460                     } else {
2461                         SV** const svpp = hv_fetch( widecharmap,
2462                                                     (char*)&uvc,
2463                                                     sizeof( UV ),
2464                                                     0);
2465                         if ( !svpp ) {
2466                             charid = 0;
2467                         } else {
2468                             charid=(U16)SvIV( *svpp );
2469                         }
2470                     }
2471                     /* charid is now 0 if we dont know the char read, or
2472                      * nonzero if we do */
2473                     if ( charid ) {
2474
2475                         U16 check;
2476                         U32 newstate = 0;
2477
2478                         charid--;
2479                         if ( !trie->states[ state ].trans.list ) {
2480                             TRIE_LIST_NEW( state );
2481                         }
2482                         for ( check = 1;
2483                               check <= TRIE_LIST_USED( state );
2484                               check++ )
2485                         {
2486                             if ( TRIE_LIST_ITEM( state, check ).forid
2487                                                                     == charid )
2488                             {
2489                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2490                                 break;
2491                             }
2492                         }
2493                         if ( ! newstate ) {
2494                             newstate = next_alloc++;
2495                             prev_states[newstate] = state;
2496                             TRIE_LIST_PUSH( state, charid, newstate );
2497                             transcount++;
2498                         }
2499                         state = newstate;
2500                     } else {
2501                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2502                     }
2503                 }
2504             }
2505             TRIE_HANDLE_WORD(state);
2506
2507         } /* end second pass */
2508
2509         /* next alloc is the NEXT state to be allocated */
2510         trie->statecount = next_alloc;
2511         trie->states = (reg_trie_state *)
2512             PerlMemShared_realloc( trie->states,
2513                                    next_alloc
2514                                    * sizeof(reg_trie_state) );
2515
2516         /* and now dump it out before we compress it */
2517         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2518                                                          revcharmap, next_alloc,
2519                                                          depth+1)
2520         );
2521
2522         trie->trans = (reg_trie_trans *)
2523             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2524         {
2525             U32 state;
2526             U32 tp = 0;
2527             U32 zp = 0;
2528
2529
2530             for( state=1 ; state < next_alloc ; state ++ ) {
2531                 U32 base=0;
2532
2533                 /*
2534                 DEBUG_TRIE_COMPILE_MORE_r(
2535                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2536                 );
2537                 */
2538
2539                 if (trie->states[state].trans.list) {
2540                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2541                     U16 maxid=minid;
2542                     U16 idx;
2543
2544                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2545                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2546                         if ( forid < minid ) {
2547                             minid=forid;
2548                         } else if ( forid > maxid ) {
2549                             maxid=forid;
2550                         }
2551                     }
2552                     if ( transcount < tp + maxid - minid + 1) {
2553                         transcount *= 2;
2554                         trie->trans = (reg_trie_trans *)
2555                             PerlMemShared_realloc( trie->trans,
2556                                                      transcount
2557                                                      * sizeof(reg_trie_trans) );
2558                         Zero( trie->trans + (transcount / 2),
2559                               transcount / 2,
2560                               reg_trie_trans );
2561                     }
2562                     base = trie->uniquecharcount + tp - minid;
2563                     if ( maxid == minid ) {
2564                         U32 set = 0;
2565                         for ( ; zp < tp ; zp++ ) {
2566                             if ( ! trie->trans[ zp ].next ) {
2567                                 base = trie->uniquecharcount + zp - minid;
2568                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2569                                                                    1).newstate;
2570                                 trie->trans[ zp ].check = state;
2571                                 set = 1;
2572                                 break;
2573                             }
2574                         }
2575                         if ( !set ) {
2576                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2577                                                                    1).newstate;
2578                             trie->trans[ tp ].check = state;
2579                             tp++;
2580                             zp = tp;
2581                         }
2582                     } else {
2583                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2584                             const U32 tid = base
2585                                            - trie->uniquecharcount
2586                                            + TRIE_LIST_ITEM( state, idx ).forid;
2587                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2588                                                                 idx ).newstate;
2589                             trie->trans[ tid ].check = state;
2590                         }
2591                         tp += ( maxid - minid + 1 );
2592                     }
2593                     Safefree(trie->states[ state ].trans.list);
2594                 }
2595                 /*
2596                 DEBUG_TRIE_COMPILE_MORE_r(
2597                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2598                 );
2599                 */
2600                 trie->states[ state ].trans.base=base;
2601             }
2602             trie->lasttrans = tp + 1;
2603         }
2604     } else {
2605         /*
2606            Second Pass -- Flat Table Representation.
2607
2608            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2609            each.  We know that we will need Charcount+1 trans at most to store
2610            the data (one row per char at worst case) So we preallocate both
2611            structures assuming worst case.
2612
2613            We then construct the trie using only the .next slots of the entry
2614            structs.
2615
2616            We use the .check field of the first entry of the node temporarily
2617            to make compression both faster and easier by keeping track of how
2618            many non zero fields are in the node.
2619
2620            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2621            transition.
2622
2623            There are two terms at use here: state as a TRIE_NODEIDX() which is
2624            a number representing the first entry of the node, and state as a
2625            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2626            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2627            if there are 2 entrys per node. eg:
2628
2629              A B       A B
2630           1. 2 4    1. 3 7
2631           2. 0 3    3. 0 5
2632           3. 0 0    5. 0 0
2633           4. 0 0    7. 0 0
2634
2635            The table is internally in the right hand, idx form. However as we
2636            also have to deal with the states array which is indexed by nodenum
2637            we have to use TRIE_NODENUM() to convert.
2638
2639         */
2640         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2641             "%*sCompiling trie using table compiler\n",
2642             (int)depth * 2 + 2, ""));
2643
2644         trie->trans = (reg_trie_trans *)
2645             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2646                                   * trie->uniquecharcount + 1,
2647                                   sizeof(reg_trie_trans) );
2648         trie->states = (reg_trie_state *)
2649             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2650                                   sizeof(reg_trie_state) );
2651         next_alloc = trie->uniquecharcount + 1;
2652
2653
2654         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2655
2656             regnode *noper   = NEXTOPER( cur );
2657             const U8 *uc     = (U8*)STRING( noper );
2658             const U8 *e      = uc + STR_LEN( noper );
2659
2660             U32 state        = 1;         /* required init */
2661
2662             U16 charid       = 0;         /* sanity init */
2663             U32 accept_state = 0;         /* sanity init */
2664
2665             U32 wordlen      = 0;         /* required init */
2666
2667             if (OP(noper) == NOTHING) {
2668                 regnode *noper_next= regnext(noper);
2669                 if (noper_next != tail && OP(noper_next) == flags) {
2670                     noper = noper_next;
2671                     uc= (U8*)STRING(noper);
2672                     e= uc + STR_LEN(noper);
2673                 }
2674             }
2675
2676             if ( OP(noper) != NOTHING ) {
2677                 for ( ; uc < e ; uc += len ) {
2678
2679                     TRIE_READ_CHAR;
2680
2681                     if ( uvc < 256 ) {
2682                         charid = trie->charmap[ uvc ];
2683                     } else {
2684                         SV* const * const svpp = hv_fetch( widecharmap,
2685                                                            (char*)&uvc,
2686                                                            sizeof( UV ),
2687                                                            0);
2688                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2689                     }
2690                     if ( charid ) {
2691                         charid--;
2692                         if ( !trie->trans[ state + charid ].next ) {
2693                             trie->trans[ state + charid ].next = next_alloc;
2694                             trie->trans[ state ].check++;
2695                             prev_states[TRIE_NODENUM(next_alloc)]
2696                                     = TRIE_NODENUM(state);
2697                             next_alloc += trie->uniquecharcount;
2698                         }
2699                         state = trie->trans[ state + charid ].next;
2700                     } else {
2701                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2702                     }
2703                     /* charid is now 0 if we dont know the char read, or
2704                      * nonzero if we do */
2705                 }
2706             }
2707             accept_state = TRIE_NODENUM( state );
2708             TRIE_HANDLE_WORD(accept_state);
2709
2710         } /* end second pass */
2711
2712         /* and now dump it out before we compress it */
2713         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2714                                                           revcharmap,
2715                                                           next_alloc, depth+1));
2716
2717         {
2718         /*
2719            * Inplace compress the table.*
2720
2721            For sparse data sets the table constructed by the trie algorithm will
2722            be mostly 0/FAIL transitions or to put it another way mostly empty.
2723            (Note that leaf nodes will not contain any transitions.)
2724
2725            This algorithm compresses the tables by eliminating most such
2726            transitions, at the cost of a modest bit of extra work during lookup:
2727
2728            - Each states[] entry contains a .base field which indicates the
2729            index in the state[] array wheres its transition data is stored.
2730
2731            - If .base is 0 there are no valid transitions from that node.
2732
2733            - If .base is nonzero then charid is added to it to find an entry in
2734            the trans array.
2735
2736            -If trans[states[state].base+charid].check!=state then the
2737            transition is taken to be a 0/Fail transition. Thus if there are fail
2738            transitions at the front of the node then the .base offset will point
2739            somewhere inside the previous nodes data (or maybe even into a node
2740            even earlier), but the .check field determines if the transition is
2741            valid.
2742
2743            XXX - wrong maybe?
2744            The following process inplace converts the table to the compressed
2745            table: We first do not compress the root node 1,and mark all its
2746            .check pointers as 1 and set its .base pointer as 1 as well. This
2747            allows us to do a DFA construction from the compressed table later,
2748            and ensures that any .base pointers we calculate later are greater
2749            than 0.
2750
2751            - We set 'pos' to indicate the first entry of the second node.
2752
2753            - We then iterate over the columns of the node, finding the first and
2754            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2755            and set the .check pointers accordingly, and advance pos
2756            appropriately and repreat for the next node. Note that when we copy
2757            the next pointers we have to convert them from the original
2758            NODEIDX form to NODENUM form as the former is not valid post
2759            compression.
2760
2761            - If a node has no transitions used we mark its base as 0 and do not
2762            advance the pos pointer.
2763
2764            - If a node only has one transition we use a second pointer into the
2765            structure to fill in allocated fail transitions from other states.
2766            This pointer is independent of the main pointer and scans forward
2767            looking for null transitions that are allocated to a state. When it
2768            finds one it writes the single transition into the "hole".  If the
2769            pointer doesnt find one the single transition is appended as normal.
2770
2771            - Once compressed we can Renew/realloc the structures to release the
2772            excess space.
2773
2774            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2775            specifically Fig 3.47 and the associated pseudocode.
2776
2777            demq
2778         */
2779         const U32 laststate = TRIE_NODENUM( next_alloc );
2780         U32 state, charid;
2781         U32 pos = 0, zp=0;
2782         trie->statecount = laststate;
2783
2784         for ( state = 1 ; state < laststate ; state++ ) {
2785             U8 flag = 0;
2786             const U32 stateidx = TRIE_NODEIDX( state );
2787             const U32 o_used = trie->trans[ stateidx ].check;
2788             U32 used = trie->trans[ stateidx ].check;
2789             trie->trans[ stateidx ].check = 0;
2790
2791             for ( charid = 0;
2792                   used && charid < trie->uniquecharcount;
2793                   charid++ )
2794             {
2795                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2796                     if ( trie->trans[ stateidx + charid ].next ) {
2797                         if (o_used == 1) {
2798                             for ( ; zp < pos ; zp++ ) {
2799                                 if ( ! trie->trans[ zp ].next ) {
2800                                     break;
2801                                 }
2802                             }
2803                             trie->states[ state ].trans.base
2804                                                     = zp
2805                                                       + trie->uniquecharcount
2806                                                       - charid ;
2807                             trie->trans[ zp ].next
2808                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2809                                                              + charid ].next );
2810                             trie->trans[ zp ].check = state;
2811                             if ( ++zp > pos ) pos = zp;
2812                             break;
2813                         }
2814                         used--;
2815                     }
2816                     if ( !flag ) {
2817                         flag = 1;
2818                         trie->states[ state ].trans.base
2819                                        = pos + trie->uniquecharcount - charid ;
2820                     }
2821                     trie->trans[ pos ].next
2822                         = SAFE_TRIE_NODENUM(
2823                                        trie->trans[ stateidx + charid ].next );
2824                     trie->trans[ pos ].check = state;
2825                     pos++;
2826                 }
2827             }
2828         }
2829         trie->lasttrans = pos + 1;
2830         trie->states = (reg_trie_state *)
2831             PerlMemShared_realloc( trie->states, laststate
2832                                    * sizeof(reg_trie_state) );
2833         DEBUG_TRIE_COMPILE_MORE_r(
2834             PerlIO_printf( Perl_debug_log,
2835                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2836                 (int)depth * 2 + 2,"",
2837                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2838                        + 1 ),
2839                 (IV)next_alloc,
2840                 (IV)pos,
2841                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2842             );
2843
2844         } /* end table compress */
2845     }
2846     DEBUG_TRIE_COMPILE_MORE_r(
2847             PerlIO_printf(Perl_debug_log,
2848                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2849                 (int)depth * 2 + 2, "",
2850                 (UV)trie->statecount,
2851                 (UV)trie->lasttrans)
2852     );
2853     /* resize the trans array to remove unused space */
2854     trie->trans = (reg_trie_trans *)
2855         PerlMemShared_realloc( trie->trans, trie->lasttrans
2856                                * sizeof(reg_trie_trans) );
2857
2858     {   /* Modify the program and insert the new TRIE node */
2859         U8 nodetype =(U8)(flags & 0xFF);
2860         char *str=NULL;
2861
2862 #ifdef DEBUGGING
2863         regnode *optimize = NULL;
2864 #ifdef RE_TRACK_PATTERN_OFFSETS
2865
2866         U32 mjd_offset = 0;
2867         U32 mjd_nodelen = 0;
2868 #endif /* RE_TRACK_PATTERN_OFFSETS */
2869 #endif /* DEBUGGING */
2870         /*
2871            This means we convert either the first branch or the first Exact,
2872            depending on whether the thing following (in 'last') is a branch
2873            or not and whther first is the startbranch (ie is it a sub part of
2874            the alternation or is it the whole thing.)
2875            Assuming its a sub part we convert the EXACT otherwise we convert
2876            the whole branch sequence, including the first.
2877          */
2878         /* Find the node we are going to overwrite */
2879         if ( first != startbranch || OP( last ) == BRANCH ) {
2880             /* branch sub-chain */
2881             NEXT_OFF( first ) = (U16)(last - first);
2882 #ifdef RE_TRACK_PATTERN_OFFSETS
2883             DEBUG_r({
2884                 mjd_offset= Node_Offset((convert));
2885                 mjd_nodelen= Node_Length((convert));
2886             });
2887 #endif
2888             /* whole branch chain */
2889         }
2890 #ifdef RE_TRACK_PATTERN_OFFSETS
2891         else {
2892             DEBUG_r({
2893                 const  regnode *nop = NEXTOPER( convert );
2894                 mjd_offset= Node_Offset((nop));
2895                 mjd_nodelen= Node_Length((nop));
2896             });
2897         }
2898         DEBUG_OPTIMISE_r(
2899             PerlIO_printf(Perl_debug_log,
2900                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2901                 (int)depth * 2 + 2, "",
2902                 (UV)mjd_offset, (UV)mjd_nodelen)
2903         );
2904 #endif
2905         /* But first we check to see if there is a common prefix we can
2906            split out as an EXACT and put in front of the TRIE node.  */
2907         trie->startstate= 1;
2908         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2909             U32 state;
2910             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2911                 U32 ofs = 0;
2912                 I32 idx = -1;
2913                 U32 count = 0;
2914                 const U32 base = trie->states[ state ].trans.base;
2915
2916                 if ( trie->states[state].wordnum )
2917                         count = 1;
2918
2919                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2920                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2921                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2922                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2923                     {
2924                         if ( ++count > 1 ) {
2925                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2926                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2927                             if ( state == 1 ) break;
2928                             if ( count == 2 ) {
2929                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2930                                 DEBUG_OPTIMISE_r(
2931                                     PerlIO_printf(Perl_debug_log,
2932                                         "%*sNew Start State=%"UVuf" Class: [",
2933                                         (int)depth * 2 + 2, "",
2934                                         (UV)state));
2935                                 if (idx >= 0) {
2936                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2937                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2938
2939                                     TRIE_BITMAP_SET(trie,*ch);
2940                                     if ( folder )
2941                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2942                                     DEBUG_OPTIMISE_r(
2943                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2944                                     );
2945                                 }
2946                             }
2947                             TRIE_BITMAP_SET(trie,*ch);
2948                             if ( folder )
2949                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2950                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2951                         }
2952                         idx = ofs;
2953                     }
2954                 }
2955                 if ( count == 1 ) {
2956                     SV **tmp = av_fetch( revcharmap, idx, 0);
2957                     STRLEN len;
2958                     char *ch = SvPV( *tmp, len );
2959                     DEBUG_OPTIMISE_r({
2960                         SV *sv=sv_newmortal();
2961                         PerlIO_printf( Perl_debug_log,
2962                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2963                             (int)depth * 2 + 2, "",
2964                             (UV)state, (UV)idx,
2965                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2966                                 PL_colors[0], PL_colors[1],
2967                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2968                                 PERL_PV_ESCAPE_FIRSTCHAR
2969                             )
2970                         );
2971                     });
2972                     if ( state==1 ) {
2973                         OP( convert ) = nodetype;
2974                         str=STRING(convert);
2975                         STR_LEN(convert)=0;
2976                     }
2977                     STR_LEN(convert) += len;
2978                     while (len--)
2979                         *str++ = *ch++;
2980                 } else {
2981 #ifdef DEBUGGING
2982                     if (state>1)
2983                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2984 #endif
2985                     break;
2986                 }
2987             }
2988             trie->prefixlen = (state-1);
2989             if (str) {
2990                 regnode *n = convert+NODE_SZ_STR(convert);
2991                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2992                 trie->startstate = state;
2993                 trie->minlen -= (state - 1);
2994                 trie->maxlen -= (state - 1);
2995 #ifdef DEBUGGING
2996                /* At least the UNICOS C compiler choked on this
2997                 * being argument to DEBUG_r(), so let's just have
2998                 * it right here. */
2999                if (
3000 #ifdef PERL_EXT_RE_BUILD
3001                    1
3002 #else
3003                    DEBUG_r_TEST
3004 #endif
3005                    ) {
3006                    regnode *fix = convert;
3007                    U32 word = trie->wordcount;
3008                    mjd_nodelen++;
3009                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3010                    while( ++fix < n ) {
3011                        Set_Node_Offset_Length(fix, 0, 0);
3012                    }
3013                    while (word--) {
3014                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3015                        if (tmp) {
3016                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3017                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3018                            else
3019                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3020                        }
3021                    }
3022                }
3023 #endif
3024                 if (trie->maxlen) {
3025                     convert = n;
3026                 } else {
3027                     NEXT_OFF(convert) = (U16)(tail - convert);
3028                     DEBUG_r(optimize= n);
3029                 }
3030             }
3031         }
3032         if (!jumper)
3033             jumper = last;
3034         if ( trie->maxlen ) {
3035             NEXT_OFF( convert ) = (U16)(tail - convert);
3036             ARG_SET( convert, data_slot );
3037             /* Store the offset to the first unabsorbed branch in
3038                jump[0], which is otherwise unused by the jump logic.
3039                We use this when dumping a trie and during optimisation. */
3040             if (trie->jump)
3041                 trie->jump[0] = (U16)(nextbranch - convert);
3042
3043             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3044              *   and there is a bitmap
3045              *   and the first "jump target" node we found leaves enough room
3046              * then convert the TRIE node into a TRIEC node, with the bitmap
3047              * embedded inline in the opcode - this is hypothetically faster.
3048              */
3049             if ( !trie->states[trie->startstate].wordnum
3050                  && trie->bitmap
3051                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3052             {
3053                 OP( convert ) = TRIEC;
3054                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3055                 PerlMemShared_free(trie->bitmap);
3056                 trie->bitmap= NULL;
3057             } else
3058                 OP( convert ) = TRIE;
3059
3060             /* store the type in the flags */
3061             convert->flags = nodetype;
3062             DEBUG_r({
3063             optimize = convert
3064                       + NODE_STEP_REGNODE
3065                       + regarglen[ OP( convert ) ];
3066             });
3067             /* XXX We really should free up the resource in trie now,
3068                    as we won't use them - (which resources?) dmq */
3069         }
3070         /* needed for dumping*/
3071         DEBUG_r(if (optimize) {
3072             regnode *opt = convert;
3073
3074             while ( ++opt < optimize) {
3075                 Set_Node_Offset_Length(opt,0,0);
3076             }
3077             /*
3078                 Try to clean up some of the debris left after the
3079                 optimisation.
3080              */
3081             while( optimize < jumper ) {
3082                 mjd_nodelen += Node_Length((optimize));
3083                 OP( optimize ) = OPTIMIZED;
3084                 Set_Node_Offset_Length(optimize,0,0);
3085                 optimize++;
3086             }
3087             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3088         });
3089     } /* end node insert */
3090
3091     /*  Finish populating the prev field of the wordinfo array.  Walk back
3092      *  from each accept state until we find another accept state, and if
3093      *  so, point the first word's .prev field at the second word. If the
3094      *  second already has a .prev field set, stop now. This will be the
3095      *  case either if we've already processed that word's accept state,
3096      *  or that state had multiple words, and the overspill words were
3097      *  already linked up earlier.
3098      */
3099     {
3100         U16 word;
3101         U32 state;
3102         U16 prev;
3103
3104         for (word=1; word <= trie->wordcount; word++) {
3105             prev = 0;
3106             if (trie->wordinfo[word].prev)
3107                 continue;
3108             state = trie->wordinfo[word].accept;
3109             while (state) {
3110                 state = prev_states[state];
3111                 if (!state)
3112                     break;
3113                 prev = trie->states[state].wordnum;
3114                 if (prev)
3115                     break;
3116             }
3117             trie->wordinfo[word].prev = prev;
3118         }
3119         Safefree(prev_states);
3120     }
3121
3122
3123     /* and now dump out the compressed format */
3124     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3125
3126     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3127 #ifdef DEBUGGING
3128     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3129     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3130 #else
3131     SvREFCNT_dec_NN(revcharmap);
3132 #endif
3133     return trie->jump
3134            ? MADE_JUMP_TRIE
3135            : trie->startstate>1
3136              ? MADE_EXACT_TRIE
3137              : MADE_TRIE;
3138 }
3139
3140 STATIC regnode *
3141 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3142 {
3143 /* The Trie is constructed and compressed now so we can build a fail array if
3144  * it's needed
3145
3146    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3147    3.32 in the
3148    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3149    Ullman 1985/88
3150    ISBN 0-201-10088-6
3151
3152    We find the fail state for each state in the trie, this state is the longest
3153    proper suffix of the current state's 'word' that is also a proper prefix of
3154    another word in our trie. State 1 represents the word '' and is thus the
3155    default fail state. This allows the DFA not to have to restart after its
3156    tried and failed a word at a given point, it simply continues as though it
3157    had been matching the other word in the first place.
3158    Consider
3159       'abcdgu'=~/abcdefg|cdgu/
3160    When we get to 'd' we are still matching the first word, we would encounter
3161    'g' which would fail, which would bring us to the state representing 'd' in
3162    the second word where we would try 'g' and succeed, proceeding to match
3163    'cdgu'.
3164  */
3165  /* add a fail transition */
3166     const U32 trie_offset = ARG(source);
3167     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3168     U32 *q;
3169     const U32 ucharcount = trie->uniquecharcount;
3170     const U32 numstates = trie->statecount;
3171     const U32 ubound = trie->lasttrans + ucharcount;
3172     U32 q_read = 0;
3173     U32 q_write = 0;
3174     U32 charid;
3175     U32 base = trie->states[ 1 ].trans.base;
3176     U32 *fail;
3177     reg_ac_data *aho;
3178     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3179     regnode *stclass;
3180     GET_RE_DEBUG_FLAGS_DECL;
3181
3182     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3183     PERL_UNUSED_CONTEXT;
3184 #ifndef DEBUGGING
3185     PERL_UNUSED_ARG(depth);
3186 #endif
3187
3188     if ( OP(source) == TRIE ) {
3189         struct regnode_1 *op = (struct regnode_1 *)
3190             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3191         StructCopy(source,op,struct regnode_1);
3192         stclass = (regnode *)op;
3193     } else {
3194         struct regnode_charclass *op = (struct regnode_charclass *)
3195             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3196         StructCopy(source,op,struct regnode_charclass);
3197         stclass = (regnode *)op;
3198     }
3199     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3200
3201     ARG_SET( stclass, data_slot );
3202     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3203     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3204     aho->trie=trie_offset;
3205     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3206     Copy( trie->states, aho->states, numstates, reg_trie_state );
3207     Newxz( q, numstates, U32);
3208     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3209     aho->refcount = 1;
3210     fail = aho->fail;
3211     /* initialize fail[0..1] to be 1 so that we always have
3212        a valid final fail state */
3213     fail[ 0 ] = fail[ 1 ] = 1;
3214
3215     for ( charid = 0; charid < ucharcount ; charid++ ) {
3216         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3217         if ( newstate ) {
3218             q[ q_write ] = newstate;
3219             /* set to point at the root */
3220             fail[ q[ q_write++ ] ]=1;
3221         }
3222     }
3223     while ( q_read < q_write) {
3224         const U32 cur = q[ q_read++ % numstates ];
3225         base = trie->states[ cur ].trans.base;
3226
3227         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3228             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3229             if (ch_state) {
3230                 U32 fail_state = cur;
3231                 U32 fail_base;
3232                 do {
3233                     fail_state = fail[ fail_state ];
3234                     fail_base = aho->states[ fail_state ].trans.base;
3235                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3236
3237                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3238                 fail[ ch_state ] = fail_state;
3239                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3240                 {
3241                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3242                 }
3243                 q[ q_write++ % numstates] = ch_state;
3244             }
3245         }
3246     }
3247     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3248        when we fail in state 1, this allows us to use the
3249        charclass scan to find a valid start char. This is based on the principle
3250        that theres a good chance the string being searched contains lots of stuff
3251        that cant be a start char.
3252      */
3253     fail[ 0 ] = fail[ 1 ] = 0;
3254     DEBUG_TRIE_COMPILE_r({
3255         PerlIO_printf(Perl_debug_log,
3256                       "%*sStclass Failtable (%"UVuf" states): 0",
3257                       (int)(depth * 2), "", (UV)numstates
3258         );
3259         for( q_read=1; q_read<numstates; q_read++ ) {
3260             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3261         }
3262         PerlIO_printf(Perl_debug_log, "\n");
3263     });
3264     Safefree(q);
3265     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3266     return stclass;
3267 }
3268
3269
3270 #define DEBUG_PEEP(str,scan,depth) \
3271     DEBUG_OPTIMISE_r({if (scan){ \
3272        regnode *Next = regnext(scan); \
3273        regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3274        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3275            (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3276            Next ? (REG_NODE_NUM(Next)) : 0 ); \
3277        DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3278        PerlIO_printf(Perl_debug_log, "\n"); \
3279    }});
3280
3281 /* The below joins as many adjacent EXACTish nodes as possible into a single
3282  * one.  The regop may be changed if the node(s) contain certain sequences that
3283  * require special handling.  The joining is only done if:
3284  * 1) there is room in the current conglomerated node to entirely contain the
3285  *    next one.
3286  * 2) they are the exact same node type
3287  *
3288  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3289  * these get optimized out
3290  *
3291  * If a node is to match under /i (folded), the number of characters it matches
3292  * can be different than its character length if it contains a multi-character
3293  * fold.  *min_subtract is set to the total delta number of characters of the
3294  * input nodes.
3295  *
3296  * And *unfolded_multi_char is set to indicate whether or not the node contains
3297  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3298  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3299  * SMALL LETTER SHARP S, as only if the target string being matched against
3300  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3301  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3302  * whose components are all above the Latin1 range are not run-time locale
3303  * dependent, and have already been folded by the time this function is
3304  * called.)
3305  *
3306  * This is as good a place as any to discuss the design of handling these
3307  * multi-character fold sequences.  It's been wrong in Perl for a very long
3308  * time.  There are three code points in Unicode whose multi-character folds
3309  * were long ago discovered to mess things up.  The previous designs for
3310  * dealing with these involved assigning a special node for them.  This
3311  * approach doesn't always work, as evidenced by this example:
3312  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3313  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3314  * would match just the \xDF, it won't be able to handle the case where a
3315  * successful match would have to cross the node's boundary.  The new approach
3316  * that hopefully generally solves the problem generates an EXACTFU_SS node
3317  * that is "sss" in this case.
3318  *
3319  * It turns out that there are problems with all multi-character folds, and not
3320  * just these three.  Now the code is general, for all such cases.  The
3321  * approach taken is:
3322  * 1)   This routine examines each EXACTFish node that could contain multi-
3323  *      character folded sequences.  Since a single character can fold into
3324  *      such a sequence, the minimum match length for this node is less than
3325  *      the number of characters in the node.  This routine returns in
3326  *      *min_subtract how many characters to subtract from the the actual
3327  *      length of the string to get a real minimum match length; it is 0 if
3328  *      there are no multi-char foldeds.  This delta is used by the caller to
3329  *      adjust the min length of the match, and the delta between min and max,
3330  *      so that the optimizer doesn't reject these possibilities based on size
3331  *      constraints.
3332  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3333  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3334  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3335  *      there is a possible fold length change.  That means that a regular
3336  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3337  *      with length changes, and so can be processed faster.  regexec.c takes
3338  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3339  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3340  *      known until runtime).  This saves effort in regex matching.  However,
3341  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3342  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3343  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3344  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3345  *      possibilities for the non-UTF8 patterns are quite simple, except for
3346  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3347  *      members of a fold-pair, and arrays are set up for all of them so that
3348  *      the other member of the pair can be found quickly.  Code elsewhere in
3349  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3350  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3351  *      described in the next item.
3352  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3353  *      validity of the fold won't be known until runtime, and so must remain
3354  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3355  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3356  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3357  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3358  *      The reason this is a problem is that the optimizer part of regexec.c
3359  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3360  *      that a character in the pattern corresponds to at most a single
3361  *      character in the target string.  (And I do mean character, and not byte
3362  *      here, unlike other parts of the documentation that have never been
3363  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3364  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3365  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3366  *      nodes, violate the assumption, and they are the only instances where it
3367  *      is violated.  I'm reluctant to try to change the assumption, as the
3368  *      code involved is impenetrable to me (khw), so instead the code here
3369  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3370  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3371  *      boolean indicating whether or not the node contains such a fold.  When
3372  *      it is true, the caller sets a flag that later causes the optimizer in
3373  *      this file to not set values for the floating and fixed string lengths,
3374  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3375  *      assumption.  Thus, there is no optimization based on string lengths for
3376  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3377  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3378  *      assumption is wrong only in these cases is that all other non-UTF-8
3379  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3380  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3381  *      EXACTF nodes because we don't know at compile time if it actually
3382  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3383  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3384  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3385  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3386  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3387  *      string would require the pattern to be forced into UTF-8, the overhead
3388  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3389  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3390  *      locale.)
3391  *
3392  *      Similarly, the code that generates tries doesn't currently handle
3393  *      not-already-folded multi-char folds, and it looks like a pain to change
3394  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3395  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3396  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3397  *      using /iaa matching will be doing so almost entirely with ASCII
3398  *      strings, so this should rarely be encountered in practice */
3399
3400 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3401     if (PL_regkind[OP(scan)] == EXACT) \
3402         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3403
3404 STATIC U32
3405 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3406                    UV *min_subtract, bool *unfolded_multi_char,
3407                    U32 flags,regnode *val, U32 depth)
3408 {
3409     /* Merge several consecutive EXACTish nodes into one. */
3410     regnode *n = regnext(scan);
3411     U32 stringok = 1;
3412     regnode *next = scan + NODE_SZ_STR(scan);
3413     U32 merged = 0;
3414     U32 stopnow = 0;
3415 #ifdef DEBUGGING
3416     regnode *stop = scan;
3417     GET_RE_DEBUG_FLAGS_DECL;
3418 #else
3419     PERL_UNUSED_ARG(depth);
3420 #endif
3421
3422     PERL_ARGS_ASSERT_JOIN_EXACT;
3423 #ifndef EXPERIMENTAL_INPLACESCAN
3424     PERL_UNUSED_ARG(flags);
3425     PERL_UNUSED_ARG(val);
3426 #endif
3427     DEBUG_PEEP("join",scan,depth);
3428
3429     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3430      * EXACT ones that are mergeable to the current one. */
3431     while (n
3432            && (PL_regkind[OP(n)] == NOTHING
3433                || (stringok && OP(n) == OP(scan)))
3434            && NEXT_OFF(n)
3435            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3436     {
3437
3438         if (OP(n) == TAIL || n > next)
3439             stringok = 0;
3440         if (PL_regkind[OP(n)] == NOTHING) {
3441             DEBUG_PEEP("skip:",n,depth);
3442             NEXT_OFF(scan) += NEXT_OFF(n);
3443             next = n + NODE_STEP_REGNODE;
3444 #ifdef DEBUGGING
3445             if (stringok)
3446                 stop = n;
3447 #endif
3448             n = regnext(n);
3449         }
3450         else if (stringok) {
3451             const unsigned int oldl = STR_LEN(scan);
3452             regnode * const nnext = regnext(n);
3453
3454             /* XXX I (khw) kind of doubt that this works on platforms (should
3455              * Perl ever run on one) where U8_MAX is above 255 because of lots
3456              * of other assumptions */
3457             /* Don't join if the sum can't fit into a single node */
3458             if (oldl + STR_LEN(n) > U8_MAX)
3459                 break;
3460
3461             DEBUG_PEEP("merg",n,depth);
3462             merged++;
3463
3464             NEXT_OFF(scan) += NEXT_OFF(n);
3465             STR_LEN(scan) += STR_LEN(n);
3466             next = n + NODE_SZ_STR(n);
3467             /* Now we can overwrite *n : */
3468             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3469 #ifdef DEBUGGING
3470             stop = next - 1;
3471 #endif
3472             n = nnext;
3473             if (stopnow) break;
3474         }
3475
3476 #ifdef EXPERIMENTAL_INPLACESCAN
3477         if (flags && !NEXT_OFF(n)) {
3478             DEBUG_PEEP("atch", val, depth);
3479             if (reg_off_by_arg[OP(n)]) {
3480                 ARG_SET(n, val - n);
3481             }
3482             else {
3483                 NEXT_OFF(n) = val - n;
3484             }
3485             stopnow = 1;
3486         }
3487 #endif
3488     }
3489
3490     *min_subtract = 0;
3491     *unfolded_multi_char = FALSE;
3492
3493     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3494      * can now analyze for sequences of problematic code points.  (Prior to
3495      * this final joining, sequences could have been split over boundaries, and
3496      * hence missed).  The sequences only happen in folding, hence for any
3497      * non-EXACT EXACTish node */
3498     if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3499         U8* s0 = (U8*) STRING(scan);
3500         U8* s = s0;
3501         U8* s_end = s0 + STR_LEN(scan);
3502
3503         int total_count_delta = 0;  /* Total delta number of characters that
3504                                        multi-char folds expand to */
3505
3506         /* One pass is made over the node's string looking for all the
3507          * possibilities.  To avoid some tests in the loop, there are two main
3508          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3509          * non-UTF-8 */
3510         if (UTF) {
3511             U8* folded = NULL;
3512
3513             if (OP(scan) == EXACTFL) {
3514                 U8 *d;
3515
3516                 /* An EXACTFL node would already have been changed to another
3517                  * node type unless there is at least one character in it that
3518                  * is problematic; likely a character whose fold definition
3519                  * won't be known until runtime, and so has yet to be folded.
3520                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3521                  * to handle the UTF-8 case, we need to create a temporary
3522                  * folded copy using UTF-8 locale rules in order to analyze it.
3523                  * This is because our macros that look to see if a sequence is
3524                  * a multi-char fold assume everything is folded (otherwise the
3525                  * tests in those macros would be too complicated and slow).
3526                  * Note that here, the non-problematic folds will have already
3527                  * been done, so we can just copy such characters.  We actually
3528                  * don't completely fold the EXACTFL string.  We skip the
3529                  * unfolded multi-char folds, as that would just create work
3530                  * below to figure out the size they already are */
3531
3532                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3533                 d = folded;
3534                 while (s < s_end) {
3535                     STRLEN s_len = UTF8SKIP(s);
3536                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3537                         Copy(s, d, s_len, U8);
3538                         d += s_len;
3539                     }
3540                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3541                         *unfolded_multi_char = TRUE;
3542                         Copy(s, d, s_len, U8);
3543                         d += s_len;
3544                     }
3545                     else if (isASCII(*s)) {
3546                         *(d++) = toFOLD(*s);
3547                     }
3548                     else {
3549                         STRLEN len;
3550                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3551                         d += len;
3552                     }
3553                     s += s_len;
3554                 }
3555
3556                 /* Point the remainder of the routine to look at our temporary
3557                  * folded copy */
3558                 s = folded;
3559                 s_end = d;
3560             } /* End of creating folded copy of EXACTFL string */
3561
3562             /* Examine the string for a multi-character fold sequence.  UTF-8
3563              * patterns have all characters pre-folded by the time this code is
3564              * executed */
3565             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3566                                      length sequence we are looking for is 2 */
3567             {
3568                 int count = 0;  /* How many characters in a multi-char fold */
3569                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3570                 if (! len) {    /* Not a multi-char fold: get next char */
3571                     s += UTF8SKIP(s);
3572                     continue;
3573                 }
3574
3575                 /* Nodes with 'ss' require special handling, except for
3576                  * EXACTFA-ish for which there is no multi-char fold to this */
3577                 if (len == 2 && *s == 's' && *(s+1) == 's'
3578                     && OP(scan) != EXACTFA
3579                     && OP(scan) != EXACTFA_NO_TRIE)
3580                 {
3581                     count = 2;
3582                     if (OP(scan) != EXACTFL) {
3583                         OP(scan) = EXACTFU_SS;
3584                     }
3585                     s += 2;
3586                 }
3587                 else { /* Here is a generic multi-char fold. */
3588                     U8* multi_end  = s + len;
3589
3590                     /* Count how many characters are in it.  In the case of
3591                      * /aa, no folds which contain ASCII code points are
3592                      * allowed, so check for those, and skip if found. */
3593                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3594                         count = utf8_length(s, multi_end);
3595                         s = multi_end;
3596                     }
3597                     else {
3598                         while (s < multi_end) {
3599                             if (isASCII(*s)) {
3600                                 s++;
3601                                 goto next_iteration;
3602                             }
3603                             else {
3604                                 s += UTF8SKIP(s);
3605                             }
3606                             count++;
3607                         }
3608                     }
3609                 }
3610
3611                 /* The delta is how long the sequence is minus 1 (1 is how long
3612                  * the character that folds to the sequence is) */
3613                 total_count_delta += count - 1;
3614               next_iteration: ;
3615             }
3616
3617             /* We created a temporary folded copy of the string in EXACTFL
3618              * nodes.  Therefore we need to be sure it doesn't go below zero,
3619              * as the real string could be shorter */
3620             if (OP(scan) == EXACTFL) {
3621                 int total_chars = utf8_length((U8*) STRING(scan),
3622                                            (U8*) STRING(scan) + STR_LEN(scan));
3623                 if (total_count_delta > total_chars) {
3624                     total_count_delta = total_chars;
3625                 }
3626             }
3627
3628             *min_subtract += total_count_delta;
3629             Safefree(folded);
3630         }
3631         else if (OP(scan) == EXACTFA) {
3632
3633             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3634              * fold to the ASCII range (and there are no existing ones in the
3635              * upper latin1 range).  But, as outlined in the comments preceding
3636              * this function, we need to flag any occurrences of the sharp s.
3637              * This character forbids trie formation (because of added
3638              * complexity) */
3639             while (s < s_end) {
3640                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3641                     OP(scan) = EXACTFA_NO_TRIE;
3642                     *unfolded_multi_char = TRUE;
3643                     break;
3644                 }
3645                 s++;
3646                 continue;
3647             }
3648         }
3649         else {
3650
3651             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3652              * folds that are all Latin1.  As explained in the comments
3653              * preceding this function, we look also for the sharp s in EXACTF
3654              * and EXACTFL nodes; it can be in the final position.  Otherwise
3655              * we can stop looking 1 byte earlier because have to find at least
3656              * two characters for a multi-fold */
3657             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3658                               ? s_end
3659                               : s_end -1;
3660
3661             while (s < upper) {
3662                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3663                 if (! len) {    /* Not a multi-char fold. */
3664                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3665                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3666                     {
3667                         *unfolded_multi_char = TRUE;
3668                     }
3669                     s++;
3670                     continue;
3671                 }
3672
3673                 if (len == 2
3674                     && isALPHA_FOLD_EQ(*s, 's')
3675                     && isALPHA_FOLD_EQ(*(s+1), 's'))
3676                 {
3677
3678                     /* EXACTF nodes need to know that the minimum length
3679                      * changed so that a sharp s in the string can match this
3680                      * ss in the pattern, but they remain EXACTF nodes, as they
3681                      * won't match this unless the target string is is UTF-8,
3682                      * which we don't know until runtime.  EXACTFL nodes can't
3683                      * transform into EXACTFU nodes */
3684                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3685                         OP(scan) = EXACTFU_SS;
3686                     }
3687                 }
3688
3689                 *min_subtract += len - 1;
3690                 s += len;
3691             }
3692         }
3693     }
3694
3695 #ifdef DEBUGGING
3696     /* Allow dumping but overwriting the collection of skipped
3697      * ops and/or strings with fake optimized ops */
3698     n = scan + NODE_SZ_STR(scan);
3699     while (n <= stop) {
3700         OP(n) = OPTIMIZED;
3701         FLAGS(n) = 0;
3702         NEXT_OFF(n) = 0;
3703         n++;
3704     }
3705 #endif
3706     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3707     return stopnow;
3708 }
3709
3710 /* REx optimizer.  Converts nodes into quicker variants "in place".
3711    Finds fixed substrings.  */
3712
3713 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3714    to the position after last scanned or to NULL. */
3715
3716 #define INIT_AND_WITHP \
3717     assert(!and_withp); \
3718     Newx(and_withp,1, regnode_ssc); \
3719     SAVEFREEPV(and_withp)
3720
3721
3722 static void
3723 S_unwind_scan_frames(pTHX_ const void *p)
3724 {
3725     scan_frame *f= (scan_frame *)p;
3726     do {
3727         scan_frame *n= f->next_frame;
3728         Safefree(f);
3729         f= n;
3730     } while (f);
3731 }
3732
3733
3734 STATIC SSize_t
3735 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3736                         SSize_t *minlenp, SSize_t *deltap,
3737                         regnode *last,
3738                         scan_data_t *data,
3739                         I32 stopparen,
3740                         U32 recursed_depth,
3741                         regnode_ssc *and_withp,
3742                         U32 flags, U32 depth)
3743                         /* scanp: Start here (read-write). */
3744                         /* deltap: Write maxlen-minlen here. */
3745                         /* last: Stop before this one. */
3746                         /* data: string data about the pattern */
3747                         /* stopparen: treat close N as END */
3748                         /* recursed: which subroutines have we recursed into */
3749                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3750 {
3751     /* There must be at least this number of characters to match */
3752     SSize_t min = 0;
3753     I32 pars = 0, code;
3754     regnode *scan = *scanp, *next;
3755     SSize_t delta = 0;
3756     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3757     int is_inf_internal = 0;            /* The studied chunk is infinite */
3758     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3759     scan_data_t data_fake;
3760     SV *re_trie_maxbuff = NULL;
3761     regnode *first_non_open = scan;
3762     SSize_t stopmin = SSize_t_MAX;
3763     scan_frame *frame = NULL;
3764     GET_RE_DEBUG_FLAGS_DECL;
3765
3766     PERL_ARGS_ASSERT_STUDY_CHUNK;
3767
3768
3769     if ( depth == 0 ) {
3770         while (first_non_open && OP(first_non_open) == OPEN)
3771             first_non_open=regnext(first_non_open);
3772     }
3773
3774
3775   fake_study_recurse:
3776     DEBUG_r(
3777         RExC_study_chunk_recursed_count++;
3778     );
3779     DEBUG_OPTIMISE_MORE_r(
3780     {
3781         PerlIO_printf(Perl_debug_log,
3782             "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3783             (int)(depth*2), "", (long)stopparen,
3784             (unsigned long)RExC_study_chunk_recursed_count,
3785             (unsigned long)depth, (unsigned long)recursed_depth,
3786             scan,
3787             last);
3788         if (recursed_depth) {
3789             U32 i;
3790             U32 j;
3791             for ( j = 0 ; j < recursed_depth ; j++ ) {
3792                 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3793                     if (
3794                         PAREN_TEST(RExC_study_chunk_recursed +
3795                                    ( j * RExC_study_chunk_recursed_bytes), i )
3796                         && (
3797                             !j ||
3798                             !PAREN_TEST(RExC_study_chunk_recursed +
3799                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3800                         )
3801                     ) {
3802                         PerlIO_printf(Perl_debug_log," %d",(int)i);
3803                         break;
3804                     }
3805                 }
3806                 if ( j + 1 < recursed_depth ) {
3807                     PerlIO_printf(Perl_debug_log, ",");
3808                 }
3809             }
3810         }
3811         PerlIO_printf(Perl_debug_log,"\n");
3812     }
3813     );
3814     while ( scan && OP(scan) != END && scan < last ){
3815         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3816                                    node length to get a real minimum (because
3817                                    the folded version may be shorter) */
3818         bool unfolded_multi_char = FALSE;
3819         /* Peephole optimizer: */
3820         DEBUG_STUDYDATA("Peep:", data, depth);
3821         DEBUG_PEEP("Peep", scan, depth);
3822
3823
3824         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3825          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3826          * by a different invocation of reg() -- Yves
3827          */
3828         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3829
3830         /* Follow the next-chain of the current node and optimize
3831            away all the NOTHINGs from it.  */
3832         if (OP(scan) != CURLYX) {
3833             const int max = (reg_off_by_arg[OP(scan)]
3834                        ? I32_MAX
3835                        /* I32 may be smaller than U16 on CRAYs! */
3836                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3837             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3838             int noff;
3839             regnode *n = scan;
3840
3841             /* Skip NOTHING and LONGJMP. */
3842             while ((n = regnext(n))
3843                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3844                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3845                    && off + noff < max)
3846                 off += noff;
3847             if (reg_off_by_arg[OP(scan)])
3848                 ARG(scan) = off;
3849             else
3850                 NEXT_OFF(scan) = off;
3851         }
3852
3853         /* The principal pseudo-switch.  Cannot be a switch, since we
3854            look into several different things.  */
3855         if ( OP(scan) == DEFINEP ) {
3856             SSize_t minlen = 0;
3857             SSize_t deltanext = 0;
3858             SSize_t fake_last_close = 0;
3859             I32 f = SCF_IN_DEFINE;
3860
3861             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3862             scan = regnext(scan);
3863             assert( OP(scan) == IFTHEN );
3864             DEBUG_PEEP("expect IFTHEN", scan, depth);
3865
3866             data_fake.last_closep= &fake_last_close;
3867             minlen = *minlenp;
3868             next = regnext(scan);
3869             scan = NEXTOPER(NEXTOPER(scan));
3870             DEBUG_PEEP("scan", scan, depth);
3871             DEBUG_PEEP("next", next, depth);
3872
3873             /* we suppose the run is continuous, last=next...
3874              * NOTE we dont use the return here! */
3875             (void)study_chunk(pRExC_state, &scan, &minlen,
3876                               &deltanext, next, &data_fake, stopparen,
3877                               recursed_depth, NULL, f, depth+1);
3878
3879             scan = next;
3880         } else
3881         if (
3882             OP(scan) == BRANCH  ||
3883             OP(scan) == BRANCHJ ||
3884             OP(scan) == IFTHEN
3885         ) {
3886             next = regnext(scan);
3887             code = OP(scan);
3888
3889             /* The op(next)==code check below is to see if we
3890              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3891              * IFTHEN is special as it might not appear in pairs.
3892              * Not sure whether BRANCH-BRANCHJ is possible, regardless
3893              * we dont handle it cleanly. */
3894             if (OP(next) == code || code == IFTHEN) {
3895                 /* NOTE - There is similar code to this block below for
3896                  * handling TRIE nodes on a re-study.  If you change stuff here
3897                  * check there too. */
3898                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3899                 regnode_ssc accum;
3900                 regnode * const startbranch=scan;
3901
3902                 if (flags & SCF_DO_SUBSTR) {
3903                     /* Cannot merge strings after this. */
3904                     scan_commit(pRExC_state, data, minlenp, is_inf);
3905                 }
3906
3907                 if (flags & SCF_DO_STCLASS)
3908                     ssc_init_zero(pRExC_state, &accum);
3909
3910                 while (OP(scan) == code) {
3911                     SSize_t deltanext, minnext, fake;
3912                     I32 f = 0;
3913                     regnode_ssc this_class;
3914
3915                     DEBUG_PEEP("Branch", scan, depth);
3916
3917                     num++;
3918                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3919                     if (data) {
3920                         data_fake.whilem_c = data->whilem_c;
3921                         data_fake.last_closep = data->last_closep;
3922                     }
3923                     else
3924                         data_fake.last_closep = &fake;
3925
3926                     data_fake.pos_delta = delta;
3927                     next = regnext(scan);
3928
3929                     scan = NEXTOPER(scan); /* everything */
3930                     if (code != BRANCH)    /* everything but BRANCH */
3931                         scan = NEXTOPER(scan);
3932
3933                     if (flags & SCF_DO_STCLASS) {
3934                         ssc_init(pRExC_state, &this_class);
3935                         data_fake.start_class = &this_class;
3936                         f = SCF_DO_STCLASS_AND;
3937                     }
3938                     if (flags & SCF_WHILEM_VISITED_POS)
3939                         f |= SCF_WHILEM_VISITED_POS;
3940
3941                     /* we suppose the run is continuous, last=next...*/
3942                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3943                                       &deltanext, next, &data_fake, stopparen,
3944                                       recursed_depth, NULL, f,depth+1);
3945
3946                     if (min1 > minnext)
3947                         min1 = minnext;
3948                     if (deltanext == SSize_t_MAX) {
3949                         is_inf = is_inf_internal = 1;
3950                         max1 = SSize_t_MAX;
3951                     } else if (max1 < minnext + deltanext)
3952                         max1 = minnext + deltanext;
3953                     scan = next;
3954                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3955                         pars++;
3956                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3957                         if ( stopmin > minnext)
3958                             stopmin = min + min1;
3959                         flags &= ~SCF_DO_SUBSTR;
3960                         if (data)
3961                             data->flags |= SCF_SEEN_ACCEPT;
3962                     }
3963                     if (data) {
3964                         if (data_fake.flags & SF_HAS_EVAL)
3965                             data->flags |= SF_HAS_EVAL;
3966                         data->whilem_c = data_fake.whilem_c;
3967                     }
3968                     if (flags & SCF_DO_STCLASS)
3969                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3970                 }
3971                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3972                     min1 = 0;
3973                 if (flags & SCF_DO_SUBSTR) {
3974                     data->pos_min += min1;
3975                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3976                         data->pos_delta = SSize_t_MAX;
3977                     else
3978                         data->pos_delta += max1 - min1;
3979                     if (max1 != min1 || is_inf)
3980                         data->longest = &(data->longest_float);
3981                 }
3982                 min += min1;
3983                 if (delta == SSize_t_MAX
3984                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3985                     delta = SSize_t_MAX;
3986                 else
3987                     delta += max1 - min1;
3988                 if (flags & SCF_DO_STCLASS_OR) {
3989                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3990                     if (min1) {
3991                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3992                         flags &= ~SCF_DO_STCLASS;
3993                     }
3994                 }
3995                 else if (flags & SCF_DO_STCLASS_AND) {
3996                     if (min1) {
3997                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3998                         flags &= ~SCF_DO_STCLASS;
3999                     }
4000                     else {
4001                         /* Switch to OR mode: cache the old value of
4002                          * data->start_class */
4003                         INIT_AND_WITHP;
4004                         StructCopy(data->start_class, and_withp, regnode_ssc);
4005                         flags &= ~SCF_DO_STCLASS_AND;
4006                         StructCopy(&accum, data->start_class, regnode_ssc);
4007                         flags |= SCF_DO_STCLASS_OR;
4008                     }
4009                 }
4010
4011                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4012                         OP( startbranch ) == BRANCH )
4013                 {
4014                 /* demq.
4015
4016                    Assuming this was/is a branch we are dealing with: 'scan'
4017                    now points at the item that follows the branch sequence,
4018                    whatever it is. We now start at the beginning of the
4019                    sequence and look for subsequences of
4020
4021                    BRANCH->EXACT=>x1
4022                    BRANCH->EXACT=>x2
4023                    tail
4024
4025                    which would be constructed from a pattern like
4026                    /A|LIST|OF|WORDS/
4027
4028                    If we can find such a subsequence we need to turn the first
4029                    element into a trie and then add the subsequent branch exact
4030                    strings to the trie.
4031
4032                    We have two cases
4033
4034                      1. patterns where the whole set of branches can be
4035                         converted.
4036
4037                      2. patterns where only a subset can be converted.
4038
4039                    In case 1 we can replace the whole set with a single regop
4040                    for the trie. In case 2 we need to keep the start and end
4041                    branches so
4042
4043                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4044                      becomes BRANCH TRIE; BRANCH X;
4045
4046                   There is an additional case, that being where there is a
4047                   common prefix, which gets split out into an EXACT like node
4048                   preceding the TRIE node.
4049
4050                   If x(1..n)==tail then we can do a simple trie, if not we make
4051                   a "jump" trie, such that when we match the appropriate word
4052                   we "jump" to the appropriate tail node. Essentially we turn
4053                   a nested if into a case structure of sorts.
4054
4055                 */
4056
4057                     int made=0;
4058                     if (!re_trie_maxbuff) {
4059                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4060                         if (!SvIOK(re_trie_maxbuff))
4061                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4062                     }
4063                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4064                         regnode *cur;
4065                         regnode *first = (regnode *)NULL;
4066                         regnode *last = (regnode *)NULL;
4067                         regnode *tail = scan;
4068                         U8 trietype = 0;
4069                         U32 count=0;
4070
4071                         /* var tail is used because there may be a TAIL
4072                            regop in the way. Ie, the exacts will point to the
4073                            thing following the TAIL, but the last branch will
4074                            point at the TAIL. So we advance tail. If we
4075                            have nested (?:) we may have to move through several
4076                            tails.
4077                          */
4078
4079                         while ( OP( tail ) == TAIL ) {
4080                             /* this is the TAIL generated by (?:) */
4081                             tail = regnext( tail );
4082                         }
4083
4084
4085                         DEBUG_TRIE_COMPILE_r({
4086                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4087                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4088                               (int)depth * 2 + 2, "",
4089                               "Looking for TRIE'able sequences. Tail node is: ",
4090                               SvPV_nolen_const( RExC_mysv )
4091                             );
4092                         });
4093
4094                         /*
4095
4096                             Step through the branches
4097                                 cur represents each branch,
4098                                 noper is the first thing to be matched as part
4099                                       of that branch
4100                                 noper_next is the regnext() of that node.
4101
4102                             We normally handle a case like this
4103                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4104                             support building with NOJUMPTRIE, which restricts
4105                             the trie logic to structures like /FOO|BAR/.
4106
4107                             If noper is a trieable nodetype then the branch is
4108                             a possible optimization target. If we are building
4109                             under NOJUMPTRIE then we require that noper_next is
4110                             the same as scan (our current position in the regex
4111                             program).
4112
4113                             Once we have two or more consecutive such branches
4114                             we can create a trie of the EXACT's contents and
4115                             stitch it in place into the program.
4116
4117                             If the sequence represents all of the branches in
4118                             the alternation we replace the entire thing with a
4119                             single TRIE node.
4120
4121                             Otherwise when it is a subsequence we need to
4122                             stitch it in place and replace only the relevant
4123                             branches. This means the first branch has to remain
4124                             as it is used by the alternation logic, and its
4125                             next pointer, and needs to be repointed at the item
4126                             on the branch chain following the last branch we
4127                             have optimized away.
4128
4129                             This could be either a BRANCH, in which case the
4130                             subsequence is internal, or it could be the item
4131                             following the branch sequence in which case the
4132                             subsequence is at the end (which does not
4133                             necessarily mean the first node is the start of the
4134                             alternation).
4135
4136                             TRIE_TYPE(X) is a define which maps the optype to a
4137                             trietype.
4138
4139                                 optype          |  trietype
4140                                 ----------------+-----------
4141                                 NOTHING         | NOTHING
4142                                 EXACT           | EXACT
4143                                 EXACTFU         | EXACTFU
4144                                 EXACTFU_SS      | EXACTFU
4145                                 EXACTFA         | EXACTFA
4146                                 EXACTL          | EXACTL
4147                                 EXACTFLU8       | EXACTFLU8
4148
4149
4150                         */
4151 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4152                        ? NOTHING                                            \
4153                        : ( EXACT == (X) )                                   \
4154                          ? EXACT                                            \
4155                          : ( EXACTFU == (X) || EXACTFU_SS == (X) )          \
4156                            ? EXACTFU                                        \
4157                            : ( EXACTFA == (X) )                             \
4158                              ? EXACTFA                                      \
4159                              : ( EXACTL == (X) )                            \
4160                                ? EXACTL                                     \
4161                                : ( EXACTFLU8 == (X) )                        \
4162                                  ? EXACTFLU8                                 \
4163                                  : 0 )
4164
4165                         /* dont use tail as the end marker for this traverse */
4166                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4167                             regnode * const noper = NEXTOPER( cur );
4168                             U8 noper_type = OP( noper );
4169                             U8 noper_trietype = TRIE_TYPE( noper_type );
4170 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4171                             regnode * const noper_next = regnext( noper );
4172                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4173                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4174 #endif
4175
4176                             DEBUG_TRIE_COMPILE_r({
4177                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4178                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4179                                    (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4180
4181                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4182                                 PerlIO_printf( Perl_debug_log, " -> %s",
4183                                     SvPV_nolen_const(RExC_mysv));
4184
4185                                 if ( noper_next ) {
4186                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4187                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4188                                     SvPV_nolen_const(RExC_mysv));
4189                                 }
4190                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4191                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4192                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4193                                 );
4194                             });
4195
4196                             /* Is noper a trieable nodetype that can be merged
4197                              * with the current trie (if there is one)? */
4198                             if ( noper_trietype
4199                                   &&
4200                                   (
4201                                         ( noper_trietype == NOTHING)
4202                                         || ( trietype == NOTHING )
4203                                         || ( trietype == noper_trietype )
4204                                   )
4205 #ifdef NOJUMPTRIE
4206                                   && noper_next == tail
4207 #endif
4208                                   && count < U16_MAX)
4209                             {
4210                                 /* Handle mergable triable node Either we are
4211                                  * the first node in a new trieable sequence,
4212                                  * in which case we do some bookkeeping,
4213                                  * otherwise we update the end pointer. */
4214                                 if ( !first ) {
4215                                     first = cur;
4216                                     if ( noper_trietype == NOTHING ) {
4217 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4218                                         regnode * const noper_next = regnext( noper );
4219                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4220                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4221 #endif
4222
4223                                         if ( noper_next_trietype ) {
4224                                             trietype = noper_next_trietype;
4225                                         } else if (noper_next_type)  {
4226                                             /* a NOTHING regop is 1 regop wide.
4227                                              * We need at least two for a trie
4228                                              * so we can't merge this in */
4229                                             first = NULL;
4230                                         }
4231                                     } else {
4232                                         trietype = noper_trietype;
4233                                     }
4234                                 } else {
4235                                     if ( trietype == NOTHING )
4236                                         trietype = noper_trietype;
4237                                     last = cur;
4238                                 }
4239                                 if (first)
4240                                     count++;
4241                             } /* end handle mergable triable node */
4242                             else {
4243                                 /* handle unmergable node -
4244                                  * noper may either be a triable node which can
4245                                  * not be tried together with the current trie,
4246                                  * or a non triable node */
4247                                 if ( last ) {
4248                                     /* If last is set and trietype is not
4249                                      * NOTHING then we have found at least two
4250                                      * triable branch sequences in a row of a
4251                                      * similar trietype so we can turn them
4252                                      * into a trie. If/when we allow NOTHING to
4253                                      * start a trie sequence this condition
4254                                      * will be required, and it isn't expensive
4255                                      * so we leave it in for now. */
4256                                     if ( trietype && trietype != NOTHING )
4257                                         make_trie( pRExC_state,
4258                                                 startbranch, first, cur, tail,
4259                                                 count, trietype, depth+1 );
4260                                     last = NULL; /* note: we clear/update
4261                                                     first, trietype etc below,
4262                                                     so we dont do it here */
4263                                 }
4264                                 if ( noper_trietype
4265 #ifdef NOJUMPTRIE
4266                                      && noper_next == tail
4267 #endif
4268                                 ){
4269                                     /* noper is triable, so we can start a new
4270                                      * trie sequence */
4271                                     count = 1;
4272                                     first = cur;
4273                                     trietype = noper_trietype;
4274                                 } else if (first) {
4275                                     /* if we already saw a first but the
4276                                      * current node is not triable then we have
4277                                      * to reset the first information. */
4278                                     count = 0;
4279                                     first = NULL;
4280                                     trietype = 0;
4281                                 }
4282                             } /* end handle unmergable node */
4283                         } /* loop over branches */
4284                         DEBUG_TRIE_COMPILE_r({
4285                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4286                             PerlIO_printf( Perl_debug_log,
4287                               "%*s- %s (%d) <SCAN FINISHED>\n",
4288                               (int)depth * 2 + 2,
4289                               "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4290
4291                         });
4292                         if ( last && trietype ) {
4293                             if ( trietype != NOTHING ) {
4294                                 /* the last branch of the sequence was part of
4295                                  * a trie, so we have to construct it here
4296                                  * outside of the loop */
4297                                 made= make_trie( pRExC_state, startbranch,
4298                                                  first, scan, tail, count,
4299                                                  trietype, depth+1 );
4300 #ifdef TRIE_STUDY_OPT
4301                                 if ( ((made == MADE_EXACT_TRIE &&
4302                                      startbranch == first)
4303                                      || ( first_non_open == first )) &&
4304                                      depth==0 ) {
4305                                     flags |= SCF_TRIE_RESTUDY;
4306                                     if ( startbranch == first
4307                                          && scan == tail )
4308                                     {
4309                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4310                                     }
4311                                 }
4312 #endif
4313                             } else {
4314                                 /* at this point we know whatever we have is a
4315                                  * NOTHING sequence/branch AND if 'startbranch'
4316                                  * is 'first' then we can turn the whole thing
4317                                  * into a NOTHING
4318                                  */
4319                                 if ( startbranch == first ) {
4320                                     regnode *opt;
4321                                     /* the entire thing is a NOTHING sequence,
4322                                      * something like this: (?:|) So we can
4323                                      * turn it into a plain NOTHING op. */
4324                                     DEBUG_TRIE_COMPILE_r({
4325                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4326                                         PerlIO_printf( Perl_debug_log,
4327                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4328                                           "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4329
4330                                     });
4331                                     OP(startbranch)= NOTHING;
4332                                     NEXT_OFF(startbranch)= tail - startbranch;
4333                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4334                                         OP(opt)= OPTIMIZED;
4335                                 }
4336                             }
4337                         } /* end if ( last) */
4338                     } /* TRIE_MAXBUF is non zero */
4339
4340                 } /* do trie */
4341
4342             }
4343             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4344                 scan = NEXTOPER(NEXTOPER(scan));
4345             } else                      /* single branch is optimized. */
4346                 scan = NEXTOPER(scan);
4347             continue;
4348         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4349             I32 paren = 0;
4350             regnode *start = NULL;
4351             regnode *end = NULL;
4352             U32 my_recursed_depth= recursed_depth;
4353
4354
4355             if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4356                 /* Do setup, note this code has side effects beyond
4357                  * the rest of this block. Specifically setting
4358                  * RExC_recurse[] must happen at least once during
4359                  * study_chunk(). */
4360                 if (OP(scan) == GOSUB) {
4361                     paren = ARG(scan);
4362                     RExC_recurse[ARG2L(scan)] = scan;
4363                     start = RExC_open_parens[paren-1];
4364                     end   = RExC_close_parens[paren-1];
4365                 } else {
4366                     start = RExC_rxi->program + 1;
4367                     end   = RExC_opend;
4368                 }
4369                 /* NOTE we MUST always execute the above code, even
4370                  * if we do nothing with a GOSUB/GOSTART */
4371                 if (
4372                     ( flags & SCF_IN_DEFINE )
4373                     ||
4374                     (
4375                         (is_inf_internal || is_inf || data->flags & SF_IS_INF)
4376                         &&
4377                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4378                     )
4379                 ) {
4380                     /* no need to do anything here if we are in a define. */
4381                     /* or we are after some kind of infinite construct
4382                      * so we can skip recursing into this item.
4383                      * Since it is infinite we will not change the maxlen
4384                      * or delta, and if we miss something that might raise
4385                      * the minlen it will merely pessimise a little.
4386                      *
4387                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4388                      * might result in a minlen of 1 and not of 4,
4389                      * but this doesn't make us mismatch, just try a bit
4390                      * harder than we should.
4391                      * */
4392                     scan= regnext(scan);
4393                     continue;
4394                 }
4395
4396                 if (
4397                     !recursed_depth
4398                     ||
4399                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4400                 ) {
4401                     /* it is quite possible that there are more efficient ways
4402                      * to do this. We maintain a bitmap per level of recursion
4403                      * of which patterns we have entered so we can detect if a
4404                      * pattern creates a possible infinite loop. When we
4405                      * recurse down a level we copy the previous levels bitmap
4406                      * down. When we are at recursion level 0 we zero the top
4407                      * level bitmap. It would be nice to implement a different
4408                      * more efficient way of doing this. In particular the top
4409                      * level bitmap may be unnecessary.
4410                      */
4411                     if (!recursed_depth) {
4412                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4413                     } else {
4414                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4415                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4416                              RExC_study_chunk_recursed_bytes, U8);
4417                     }
4418                     /* we havent recursed into this paren yet, so recurse into it */
4419                     DEBUG_STUDYDATA("set:", data,depth);
4420                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4421                     my_recursed_depth= recursed_depth + 1;
4422                 } else {
4423                     DEBUG_STUDYDATA("inf:", data,depth);
4424                     /* some form of infinite recursion, assume infinite length
4425                      * */
4426                     if (flags & SCF_DO_SUBSTR) {
4427                         scan_commit(pRExC_state, data, minlenp, is_inf);
4428                         data->longest = &(data->longest_float);
4429                     }
4430                     is_inf = is_inf_internal = 1;
4431                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4432                         ssc_anything(data->start_class);
4433                     flags &= ~SCF_DO_STCLASS;
4434
4435                     start= NULL; /* reset start so we dont recurse later on. */
4436                 }
4437             } else {
4438                 paren = stopparen;
4439                 start = scan + 2;
4440                 end = regnext(scan);
4441             }
4442             if (start) {
4443                 scan_frame *newframe;
4444                 assert(end);
4445                 if (!RExC_frame_last) {
4446                     Newxz(newframe, 1, scan_frame);
4447                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4448                     RExC_frame_head= newframe;
4449                     RExC_frame_count++;
4450                 } else if (!RExC_frame_last->next_frame) {
4451                     Newxz(newframe,1,scan_frame);
4452                     RExC_frame_last->next_frame= newframe;
4453                     newframe->prev_frame= RExC_frame_last;
4454                     RExC_frame_count++;
4455                 } else {
4456                     newframe= RExC_frame_last->next_frame;
4457                 }
4458                 RExC_frame_last= newframe;
4459
4460                 newframe->next_regnode = regnext(scan);
4461                 newframe->last_regnode = last;
4462                 newframe->stopparen = stopparen;
4463                 newframe->prev_recursed_depth = recursed_depth;
4464                 newframe->this_prev_frame= frame;
4465
4466                 DEBUG_STUDYDATA("frame-new:",data,depth);
4467                 DEBUG_PEEP("fnew", scan, depth);
4468
4469                 frame = newframe;
4470                 scan =  start;
4471                 stopparen = paren;
4472                 last = end;
4473                 depth = depth + 1;
4474                 recursed_depth= my_recursed_depth;
4475
4476                 continue;
4477             }
4478         }
4479         else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4480             SSize_t l = STR_LEN(scan);
4481             UV uc;
4482             if (UTF) {
4483                 const U8 * const s = (U8*)STRING(scan);
4484                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4485                 l = utf8_length(s, s + l);
4486             } else {
4487                 uc = *((U8*)STRING(scan));
4488             }
4489             min += l;
4490             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4491                 /* The code below prefers earlier match for fixed
4492                    offset, later match for variable offset.  */
4493                 if (data->last_end == -1) { /* Update the start info. */
4494                     data->last_start_min = data->pos_min;
4495                     data->last_start_max = is_inf
4496                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4497                 }
4498                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4499                 if (UTF)
4500                     SvUTF8_on(data->last_found);
4501                 {
4502                     SV * const sv = data->last_found;
4503                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4504                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4505                     if (mg && mg->mg_len >= 0)
4506                         mg->mg_len += utf8_length((U8*)STRING(scan),
4507                                               (U8*)STRING(scan)+STR_LEN(scan));
4508                 }
4509                 data->last_end = data->pos_min + l;
4510                 data->pos_min += l; /* As in the first entry. */
4511                 data->flags &= ~SF_BEFORE_EOL;
4512             }
4513
4514             /* ANDing the code point leaves at most it, and not in locale, and
4515              * can't match null string */
4516             if (flags & SCF_DO_STCLASS_AND) {
4517                 ssc_cp_and(data->start_class, uc);
4518                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4519                 ssc_clear_locale(data->start_class);
4520             }
4521             else if (flags & SCF_DO_STCLASS_OR) {
4522                 ssc_add_cp(data->start_class, uc);
4523                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4524
4525                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4526                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4527             }
4528             flags &= ~SCF_DO_STCLASS;
4529         }
4530         else if (PL_regkind[OP(scan)] == EXACT) {
4531             /* But OP != EXACT!, so is EXACTFish */
4532             SSize_t l = STR_LEN(scan);
4533             const U8 * s = (U8*)STRING(scan);
4534
4535             /* Search for fixed substrings supports EXACT only. */
4536             if (flags & SCF_DO_SUBSTR) {
4537                 assert(data);
4538                 scan_commit(pRExC_state, data, minlenp, is_inf);
4539             }
4540             if (UTF) {
4541                 l = utf8_length(s, s + l);
4542             }
4543             if (unfolded_multi_char) {
4544                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4545             }
4546             min += l - min_subtract;
4547             assert (min >= 0);
4548             delta += min_subtract;
4549             if (flags & SCF_DO_SUBSTR) {
4550                 data->pos_min += l - min_subtract;
4551                 if (data->pos_min < 0) {
4552                     data->pos_min = 0;
4553                 }
4554                 data->pos_delta += min_subtract;
4555                 if (min_subtract) {
4556                     data->longest = &(data->longest_float);
4557                 }
4558             }
4559
4560             if (flags & SCF_DO_STCLASS) {
4561                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4562
4563                 assert(EXACTF_invlist);
4564                 if (flags & SCF_DO_STCLASS_AND) {
4565                     if (OP(scan) != EXACTFL)
4566                         ssc_clear_locale(data->start_class);
4567                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4568                     ANYOF_POSIXL_ZERO(data->start_class);
4569                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4570                 }
4571                 else {  /* SCF_DO_STCLASS_OR */
4572                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4573                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4574
4575                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4576                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4577                 }
4578                 flags &= ~SCF_DO_STCLASS;
4579                 SvREFCNT_dec(EXACTF_invlist);
4580             }
4581         }
4582         else if (REGNODE_VARIES(OP(scan))) {
4583             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4584             I32 fl = 0, f = flags;
4585             regnode * const oscan = scan;
4586             regnode_ssc this_class;
4587             regnode_ssc *oclass = NULL;
4588             I32 next_is_eval = 0;
4589
4590             switch (PL_regkind[OP(scan)]) {
4591             case WHILEM:                /* End of (?:...)* . */
4592                 scan = NEXTOPER(scan);
4593                 goto finish;
4594             case PLUS:
4595                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4596                     next = NEXTOPER(scan);
4597                     if (OP(next) == EXACT
4598                         || OP(next) == EXACTL
4599                         || (flags & SCF_DO_STCLASS))
4600                     {
4601                         mincount = 1;
4602                         maxcount = REG_INFTY;
4603                         next = regnext(scan);
4604                         scan = NEXTOPER(scan);
4605                         goto do_curly;
4606                     }
4607                 }
4608                 if (flags & SCF_DO_SUBSTR)
4609                     data->pos_min++;
4610                 min++;
4611                 /* FALLTHROUGH */
4612             case STAR:
4613                 if (flags & SCF_DO_STCLASS) {
4614                     mincount = 0;
4615                     maxcount = REG_INFTY;
4616                     next = regnext(scan);
4617                     scan = NEXTOPER(scan);
4618                     goto do_curly;
4619                 }
4620                 if (flags & SCF_DO_SUBSTR) {
4621                     scan_commit(pRExC_state, data, minlenp, is_inf);
4622                     /* Cannot extend fixed substrings */
4623                     data->longest = &(data->longest_float);
4624                 }
4625                 is_inf = is_inf_internal = 1;
4626                 scan = regnext(scan);
4627                 goto optimize_curly_tail;
4628             case CURLY:
4629                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4630                     && (scan->flags == stopparen))
4631                 {
4632                     mincount = 1;
4633                     maxcount = 1;
4634                 } else {
4635                     mincount = ARG1(scan);
4636                     maxcount = ARG2(scan);
4637                 }
4638                 next = regnext(scan);
4639                 if (OP(scan) == CURLYX) {
4640                     I32 lp = (data ? *(data->last_closep) : 0);
4641                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4642                 }
4643                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4644                 next_is_eval = (OP(scan) == EVAL);
4645               do_curly:
4646                 if (flags & SCF_DO_SUBSTR) {
4647                     if (mincount == 0)
4648                         scan_commit(pRExC_state, data, minlenp, is_inf);
4649                     /* Cannot extend fixed substrings */
4650                     pos_before = data->pos_min;
4651                 }
4652                 if (data) {
4653                     fl = data->flags;
4654                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4655                     if (is_inf)
4656                         data->flags |= SF_IS_INF;
4657                 }
4658                 if (flags & SCF_DO_STCLASS) {
4659                     ssc_init(pRExC_state, &this_class);
4660                     oclass = data->start_class;
4661                     data->start_class = &this_class;
4662                     f |= SCF_DO_STCLASS_AND;
4663                     f &= ~SCF_DO_STCLASS_OR;
4664                 }
4665                 /* Exclude from super-linear cache processing any {n,m}
4666                    regops for which the combination of input pos and regex
4667                    pos is not enough information to determine if a match
4668                    will be possible.
4669
4670                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4671                    regex pos at the \s*, the prospects for a match depend not
4672                    only on the input position but also on how many (bar\s*)
4673                    repeats into the {4,8} we are. */
4674                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4675                     f &= ~SCF_WHILEM_VISITED_POS;
4676
4677                 /* This will finish on WHILEM, setting scan, or on NULL: */
4678                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4679                                   last, data, stopparen, recursed_depth, NULL,
4680                                   (mincount == 0
4681                                    ? (f & ~SCF_DO_SUBSTR)
4682                                    : f)
4683                                   ,depth+1);
4684
4685                 if (flags & SCF_DO_STCLASS)
4686                     data->start_class = oclass;
4687                 if (mincount == 0 || minnext == 0) {
4688                     if (flags & SCF_DO_STCLASS_OR) {
4689                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4690                     }
4691                     else if (flags & SCF_DO_STCLASS_AND) {
4692                         /* Switch to OR mode: cache the old value of
4693                          * data->start_class */
4694                         INIT_AND_WITHP;
4695                         StructCopy(data->start_class, and_withp, regnode_ssc);
4696                         flags &= ~SCF_DO_STCLASS_AND;
4697                         StructCopy(&this_class, data->start_class, regnode_ssc);
4698                         flags |= SCF_DO_STCLASS_OR;
4699                         ANYOF_FLAGS(data->start_class)
4700                                                 |= SSC_MATCHES_EMPTY_STRING;
4701                     }
4702                 } else {                /* Non-zero len */
4703                     if (flags & SCF_DO_STCLASS_OR) {
4704                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4705                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4706                     }
4707                     else if (flags & SCF_DO_STCLASS_AND)
4708                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4709                     flags &= ~SCF_DO_STCLASS;
4710                 }
4711                 if (!scan)              /* It was not CURLYX, but CURLY. */
4712                     scan = next;
4713                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4714                     /* ? quantifier ok, except for (?{ ... }) */
4715                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4716                     && (minnext == 0) && (deltanext == 0)
4717                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4718                     && maxcount <= REG_INFTY/3) /* Complement check for big
4719                                                    count */
4720                 {
4721                     /* Fatal warnings may leak the regexp without this: */
4722                     SAVEFREESV(RExC_rx_sv);
4723                     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4724                         "Quantifier unexpected on zero-length expression "
4725                         "in regex m/%"UTF8f"/",
4726                          UTF8fARG(UTF, RExC_end - RExC_precomp,
4727                                   RExC_precomp));
4728                     (void)ReREFCNT_inc(RExC_rx_sv);
4729                 }
4730
4731                 min += minnext * mincount;
4732                 is_inf_internal |= deltanext == SSize_t_MAX
4733                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4734                 is_inf |= is_inf_internal;
4735                 if (is_inf) {
4736                     delta = SSize_t_MAX;
4737                 } else {
4738                     delta += (minnext + deltanext) * maxcount
4739                              - minnext * mincount;
4740                 }
4741                 /* Try powerful optimization CURLYX => CURLYN. */
4742                 if (  OP(oscan) == CURLYX && data
4743                       && data->flags & SF_IN_PAR
4744                       && !(data->flags & SF_HAS_EVAL)
4745                       && !deltanext && minnext == 1 ) {
4746                     /* Try to optimize to CURLYN.  */
4747                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4748                     regnode * const nxt1 = nxt;
4749 #ifdef DEBUGGING
4750                     regnode *nxt2;
4751 #endif
4752
4753                     /* Skip open. */
4754                     nxt = regnext(nxt);
4755                     if (!REGNODE_SIMPLE(OP(nxt))
4756                         && !(PL_regkind[OP(nxt)] == EXACT
4757                              && STR_LEN(nxt) == 1))
4758                         goto nogo;
4759 #ifdef DEBUGGING
4760                     nxt2 = nxt;
4761 #endif
4762                     nxt = regnext(nxt);
4763                     if (OP(nxt) != CLOSE)
4764                         goto nogo;
4765                     if (RExC_open_parens) {
4766                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4767                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4768                     }
4769                     /* Now we know that nxt2 is the only contents: */
4770                     oscan->flags = (U8)ARG(nxt);
4771                     OP(oscan) = CURLYN;
4772                     OP(nxt1) = NOTHING; /* was OPEN. */
4773
4774 #ifdef DEBUGGING
4775                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4776                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4777                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4778                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4779                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4780                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4781 #endif
4782                 }
4783               nogo:
4784
4785                 /* Try optimization CURLYX => CURLYM. */
4786                 if (  OP(oscan) == CURLYX && data
4787                       && !(data->flags & SF_HAS_PAR)
4788                       && !(data->flags & SF_HAS_EVAL)
4789                       && !deltanext     /* atom is fixed width */
4790                       && minnext != 0   /* CURLYM can't handle zero width */
4791
4792                          /* Nor characters whose fold at run-time may be
4793                           * multi-character */
4794                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4795                 ) {
4796                     /* XXXX How to optimize if data == 0? */
4797                     /* Optimize to a simpler form.  */
4798                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4799                     regnode *nxt2;
4800
4801                     OP(oscan) = CURLYM;
4802                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4803                             && (OP(nxt2) != WHILEM))
4804                         nxt = nxt2;
4805                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4806                     /* Need to optimize away parenths. */
4807                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4808                         /* Set the parenth number.  */
4809                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4810
4811                         oscan->flags = (U8)ARG(nxt);
4812                         if (RExC_open_parens) {
4813                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4814                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4815                         }
4816                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4817                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4818
4819 #ifdef DEBUGGING
4820                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4821                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4822                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4823                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4824 #endif
4825 #if 0
4826                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4827                             regnode *nnxt = regnext(nxt1);
4828                             if (nnxt == nxt) {
4829                                 if (reg_off_by_arg[OP(nxt1)])
4830                                     ARG_SET(nxt1, nxt2 - nxt1);
4831                                 else if (nxt2 - nxt1 < U16_MAX)
4832                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4833                                 else
4834                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4835                             }
4836                             nxt1 = nnxt;
4837                         }
4838 #endif
4839                         /* Optimize again: */
4840                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4841                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4842                     }
4843                     else
4844                         oscan->flags = 0;
4845                 }
4846                 else if ((OP(oscan) == CURLYX)
4847                          && (flags & SCF_WHILEM_VISITED_POS)
4848                          /* See the comment on a similar expression above.
4849                             However, this time it's not a subexpression
4850                             we care about, but the expression itself. */
4851                          && (maxcount == REG_INFTY)
4852                          && data && ++data->whilem_c < 16) {
4853                     /* This stays as CURLYX, we can put the count/of pair. */
4854                     /* Find WHILEM (as in regexec.c) */
4855                     regnode *nxt = oscan + NEXT_OFF(oscan);
4856
4857                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4858                         nxt += ARG(nxt);
4859                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4860                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4861                 }
4862                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4863                     pars++;
4864                 if (flags & SCF_DO_SUBSTR) {
4865                     SV *last_str = NULL;
4866                     STRLEN last_chrs = 0;
4867                     int counted = mincount != 0;
4868
4869                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4870                                                                   string. */
4871                         SSize_t b = pos_before >= data->last_start_min
4872                             ? pos_before : data->last_start_min;
4873                         STRLEN l;
4874                         const char * const s = SvPV_const(data->last_found, l);
4875                         SSize_t old = b - data->last_start_min;
4876
4877                         if (UTF)
4878                             old = utf8_hop((U8*)s, old) - (U8*)s;
4879                         l -= old;
4880                         /* Get the added string: */
4881                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4882                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4883                                             (U8*)(s + old + l)) : l;
4884                         if (deltanext == 0 && pos_before == b) {
4885                             /* What was added is a constant string */
4886                             if (mincount > 1) {
4887
4888                                 SvGROW(last_str, (mincount * l) + 1);
4889                                 repeatcpy(SvPVX(last_str) + l,
4890                                           SvPVX_const(last_str), l,
4891                                           mincount - 1);
4892                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4893                                 /* Add additional parts. */
4894                                 SvCUR_set(data->last_found,
4895                                           SvCUR(data->last_found) - l);
4896                                 sv_catsv(data->last_found, last_str);
4897                                 {
4898                                     SV * sv = data->last_found;
4899                                     MAGIC *mg =
4900                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4901                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4902                                     if (mg && mg->mg_len >= 0)
4903                                         mg->mg_len += last_chrs * (mincount-1);
4904                                 }
4905                                 last_chrs *= mincount;
4906                                 data->last_end += l * (mincount - 1);
4907                             }
4908                         } else {
4909                             /* start offset must point into the last copy */
4910                             data->last_start_min += minnext * (mincount - 1);
4911                             data->last_start_max =
4912                               is_inf
4913                                ? SSize_t_MAX
4914                                : data->last_start_max +
4915                                  (maxcount - 1) * (minnext + data->pos_delta);
4916                         }
4917                     }
4918                     /* It is counted once already... */
4919                     data->pos_min += minnext * (mincount - counted);
4920 #if 0
4921 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4922                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4923                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
4924     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4925     (UV)mincount);
4926 if (deltanext != SSize_t_MAX)
4927 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4928     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4929           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4930 #endif
4931                     if (deltanext == SSize_t_MAX
4932                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4933                         data->pos_delta = SSize_t_MAX;
4934                     else
4935                         data->pos_delta += - counted * deltanext +
4936                         (minnext + deltanext) * maxcount - minnext * mincount;
4937                     if (mincount != maxcount) {
4938                          /* Cannot extend fixed substrings found inside
4939                             the group.  */
4940                         scan_commit(pRExC_state, data, minlenp, is_inf);
4941                         if (mincount && last_str) {
4942                             SV * const sv = data->last_found;
4943                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4944                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4945
4946                             if (mg)
4947                                 mg->mg_len = -1;
4948                             sv_setsv(sv, last_str);
4949                             data->last_end = data->pos_min;
4950                             data->last_start_min = data->pos_min - last_chrs;
4951                             data->last_start_max = is_inf
4952                                 ? SSize_t_MAX
4953                                 : data->pos_min + data->pos_delta - last_chrs;
4954                         }
4955                         data->longest = &(data->longest_float);
4956                     }
4957                     SvREFCNT_dec(last_str);
4958                 }
4959                 if (data && (fl & SF_HAS_EVAL))
4960                     data->flags |= SF_HAS_EVAL;
4961               optimize_curly_tail:
4962                 if (OP(oscan) != CURLYX) {
4963                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4964                            && NEXT_OFF(next))
4965                         NEXT_OFF(oscan) += NEXT_OFF(next);
4966                 }
4967                 continue;
4968
4969             default:
4970 #ifdef DEBUGGING
4971                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4972                                                                     OP(scan));
4973 #endif
4974             case REF:
4975             case CLUMP:
4976                 if (flags & SCF_DO_SUBSTR) {
4977                     /* Cannot expect anything... */
4978                     scan_commit(pRExC_state, data, minlenp, is_inf);
4979                     data->longest = &(data->longest_float);
4980                 }
4981                 is_inf = is_inf_internal = 1;
4982                 if (flags & SCF_DO_STCLASS_OR) {
4983                     if (OP(scan) == CLUMP) {
4984                         /* Actually is any start char, but very few code points
4985                          * aren't start characters */
4986                         ssc_match_all_cp(data->start_class);
4987                     }
4988                     else {
4989                         ssc_anything(data->start_class);
4990                     }
4991                 }
4992                 flags &= ~SCF_DO_STCLASS;
4993                 break;
4994             }
4995         }
4996         else if (OP(scan) == LNBREAK) {
4997             if (flags & SCF_DO_STCLASS) {
4998                 if (flags & SCF_DO_STCLASS_AND) {
4999                     ssc_intersection(data->start_class,
5000                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5001                     ssc_clear_locale(data->start_class);
5002                     ANYOF_FLAGS(data->start_class)
5003                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5004                 }
5005                 else if (flags & SCF_DO_STCLASS_OR) {
5006                     ssc_union(data->start_class,
5007                               PL_XPosix_ptrs[_CC_VERTSPACE],
5008                               FALSE);
5009                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5010
5011                     /* See commit msg for
5012                      * 749e076fceedeb708a624933726e7989f2302f6a */
5013                     ANYOF_FLAGS(data->start_class)
5014                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5015                 }
5016                 flags &= ~SCF_DO_STCLASS;
5017             }
5018             min++;
5019             if (delta != SSize_t_MAX)
5020                 delta++;    /* Because of the 2 char string cr-lf */
5021             if (flags & SCF_DO_SUBSTR) {
5022                 /* Cannot expect anything... */
5023                 scan_commit(pRExC_state, data, minlenp, is_inf);
5024                 data->pos_min += 1;
5025                 data->pos_delta += 1;
5026                 data->longest = &(data->longest_float);
5027             }
5028         }
5029         else if (REGNODE_SIMPLE(OP(scan))) {
5030
5031             if (flags & SCF_DO_SUBSTR) {
5032                 scan_commit(pRExC_state, data, minlenp, is_inf);
5033                 data->pos_min++;
5034             }
5035             min++;
5036             if (flags & SCF_DO_STCLASS) {
5037                 bool invert = 0;
5038                 SV* my_invlist = NULL;
5039                 U8 namedclass;
5040
5041                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5042                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5043
5044                 /* Some of the logic below assumes that switching
5045                    locale on will only add false positives. */
5046                 switch (OP(scan)) {
5047
5048                 default:
5049 #ifdef DEBUGGING
5050                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5051                                                                      OP(scan));
5052 #endif
5053                 case CANY:
5054                 case SANY:
5055                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5056                         ssc_match_all_cp(data->start_class);
5057                     break;
5058
5059                 case REG_ANY:
5060                     {
5061                         SV* REG_ANY_invlist = _new_invlist(2);
5062                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5063                                                             '\n');
5064                         if (flags & SCF_DO_STCLASS_OR) {
5065                             ssc_union(data->start_class,
5066                                       REG_ANY_invlist,
5067                                       TRUE /* TRUE => invert, hence all but \n
5068                                             */
5069                                       );
5070                         }
5071                         else if (flags & SCF_DO_STCLASS_AND) {
5072                             ssc_intersection(data->start_class,
5073                                              REG_ANY_invlist,
5074                                              TRUE  /* TRUE => invert */
5075                                              );
5076                             ssc_clear_locale(data->start_class);
5077                         }
5078                         SvREFCNT_dec_NN(REG_ANY_invlist);
5079                     }
5080                     break;
5081
5082                 case ANYOFL:
5083                 case ANYOF:
5084                     if (flags & SCF_DO_STCLASS_AND)
5085                         ssc_and(pRExC_state, data->start_class,
5086                                 (regnode_charclass *) scan);
5087                     else
5088                         ssc_or(pRExC_state, data->start_class,
5089                                                           (regnode_charclass *) scan);
5090                     break;
5091
5092                 case NPOSIXL:
5093                     invert = 1;
5094                     /* FALLTHROUGH */
5095
5096                 case POSIXL:
5097                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5098                     if (flags & SCF_DO_STCLASS_AND) {
5099                         bool was_there = cBOOL(
5100                                           ANYOF_POSIXL_TEST(data->start_class,
5101                                                                  namedclass));
5102                         ANYOF_POSIXL_ZERO(data->start_class);
5103                         if (was_there) {    /* Do an AND */
5104                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5105                         }
5106                         /* No individual code points can now match */
5107                         data->start_class->invlist
5108                                                 = sv_2mortal(_new_invlist(0));
5109                     }
5110                     else {
5111                         int complement = namedclass + ((invert) ? -1 : 1);
5112
5113                         assert(flags & SCF_DO_STCLASS_OR);
5114
5115                         /* If the complement of this class was already there,
5116                          * the result is that they match all code points,
5117                          * (\d + \D == everything).  Remove the classes from
5118                          * future consideration.  Locale is not relevant in
5119                          * this case */
5120                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5121                             ssc_match_all_cp(data->start_class);
5122                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5123                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5124                         }
5125                         else {  /* The usual case; just add this class to the
5126                                    existing set */
5127                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5128                         }
5129                     }
5130                     break;
5131
5132                 case NPOSIXA:   /* For these, we always know the exact set of
5133                                    what's matched */
5134                     invert = 1;
5135                     /* FALLTHROUGH */
5136                 case POSIXA:
5137                     if (FLAGS(scan) == _CC_ASCII) {
5138                         my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5139                     }
5140                     else {
5141                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5142                                               PL_XPosix_ptrs[_CC_ASCII],
5143                                               &my_invlist);
5144                     }
5145                     goto join_posix;
5146
5147                 case NPOSIXD:
5148                 case NPOSIXU:
5149                     invert = 1;
5150                     /* FALLTHROUGH */
5151                 case POSIXD:
5152                 case POSIXU:
5153                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5154
5155                     /* NPOSIXD matches all upper Latin1 code points unless the
5156                      * target string being matched is UTF-8, which is
5157                      * unknowable until match time.  Since we are going to
5158                      * invert, we want to get rid of all of them so that the
5159                      * inversion will match all */
5160                     if (OP(scan) == NPOSIXD) {
5161                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5162                                           &my_invlist);
5163                     }
5164
5165                   join_posix:
5166
5167                     if (flags & SCF_DO_STCLASS_AND) {
5168                         ssc_intersection(data->start_class, my_invlist, invert);
5169                         ssc_clear_locale(data->start_class);
5170                     }
5171                     else {
5172                         assert(flags & SCF_DO_STCLASS_OR);
5173                         ssc_union(data->start_class, my_invlist, invert);
5174                     }
5175                     SvREFCNT_dec(my_invlist);
5176                 }
5177                 if (flags & SCF_DO_STCLASS_OR)
5178                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5179                 flags &= ~SCF_DO_STCLASS;
5180             }
5181         }
5182         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5183             data->flags |= (OP(scan) == MEOL
5184                             ? SF_BEFORE_MEOL
5185                             : SF_BEFORE_SEOL);
5186             scan_commit(pRExC_state, data, minlenp, is_inf);
5187
5188         }
5189         else if (  PL_regkind[OP(scan)] == BRANCHJ
5190                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5191                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5192                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5193         {
5194             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5195                 || OP(scan) == UNLESSM )
5196             {
5197                 /* Negative Lookahead/lookbehind
5198                    In this case we can't do fixed string optimisation.
5199                 */
5200
5201                 SSize_t deltanext, minnext, fake = 0;
5202                 regnode *nscan;
5203                 regnode_ssc intrnl;
5204                 int f = 0;
5205
5206                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5207                 if (data) {
5208                     data_fake.whilem_c = data->whilem_c;
5209                     data_fake.last_closep = data->last_closep;
5210                 }
5211                 else
5212                     data_fake.last_closep = &fake;
5213                 data_fake.pos_delta = delta;
5214                 if ( flags & SCF_DO_STCLASS && !scan->flags
5215                      && OP(scan) == IFMATCH ) { /* Lookahead */
5216                     ssc_init(pRExC_state, &intrnl);
5217                     data_fake.start_class = &intrnl;
5218                     f |= SCF_DO_STCLASS_AND;
5219                 }
5220                 if (flags & SCF_WHILEM_VISITED_POS)
5221                     f |= SCF_WHILEM_VISITED_POS;
5222                 next = regnext(scan);
5223                 nscan = NEXTOPER(NEXTOPER(scan));
5224                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5225                                       last, &data_fake, stopparen,
5226                                       recursed_depth, NULL, f, depth+1);
5227                 if (scan->flags) {
5228                     if (deltanext) {
5229                         FAIL("Variable length lookbehind not implemented");
5230                     }
5231                     else if (minnext > (I32)U8_MAX) {
5232                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5233                               (UV)U8_MAX);
5234                     }
5235                     scan->flags = (U8)minnext;
5236                 }
5237                 if (data) {
5238                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5239                         pars++;
5240                     if (data_fake.flags & SF_HAS_EVAL)
5241                         data->flags |= SF_HAS_EVAL;
5242                     data->whilem_c = data_fake.whilem_c;
5243                 }
5244                 if (f & SCF_DO_STCLASS_AND) {
5245                     if (flags & SCF_DO_STCLASS_OR) {
5246                         /* OR before, AND after: ideally we would recurse with
5247                          * data_fake to get the AND applied by study of the
5248                          * remainder of the pattern, and then derecurse;
5249                          * *** HACK *** for now just treat as "no information".
5250                          * See [perl #56690].
5251                          */
5252                         ssc_init(pRExC_state, data->start_class);
5253                     }  else {
5254                         /* AND before and after: combine and continue.  These
5255                          * assertions are zero-length, so can match an EMPTY
5256                          * string */
5257                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5258                         ANYOF_FLAGS(data->start_class)
5259                                                    |= SSC_MATCHES_EMPTY_STRING;
5260                     }
5261                 }
5262             }
5263 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5264             else {
5265                 /* Positive Lookahead/lookbehind
5266                    In this case we can do fixed string optimisation,
5267                    but we must be careful about it. Note in the case of
5268                    lookbehind the positions will be offset by the minimum
5269                    length of the pattern, something we won't know about
5270                    until after the recurse.
5271                 */
5272                 SSize_t deltanext, fake = 0;
5273                 regnode *nscan;
5274                 regnode_ssc intrnl;
5275                 int f = 0;
5276                 /* We use SAVEFREEPV so that when the full compile
5277                     is finished perl will clean up the allocated
5278                     minlens when it's all done. This way we don't
5279                     have to worry about freeing them when we know
5280                     they wont be used, which would be a pain.
5281                  */
5282                 SSize_t *minnextp;
5283                 Newx( minnextp, 1, SSize_t );
5284                 SAVEFREEPV(minnextp);
5285
5286                 if (data) {
5287                     StructCopy(data, &data_fake, scan_data_t);
5288                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5289                         f |= SCF_DO_SUBSTR;
5290                         if (scan->flags)
5291                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5292                         data_fake.last_found=newSVsv(data->last_found);
5293                     }
5294                 }
5295                 else
5296                     data_fake.last_closep = &fake;
5297                 data_fake.flags = 0;
5298                 data_fake.pos_delta = delta;
5299                 if (is_inf)
5300                     data_fake.flags |= SF_IS_INF;
5301                 if ( flags & SCF_DO_STCLASS && !scan->flags
5302                      && OP(scan) == IFMATCH ) { /* Lookahead */
5303                     ssc_init(pRExC_state, &intrnl);
5304                     data_fake.start_class = &intrnl;
5305                     f |= SCF_DO_STCLASS_AND;
5306                 }
5307                 if (flags & SCF_WHILEM_VISITED_POS)
5308                     f |= SCF_WHILEM_VISITED_POS;
5309                 next = regnext(scan);
5310                 nscan = NEXTOPER(NEXTOPER(scan));
5311
5312                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5313                                         &deltanext, last, &data_fake,
5314                                         stopparen, recursed_depth, NULL,
5315                                         f,depth+1);
5316                 if (scan->flags) {
5317                     if (deltanext) {
5318                         FAIL("Variable length lookbehind not implemented");
5319                     }
5320                     else if (*minnextp > (I32)U8_MAX) {
5321                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5322                               (UV)U8_MAX);
5323                     }
5324                     scan->flags = (U8)*minnextp;
5325                 }
5326
5327                 *minnextp += min;
5328
5329                 if (f & SCF_DO_STCLASS_AND) {
5330                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5331                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5332                 }
5333                 if (data) {
5334                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5335                         pars++;
5336                     if (data_fake.flags & SF_HAS_EVAL)
5337                         data->flags |= SF_HAS_EVAL;
5338                     data->whilem_c = data_fake.whilem_c;
5339                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5340                         if (RExC_rx->minlen<*minnextp)
5341                             RExC_rx->minlen=*minnextp;
5342                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5343                         SvREFCNT_dec_NN(data_fake.last_found);
5344
5345                         if ( data_fake.minlen_fixed != minlenp )
5346                         {
5347                             data->offset_fixed= data_fake.offset_fixed;
5348                             data->minlen_fixed= data_fake.minlen_fixed;
5349                             data->lookbehind_fixed+= scan->flags;
5350                         }
5351                         if ( data_fake.minlen_float != minlenp )
5352                         {
5353                             data->minlen_float= data_fake.minlen_float;
5354                             data->offset_float_min=data_fake.offset_float_min;
5355                             data->offset_float_max=data_fake.offset_float_max;
5356                             data->lookbehind_float+= scan->flags;
5357                         }
5358                     }
5359                 }
5360             }
5361 #endif
5362         }
5363         else if (OP(scan) == OPEN) {
5364             if (stopparen != (I32)ARG(scan))
5365                 pars++;
5366         }
5367         else if (OP(scan) == CLOSE) {
5368             if (stopparen == (I32)ARG(scan)) {
5369                 break;
5370             }
5371             if ((I32)ARG(scan) == is_par) {
5372                 next = regnext(scan);
5373
5374                 if ( next && (OP(next) != WHILEM) && next < last)
5375                     is_par = 0;         /* Disable optimization */
5376             }
5377             if (data)
5378                 *(data->last_closep) = ARG(scan);
5379         }
5380         else if (OP(scan) == EVAL) {
5381                 if (data)
5382                     data->flags |= SF_HAS_EVAL;
5383         }
5384         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5385             if (flags & SCF_DO_SUBSTR) {
5386                 scan_commit(pRExC_state, data, minlenp, is_inf);
5387                 flags &= ~SCF_DO_SUBSTR;
5388             }
5389             if (data && OP(scan)==ACCEPT) {
5390                 data->flags |= SCF_SEEN_ACCEPT;
5391                 if (stopmin > min)
5392                     stopmin = min;
5393             }
5394         }
5395         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5396         {
5397                 if (flags & SCF_DO_SUBSTR) {
5398                     scan_commit(pRExC_state, data, minlenp, is_inf);
5399                     data->longest = &(data->longest_float);
5400                 }
5401                 is_inf = is_inf_internal = 1;
5402                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5403                     ssc_anything(data->start_class);
5404                 flags &= ~SCF_DO_STCLASS;
5405         }
5406         else if (OP(scan) == GPOS) {
5407             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5408                 !(delta || is_inf || (data && data->pos_delta)))
5409             {
5410                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5411                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5412                 if (RExC_rx->gofs < (STRLEN)min)
5413                     RExC_rx->gofs = min;
5414             } else {
5415                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5416                 RExC_rx->gofs = 0;
5417             }
5418         }
5419 #ifdef TRIE_STUDY_OPT
5420 #ifdef FULL_TRIE_STUDY
5421         else if (PL_regkind[OP(scan)] == TRIE) {
5422             /* NOTE - There is similar code to this block above for handling
5423                BRANCH nodes on the initial study.  If you change stuff here
5424                check there too. */
5425             regnode *trie_node= scan;
5426             regnode *tail= regnext(scan);
5427             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5428             SSize_t max1 = 0, min1 = SSize_t_MAX;
5429             regnode_ssc accum;
5430
5431             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5432                 /* Cannot merge strings after this. */
5433                 scan_commit(pRExC_state, data, minlenp, is_inf);
5434             }
5435             if (flags & SCF_DO_STCLASS)
5436                 ssc_init_zero(pRExC_state, &accum);
5437
5438             if (!trie->jump) {
5439                 min1= trie->minlen;
5440                 max1= trie->maxlen;
5441             } else {
5442                 const regnode *nextbranch= NULL;
5443                 U32 word;
5444
5445                 for ( word=1 ; word <= trie->wordcount ; word++)
5446                 {
5447                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5448                     regnode_ssc this_class;
5449
5450                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5451                     if (data) {
5452                         data_fake.whilem_c = data->whilem_c;
5453                         data_fake.last_closep = data->last_closep;
5454                     }
5455                     else
5456                         data_fake.last_closep = &fake;
5457                     data_fake.pos_delta = delta;
5458                     if (flags & SCF_DO_STCLASS) {
5459                         ssc_init(pRExC_state, &this_class);
5460                         data_fake.start_class = &this_class;
5461                         f = SCF_DO_STCLASS_AND;
5462                     }
5463                     if (flags & SCF_WHILEM_VISITED_POS)
5464                         f |= SCF_WHILEM_VISITED_POS;
5465
5466                     if (trie->jump[word]) {
5467                         if (!nextbranch)
5468                             nextbranch = trie_node + trie->jump[0];
5469                         scan= trie_node + trie->jump[word];
5470                         /* We go from the jump point to the branch that follows
5471                            it. Note this means we need the vestigal unused
5472                            branches even though they arent otherwise used. */
5473                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5474                             &deltanext, (regnode *)nextbranch, &data_fake,
5475                             stopparen, recursed_depth, NULL, f,depth+1);
5476                     }
5477                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5478                         nextbranch= regnext((regnode*)nextbranch);
5479
5480                     if (min1 > (SSize_t)(minnext + trie->minlen))
5481                         min1 = minnext + trie->minlen;
5482                     if (deltanext == SSize_t_MAX) {
5483                         is_inf = is_inf_internal = 1;
5484                         max1 = SSize_t_MAX;
5485                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5486                         max1 = minnext + deltanext + trie->maxlen;
5487
5488                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5489                         pars++;
5490                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5491                         if ( stopmin > min + min1)
5492                             stopmin = min + min1;
5493                         flags &= ~SCF_DO_SUBSTR;
5494                         if (data)
5495                             data->flags |= SCF_SEEN_ACCEPT;
5496                     }
5497                     if (data) {
5498                         if (data_fake.flags & SF_HAS_EVAL)
5499                             data->flags |= SF_HAS_EVAL;
5500                         data->whilem_c = data_fake.whilem_c;
5501                     }
5502                     if (flags & SCF_DO_STCLASS)
5503                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5504                 }
5505             }
5506             if (flags & SCF_DO_SUBSTR) {
5507                 data->pos_min += min1;
5508                 data->pos_delta += max1 - min1;
5509                 if (max1 != min1 || is_inf)
5510                     data->longest = &(data->longest_float);
5511             }
5512             min += min1;
5513             if (delta != SSize_t_MAX)
5514                 delta += max1 - min1;
5515             if (flags & SCF_DO_STCLASS_OR) {
5516                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5517                 if (min1) {
5518                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5519                     flags &= ~SCF_DO_STCLASS;
5520                 }
5521             }
5522             else if (flags & SCF_DO_STCLASS_AND) {
5523                 if (min1) {
5524                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5525                     flags &= ~SCF_DO_STCLASS;
5526                 }
5527                 else {
5528                     /* Switch to OR mode: cache the old value of
5529                      * data->start_class */
5530                     INIT_AND_WITHP;
5531                     StructCopy(data->start_class, and_withp, regnode_ssc);
5532                     flags &= ~SCF_DO_STCLASS_AND;
5533                     StructCopy(&accum, data->start_class, regnode_ssc);
5534                     flags |= SCF_DO_STCLASS_OR;
5535                 }
5536             }
5537             scan= tail;
5538             continue;
5539         }
5540 #else
5541         else if (PL_regkind[OP(scan)] == TRIE) {
5542             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5543             U8*bang=NULL;
5544
5545             min += trie->minlen;
5546             delta += (trie->maxlen - trie->minlen);
5547             flags &= ~SCF_DO_STCLASS; /* xxx */
5548             if (flags & SCF_DO_SUBSTR) {
5549                 /* Cannot expect anything... */
5550                 scan_commit(pRExC_state, data, minlenp, is_inf);
5551                 data->pos_min += trie->minlen;
5552                 data->pos_delta += (trie->maxlen - trie->minlen);
5553                 if (trie->maxlen != trie->minlen)
5554                     data->longest = &(data->longest_float);
5555             }
5556             if (trie->jump) /* no more substrings -- for now /grr*/
5557                flags &= ~SCF_DO_SUBSTR;
5558         }
5559 #endif /* old or new */
5560 #endif /* TRIE_STUDY_OPT */
5561
5562         /* Else: zero-length, ignore. */
5563         scan = regnext(scan);
5564     }
5565     /* If we are exiting a recursion we can unset its recursed bit
5566      * and allow ourselves to enter it again - no danger of an
5567      * infinite loop there.
5568     if (stopparen > -1 && recursed) {
5569         DEBUG_STUDYDATA("unset:", data,depth);
5570         PAREN_UNSET( recursed, stopparen);
5571     }
5572     */
5573     if (frame) {
5574         depth = depth - 1;
5575
5576         DEBUG_STUDYDATA("frame-end:",data,depth);
5577         DEBUG_PEEP("fend", scan, depth);
5578
5579         /* restore previous context */
5580         last = frame->last_regnode;
5581         scan = frame->next_regnode;
5582         stopparen = frame->stopparen;
5583         recursed_depth = frame->prev_recursed_depth;
5584
5585         RExC_frame_last = frame->prev_frame;
5586         frame = frame->this_prev_frame;
5587         goto fake_study_recurse;
5588     }
5589
5590   finish:
5591     assert(!frame);
5592     DEBUG_STUDYDATA("pre-fin:",data,depth);
5593
5594     *scanp = scan;
5595     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5596
5597     if (flags & SCF_DO_SUBSTR && is_inf)
5598         data->pos_delta = SSize_t_MAX - data->pos_min;
5599     if (is_par > (I32)U8_MAX)
5600         is_par = 0;
5601     if (is_par && pars==1 && data) {
5602         data->flags |= SF_IN_PAR;
5603         data->flags &= ~SF_HAS_PAR;
5604     }
5605     else if (pars && data) {
5606         data->flags |= SF_HAS_PAR;
5607         data->flags &= ~SF_IN_PAR;
5608     }
5609     if (flags & SCF_DO_STCLASS_OR)
5610         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5611     if (flags & SCF_TRIE_RESTUDY)
5612         data->flags |=  SCF_TRIE_RESTUDY;
5613
5614     DEBUG_STUDYDATA("post-fin:",data,depth);
5615
5616     {
5617         SSize_t final_minlen= min < stopmin ? min : stopmin;
5618
5619         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5620             if (final_minlen > SSize_t_MAX - delta)
5621                 RExC_maxlen = SSize_t_MAX;
5622             else if (RExC_maxlen < final_minlen + delta)
5623                 RExC_maxlen = final_minlen + delta;
5624         }
5625         return final_minlen;
5626     }
5627     NOT_REACHED;
5628 }
5629
5630 STATIC U32
5631 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5632 {
5633     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5634
5635     PERL_ARGS_ASSERT_ADD_DATA;
5636
5637     Renewc(RExC_rxi->data,
5638            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5639            char, struct reg_data);
5640     if(count)
5641         Renew(RExC_rxi->data->what, count + n, U8);
5642     else
5643         Newx(RExC_rxi->data->what, n, U8);
5644     RExC_rxi->data->count = count + n;
5645     Copy(s, RExC_rxi->data->what + count, n, U8);
5646     return count;
5647 }
5648
5649 /*XXX: todo make this not included in a non debugging perl, but appears to be
5650  * used anyway there, in 'use re' */
5651 #ifndef PERL_IN_XSUB_RE
5652 void
5653 Perl_reginitcolors(pTHX)
5654 {
5655     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5656     if (s) {
5657         char *t = savepv(s);
5658         int i = 0;
5659         PL_colors[0] = t;
5660         while (++i < 6) {
5661             t = strchr(t, '\t');
5662             if (t) {
5663                 *t = '\0';
5664                 PL_colors[i] = ++t;
5665             }
5666             else
5667                 PL_colors[i] = t = (char *)"";
5668         }
5669     } else {
5670         int i = 0;
5671         while (i < 6)
5672             PL_colors[i++] = (char *)"";
5673     }
5674     PL_colorset = 1;
5675 }
5676 #endif
5677
5678
5679 #ifdef TRIE_STUDY_OPT
5680 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5681     STMT_START {                                            \
5682         if (                                                \
5683               (data.flags & SCF_TRIE_RESTUDY)               \
5684               && ! restudied++                              \
5685         ) {                                                 \
5686             dOsomething;                                    \
5687             goto reStudy;                                   \
5688         }                                                   \
5689     } STMT_END
5690 #else
5691 #define CHECK_RESTUDY_GOTO_butfirst
5692 #endif
5693
5694 /*
5695  * pregcomp - compile a regular expression into internal code
5696  *
5697  * Decides which engine's compiler to call based on the hint currently in
5698  * scope
5699  */
5700
5701 #ifndef PERL_IN_XSUB_RE
5702
5703 /* return the currently in-scope regex engine (or the default if none)  */
5704
5705 regexp_engine const *
5706 Perl_current_re_engine(pTHX)
5707 {
5708     if (IN_PERL_COMPILETIME) {
5709         HV * const table = GvHV(PL_hintgv);
5710         SV **ptr;
5711
5712         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5713             return &PL_core_reg_engine;
5714         ptr = hv_fetchs(table, "regcomp", FALSE);
5715         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5716             return &PL_core_reg_engine;
5717         return INT2PTR(regexp_engine*,SvIV(*ptr));
5718     }
5719     else {
5720         SV *ptr;
5721         if (!PL_curcop->cop_hints_hash)
5722             return &PL_core_reg_engine;
5723         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5724         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5725             return &PL_core_reg_engine;
5726         return INT2PTR(regexp_engine*,SvIV(ptr));
5727     }
5728 }
5729
5730
5731 REGEXP *
5732 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5733 {
5734     regexp_engine const *eng = current_re_engine();
5735     GET_RE_DEBUG_FLAGS_DECL;
5736
5737     PERL_ARGS_ASSERT_PREGCOMP;
5738
5739     /* Dispatch a request to compile a regexp to correct regexp engine. */
5740     DEBUG_COMPILE_r({
5741         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5742                         PTR2UV(eng));
5743     });
5744     return CALLREGCOMP_ENG(eng, pattern, flags);
5745 }
5746 #endif
5747
5748 /* public(ish) entry point for the perl core's own regex compiling code.
5749  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5750  * pattern rather than a list of OPs, and uses the internal engine rather
5751  * than the current one */
5752
5753 REGEXP *
5754 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5755 {
5756     SV *pat = pattern; /* defeat constness! */
5757     PERL_ARGS_ASSERT_RE_COMPILE;
5758     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5759 #ifdef PERL_IN_XSUB_RE
5760                                 &my_reg_engine,
5761 #else
5762                                 &PL_core_reg_engine,
5763 #endif
5764                                 NULL, NULL, rx_flags, 0);
5765 }
5766
5767
5768 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5769  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5770  * point to the realloced string and length.
5771  *
5772  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5773  * stuff added */
5774
5775 static void
5776 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5777                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5778 {
5779     U8 *const src = (U8*)*pat_p;
5780     U8 *dst, *d;
5781     int n=0;
5782     STRLEN s = 0;
5783     bool do_end = 0;
5784     GET_RE_DEBUG_FLAGS_DECL;
5785
5786     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5787         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5788
5789     Newx(dst, *plen_p * 2 + 1, U8);
5790     d = dst;
5791
5792     while (s < *plen_p) {
5793         append_utf8_from_native_byte(src[s], &d);
5794         if (n < num_code_blocks) {
5795             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5796                 pRExC_state->code_blocks[n].start = d - dst - 1;
5797                 assert(*(d - 1) == '(');
5798                 do_end = 1;
5799             }
5800             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5801                 pRExC_state->code_blocks[n].end = d - dst - 1;
5802                 assert(*(d - 1) == ')');
5803                 do_end = 0;
5804                 n++;
5805             }
5806         }
5807         s++;
5808     }
5809     *d = '\0';
5810     *plen_p = d - dst;
5811     *pat_p = (char*) dst;
5812     SAVEFREEPV(*pat_p);
5813     RExC_orig_utf8 = RExC_utf8 = 1;
5814 }
5815
5816
5817
5818 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5819  * while recording any code block indices, and handling overloading,
5820  * nested qr// objects etc.  If pat is null, it will allocate a new
5821  * string, or just return the first arg, if there's only one.
5822  *
5823  * Returns the malloced/updated pat.
5824  * patternp and pat_count is the array of SVs to be concatted;
5825  * oplist is the optional list of ops that generated the SVs;
5826  * recompile_p is a pointer to a boolean that will be set if
5827  *   the regex will need to be recompiled.
5828  * delim, if non-null is an SV that will be inserted between each element
5829  */
5830
5831 static SV*
5832 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5833                 SV *pat, SV ** const patternp, int pat_count,
5834                 OP *oplist, bool *recompile_p, SV *delim)
5835 {
5836     SV **svp;
5837     int n = 0;
5838     bool use_delim = FALSE;
5839     bool alloced = FALSE;
5840
5841     /* if we know we have at least two args, create an empty string,
5842      * then concatenate args to that. For no args, return an empty string */
5843     if (!pat && pat_count != 1) {
5844         pat = newSVpvs("");
5845         SAVEFREESV(pat);
5846         alloced = TRUE;
5847     }
5848
5849     for (svp = patternp; svp < patternp + pat_count; svp++) {
5850         SV *sv;
5851         SV *rx  = NULL;
5852         STRLEN orig_patlen = 0;
5853         bool code = 0;
5854         SV *msv = use_delim ? delim : *svp;
5855         if (!msv) msv = &PL_sv_undef;
5856
5857         /* if we've got a delimiter, we go round the loop twice for each
5858          * svp slot (except the last), using the delimiter the second
5859          * time round */
5860         if (use_delim) {
5861             svp--;
5862             use_delim = FALSE;
5863         }
5864         else if (delim)
5865             use_delim = TRUE;
5866
5867         if (SvTYPE(msv) == SVt_PVAV) {
5868             /* we've encountered an interpolated array within
5869              * the pattern, e.g. /...@a..../. Expand the list of elements,
5870              * then recursively append elements.
5871              * The code in this block is based on S_pushav() */
5872
5873             AV *const av = (AV*)msv;
5874             const SSize_t maxarg = AvFILL(av) + 1;
5875             SV **array;
5876
5877             if (oplist) {
5878                 assert(oplist->op_type == OP_PADAV
5879                     || oplist->op_type == OP_RV2AV);
5880                 oplist = OpSIBLING(oplist);
5881             }
5882
5883             if (SvRMAGICAL(av)) {
5884                 SSize_t i;
5885
5886                 Newx(array, maxarg, SV*);
5887                 SAVEFREEPV(array);
5888                 for (i=0; i < maxarg; i++) {
5889                     SV ** const svp = av_fetch(av, i, FALSE);
5890                     array[i] = svp ? *svp : &PL_sv_undef;
5891                 }
5892             }
5893             else
5894                 array = AvARRAY(av);
5895
5896             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5897                                 array, maxarg, NULL, recompile_p,
5898                                 /* $" */
5899                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5900
5901             continue;
5902         }
5903
5904
5905         /* we make the assumption here that each op in the list of
5906          * op_siblings maps to one SV pushed onto the stack,
5907          * except for code blocks, with have both an OP_NULL and
5908          * and OP_CONST.
5909          * This allows us to match up the list of SVs against the
5910          * list of OPs to find the next code block.
5911          *
5912          * Note that       PUSHMARK PADSV PADSV ..
5913          * is optimised to
5914          *                 PADRANGE PADSV  PADSV  ..
5915          * so the alignment still works. */
5916
5917         if (oplist) {
5918             if (oplist->op_type == OP_NULL
5919                 && (oplist->op_flags & OPf_SPECIAL))
5920             {
5921                 assert(n < pRExC_state->num_code_blocks);
5922                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5923                 pRExC_state->code_blocks[n].block = oplist;
5924                 pRExC_state->code_blocks[n].src_regex = NULL;
5925                 n++;
5926                 code = 1;
5927                 oplist = OpSIBLING(oplist); /* skip CONST */
5928                 assert(oplist);
5929             }
5930             oplist = OpSIBLING(oplist);;
5931         }
5932
5933         /* apply magic and QR overloading to arg */
5934
5935         SvGETMAGIC(msv);
5936         if (SvROK(msv) && SvAMAGIC(msv)) {
5937             SV *sv = AMG_CALLunary(msv, regexp_amg);
5938             if (sv) {
5939                 if (SvROK(sv))
5940                     sv = SvRV(sv);
5941                 if (SvTYPE(sv) != SVt_REGEXP)
5942                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5943                 msv = sv;
5944             }
5945         }
5946
5947         /* try concatenation overload ... */
5948         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5949                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5950         {
5951             sv_setsv(pat, sv);
5952             /* overloading involved: all bets are off over literal
5953              * code. Pretend we haven't seen it */
5954             pRExC_state->num_code_blocks -= n;
5955             n = 0;
5956         }
5957         else  {
5958             /* ... or failing that, try "" overload */
5959             while (SvAMAGIC(msv)
5960                     && (sv = AMG_CALLunary(msv, string_amg))
5961                     && sv != msv
5962                     &&  !(   SvROK(msv)
5963                           && SvROK(sv)
5964                           && SvRV(msv) == SvRV(sv))
5965             ) {
5966                 msv = sv;
5967                 SvGETMAGIC(msv);
5968             }
5969             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5970                 msv = SvRV(msv);
5971
5972             if (pat) {
5973                 /* this is a partially unrolled
5974                  *     sv_catsv_nomg(pat, msv);
5975                  * that allows us to adjust code block indices if
5976                  * needed */
5977                 STRLEN dlen;
5978                 char *dst = SvPV_force_nomg(pat, dlen);
5979                 orig_patlen = dlen;
5980                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5981                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5982                     sv_setpvn(pat, dst, dlen);
5983                     SvUTF8_on(pat);
5984                 }
5985                 sv_catsv_nomg(pat, msv);
5986                 rx = msv;
5987             }
5988             else
5989                 pat = msv;
5990
5991             if (code)
5992                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5993         }
5994
5995         /* extract any code blocks within any embedded qr//'s */
5996         if (rx && SvTYPE(rx) == SVt_REGEXP
5997             && RX_ENGINE((REGEXP*)rx)->op_comp)
5998         {
5999
6000             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6001             if (ri->num_code_blocks) {
6002                 int i;
6003                 /* the presence of an embedded qr// with code means
6004                  * we should always recompile: the text of the
6005                  * qr// may not have changed, but it may be a
6006                  * different closure than last time */
6007                 *recompile_p = 1;
6008                 Renew(pRExC_state->code_blocks,
6009                     pRExC_state->num_code_blocks + ri->num_code_blocks,
6010                     struct reg_code_block);
6011                 pRExC_state->num_code_blocks += ri->num_code_blocks;
6012
6013                 for (i=0; i < ri->num_code_blocks; i++) {
6014                     struct reg_code_block *src, *dst;
6015                     STRLEN offset =  orig_patlen
6016                         + ReANY((REGEXP *)rx)->pre_prefix;
6017                     assert(n < pRExC_state->num_code_blocks);
6018                     src = &ri->code_blocks[i];
6019                     dst = &pRExC_state->code_blocks[n];
6020                     dst->start      = src->start + offset;
6021                     dst->end        = src->end   + offset;
6022                     dst->block      = src->block;
6023                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6024                                             src->src_regex
6025                                                 ? src->src_regex
6026                                                 : (REGEXP*)rx);
6027                     n++;
6028                 }
6029             }
6030         }
6031     }
6032     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6033     if (alloced)
6034         SvSETMAGIC(pat);
6035
6036     return pat;
6037 }
6038
6039
6040
6041 /* see if there are any run-time code blocks in the pattern.
6042  * False positives are allowed */
6043
6044 static bool
6045 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6046                     char *pat, STRLEN plen)
6047 {
6048     int n = 0;
6049     STRLEN s;
6050
6051     PERL_UNUSED_CONTEXT;
6052
6053     for (s = 0; s < plen; s++) {
6054         if (n < pRExC_state->num_code_blocks
6055             && s == pRExC_state->code_blocks[n].start)
6056         {
6057             s = pRExC_state->code_blocks[n].end;
6058             n++;
6059             continue;
6060         }
6061         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6062          * positives here */
6063         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6064             (pat[s+2] == '{'
6065                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6066         )
6067             return 1;
6068     }
6069     return 0;
6070 }
6071
6072 /* Handle run-time code blocks. We will already have compiled any direct
6073  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6074  * copy of it, but with any literal code blocks blanked out and
6075  * appropriate chars escaped; then feed it into
6076  *
6077  *    eval "qr'modified_pattern'"
6078  *
6079  * For example,
6080  *
6081  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6082  *
6083  * becomes
6084  *
6085  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6086  *
6087  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6088  * and merge them with any code blocks of the original regexp.
6089  *
6090  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6091  * instead, just save the qr and return FALSE; this tells our caller that
6092  * the original pattern needs upgrading to utf8.
6093  */
6094
6095 static bool
6096 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6097     char *pat, STRLEN plen)
6098 {
6099     SV *qr;
6100
6101     GET_RE_DEBUG_FLAGS_DECL;
6102
6103     if (pRExC_state->runtime_code_qr) {
6104         /* this is the second time we've been called; this should
6105          * only happen if the main pattern got upgraded to utf8
6106          * during compilation; re-use the qr we compiled first time
6107          * round (which should be utf8 too)
6108          */
6109         qr = pRExC_state->runtime_code_qr;
6110         pRExC_state->runtime_code_qr = NULL;
6111         assert(RExC_utf8 && SvUTF8(qr));
6112     }
6113     else {
6114         int n = 0;
6115         STRLEN s;
6116         char *p, *newpat;
6117         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6118         SV *sv, *qr_ref;
6119         dSP;
6120
6121         /* determine how many extra chars we need for ' and \ escaping */
6122         for (s = 0; s < plen; s++) {
6123             if (pat[s] == '\'' || pat[s] == '\\')
6124                 newlen++;
6125         }
6126
6127         Newx(newpat, newlen, char);
6128         p = newpat;
6129         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6130
6131         for (s = 0; s < plen; s++) {
6132             if (n < pRExC_state->num_code_blocks
6133                 && s == pRExC_state->code_blocks[n].start)
6134             {
6135                 /* blank out literal code block */
6136                 assert(pat[s] == '(');
6137                 while (s <= pRExC_state->code_blocks[n].end) {
6138                     *p++ = '_';
6139                     s++;
6140                 }
6141                 s--;
6142                 n++;
6143                 continue;
6144             }
6145             if (pat[s] == '\'' || pat[s] == '\\')
6146                 *p++ = '\\';
6147             *p++ = pat[s];
6148         }
6149         *p++ = '\'';
6150         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6151             *p++ = 'x';
6152         *p++ = '\0';
6153         DEBUG_COMPILE_r({
6154             PerlIO_printf(Perl_debug_log,
6155                 "%sre-parsing pattern for runtime code:%s %s\n",
6156                 PL_colors[4],PL_colors[5],newpat);
6157         });
6158
6159         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6160         Safefree(newpat);
6161
6162         ENTER;
6163         SAVETMPS;
6164         PUSHSTACKi(PERLSI_REQUIRE);
6165         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6166          * parsing qr''; normally only q'' does this. It also alters
6167          * hints handling */
6168         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6169         SvREFCNT_dec_NN(sv);
6170         SPAGAIN;
6171         qr_ref = POPs;
6172         PUTBACK;
6173         {
6174             SV * const errsv = ERRSV;
6175             if (SvTRUE_NN(errsv))
6176             {
6177                 Safefree(pRExC_state->code_blocks);
6178                 /* use croak_sv ? */
6179                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6180             }
6181         }
6182         assert(SvROK(qr_ref));
6183         qr = SvRV(qr_ref);
6184         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6185         /* the leaving below frees the tmp qr_ref.
6186          * Give qr a life of its own */
6187         SvREFCNT_inc(qr);
6188         POPSTACK;
6189         FREETMPS;
6190         LEAVE;
6191
6192     }
6193
6194     if (!RExC_utf8 && SvUTF8(qr)) {
6195         /* first time through; the pattern got upgraded; save the
6196          * qr for the next time through */
6197         assert(!pRExC_state->runtime_code_qr);
6198         pRExC_state->runtime_code_qr = qr;
6199         return 0;
6200     }
6201
6202
6203     /* extract any code blocks within the returned qr//  */
6204
6205
6206     /* merge the main (r1) and run-time (r2) code blocks into one */
6207     {
6208         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6209         struct reg_code_block *new_block, *dst;
6210         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6211         int i1 = 0, i2 = 0;
6212
6213         if (!r2->num_code_blocks) /* we guessed wrong */
6214         {
6215             SvREFCNT_dec_NN(qr);
6216             return 1;
6217         }
6218
6219         Newx(new_block,
6220             r1->num_code_blocks + r2->num_code_blocks,
6221             struct reg_code_block);
6222         dst = new_block;
6223
6224         while (    i1 < r1->num_code_blocks
6225                 || i2 < r2->num_code_blocks)
6226         {
6227             struct reg_code_block *src;
6228             bool is_qr = 0;
6229
6230             if (i1 == r1->num_code_blocks) {
6231                 src = &r2->code_blocks[i2++];
6232                 is_qr = 1;
6233             }
6234             else if (i2 == r2->num_code_blocks)
6235                 src = &r1->code_blocks[i1++];
6236             else if (  r1->code_blocks[i1].start
6237                      < r2->code_blocks[i2].start)
6238             {
6239                 src = &r1->code_blocks[i1++];
6240                 assert(src->end < r2->code_blocks[i2].start);
6241             }
6242             else {
6243                 assert(  r1->code_blocks[i1].start
6244                        > r2->code_blocks[i2].start);
6245                 src = &r2->code_blocks[i2++];
6246                 is_qr = 1;
6247                 assert(src->end < r1->code_blocks[i1].start);
6248             }
6249
6250             assert(pat[src->start] == '(');
6251             assert(pat[src->end]   == ')');
6252             dst->start      = src->start;
6253             dst->end        = src->end;
6254             dst->block      = src->block;
6255             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6256                                     : src->src_regex;
6257             dst++;
6258         }
6259         r1->num_code_blocks += r2->num_code_blocks;
6260         Safefree(r1->code_blocks);
6261         r1->code_blocks = new_block;
6262     }
6263
6264     SvREFCNT_dec_NN(qr);
6265     return 1;
6266 }
6267
6268
6269 STATIC bool
6270 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6271                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6272                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6273                       STRLEN longest_length, bool eol, bool meol)
6274 {
6275     /* This is the common code for setting up the floating and fixed length
6276      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6277      * as to whether succeeded or not */
6278
6279     I32 t;
6280     SSize_t ml;
6281
6282     if (! (longest_length
6283            || (eol /* Can't have SEOL and MULTI */
6284                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6285           )
6286             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6287         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6288     {
6289         return FALSE;
6290     }
6291
6292     /* copy the information about the longest from the reg_scan_data
6293         over to the program. */
6294     if (SvUTF8(sv_longest)) {
6295         *rx_utf8 = sv_longest;
6296         *rx_substr = NULL;
6297     } else {
6298         *rx_substr = sv_longest;
6299         *rx_utf8 = NULL;
6300     }
6301     /* end_shift is how many chars that must be matched that
6302         follow this item. We calculate it ahead of time as once the
6303         lookbehind offset is added in we lose the ability to correctly
6304         calculate it.*/
6305     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6306     *rx_end_shift = ml - offset
6307         - longest_length + (SvTAIL(sv_longest) != 0)
6308         + lookbehind;
6309
6310     t = (eol/* Can't have SEOL and MULTI */
6311          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6312     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6313
6314     return TRUE;
6315 }
6316
6317 /*
6318  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6319  * regular expression into internal code.
6320  * The pattern may be passed either as:
6321  *    a list of SVs (patternp plus pat_count)
6322  *    a list of OPs (expr)
6323  * If both are passed, the SV list is used, but the OP list indicates
6324  * which SVs are actually pre-compiled code blocks
6325  *
6326  * The SVs in the list have magic and qr overloading applied to them (and
6327  * the list may be modified in-place with replacement SVs in the latter
6328  * case).
6329  *
6330  * If the pattern hasn't changed from old_re, then old_re will be
6331  * returned.
6332  *
6333  * eng is the current engine. If that engine has an op_comp method, then
6334  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6335  * do the initial concatenation of arguments and pass on to the external
6336  * engine.
6337  *
6338  * If is_bare_re is not null, set it to a boolean indicating whether the
6339  * arg list reduced (after overloading) to a single bare regex which has
6340  * been returned (i.e. /$qr/).
6341  *
6342  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6343  *
6344  * pm_flags contains the PMf_* flags, typically based on those from the
6345  * pm_flags field of the related PMOP. Currently we're only interested in
6346  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6347  *
6348  * We can't allocate space until we know how big the compiled form will be,
6349  * but we can't compile it (and thus know how big it is) until we've got a
6350  * place to put the code.  So we cheat:  we compile it twice, once with code
6351  * generation turned off and size counting turned on, and once "for real".
6352  * This also means that we don't allocate space until we are sure that the
6353  * thing really will compile successfully, and we never have to move the
6354  * code and thus invalidate pointers into it.  (Note that it has to be in
6355  * one piece because free() must be able to free it all.) [NB: not true in perl]
6356  *
6357  * Beware that the optimization-preparation code in here knows about some
6358  * of the structure of the compiled regexp.  [I'll say.]
6359  */
6360
6361 REGEXP *
6362 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6363                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6364                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6365 {
6366     REGEXP *rx;
6367     struct regexp *r;
6368     regexp_internal *ri;
6369     STRLEN plen;
6370     char *exp;
6371     regnode *scan;
6372     I32 flags;
6373     SSize_t minlen = 0;
6374     U32 rx_flags;
6375     SV *pat;
6376     SV *code_blocksv = NULL;
6377     SV** new_patternp = patternp;
6378
6379     /* these are all flags - maybe they should be turned
6380      * into a single int with different bit masks */
6381     I32 sawlookahead = 0;
6382     I32 sawplus = 0;
6383     I32 sawopen = 0;
6384     I32 sawminmod = 0;
6385
6386     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6387     bool recompile = 0;
6388     bool runtime_code = 0;
6389     scan_data_t data;
6390     RExC_state_t RExC_state;
6391     RExC_state_t * const pRExC_state = &RExC_state;
6392 #ifdef TRIE_STUDY_OPT
6393     int restudied = 0;
6394     RExC_state_t copyRExC_state;
6395 #endif
6396     GET_RE_DEBUG_FLAGS_DECL;
6397
6398     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6399
6400     DEBUG_r(if (!PL_colorset) reginitcolors());
6401
6402 #ifndef PERL_IN_XSUB_RE
6403     /* Initialize these here instead of as-needed, as is quick and avoids
6404      * having to test them each time otherwise */
6405     if (! PL_AboveLatin1) {
6406         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6407         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6408         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6409         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6410         PL_HasMultiCharFold =
6411                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6412
6413         /* This is calculated here, because the Perl program that generates the
6414          * static global ones doesn't currently have access to
6415          * NUM_ANYOF_CODE_POINTS */
6416         PL_InBitmap = _new_invlist(2);
6417         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6418                                                     NUM_ANYOF_CODE_POINTS - 1);
6419     }
6420 #endif
6421
6422     pRExC_state->code_blocks = NULL;
6423     pRExC_state->num_code_blocks = 0;
6424
6425     if (is_bare_re)
6426         *is_bare_re = FALSE;
6427
6428     if (expr && (expr->op_type == OP_LIST ||
6429                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6430         /* allocate code_blocks if needed */
6431         OP *o;
6432         int ncode = 0;
6433
6434         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6435             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6436                 ncode++; /* count of DO blocks */
6437         if (ncode) {
6438             pRExC_state->num_code_blocks = ncode;
6439             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6440         }
6441     }
6442
6443     if (!pat_count) {
6444         /* compile-time pattern with just OP_CONSTs and DO blocks */
6445
6446         int n;
6447         OP *o;
6448
6449         /* find how many CONSTs there are */
6450         assert(expr);
6451         n = 0;
6452         if (expr->op_type == OP_CONST)
6453             n = 1;
6454         else
6455             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6456                 if (o->op_type == OP_CONST)
6457                     n++;
6458             }
6459
6460         /* fake up an SV array */
6461
6462         assert(!new_patternp);
6463         Newx(new_patternp, n, SV*);
6464         SAVEFREEPV(new_patternp);
6465         pat_count = n;
6466
6467         n = 0;
6468         if (expr->op_type == OP_CONST)
6469             new_patternp[n] = cSVOPx_sv(expr);
6470         else
6471             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6472                 if (o->op_type == OP_CONST)
6473                     new_patternp[n++] = cSVOPo_sv;
6474             }
6475
6476     }
6477
6478     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6479         "Assembling pattern from %d elements%s\n", pat_count,
6480             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6481
6482     /* set expr to the first arg op */
6483
6484     if (pRExC_state->num_code_blocks
6485          && expr->op_type != OP_CONST)
6486     {
6487             expr = cLISTOPx(expr)->op_first;
6488             assert(   expr->op_type == OP_PUSHMARK
6489                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6490                    || expr->op_type == OP_PADRANGE);
6491             expr = OpSIBLING(expr);
6492     }
6493
6494     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6495                         expr, &recompile, NULL);
6496
6497     /* handle bare (possibly after overloading) regex: foo =~ $re */
6498     {
6499         SV *re = pat;
6500         if (SvROK(re))
6501             re = SvRV(re);
6502         if (SvTYPE(re) == SVt_REGEXP) {
6503             if (is_bare_re)
6504                 *is_bare_re = TRUE;
6505             SvREFCNT_inc(re);
6506             Safefree(pRExC_state->code_blocks);
6507             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6508                 "Precompiled pattern%s\n",
6509                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6510
6511             return (REGEXP*)re;
6512         }
6513     }
6514
6515     exp = SvPV_nomg(pat, plen);
6516
6517     if (!eng->op_comp) {
6518         if ((SvUTF8(pat) && IN_BYTES)
6519                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6520         {
6521             /* make a temporary copy; either to convert to bytes,
6522              * or to avoid repeating get-magic / overloaded stringify */
6523             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6524                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6525         }
6526         Safefree(pRExC_state->code_blocks);
6527         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6528     }
6529
6530     /* ignore the utf8ness if the pattern is 0 length */
6531     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6532     RExC_uni_semantics = 0;
6533     RExC_contains_locale = 0;
6534     RExC_contains_i = 0;
6535     pRExC_state->runtime_code_qr = NULL;
6536     RExC_frame_head= NULL;
6537     RExC_frame_last= NULL;
6538     RExC_frame_count= 0;
6539
6540     DEBUG_r({
6541         RExC_mysv1= sv_newmortal();
6542         RExC_mysv2= sv_newmortal();
6543     });
6544     DEBUG_COMPILE_r({
6545             SV *dsv= sv_newmortal();
6546             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6547             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6548                           PL_colors[4],PL_colors[5],s);
6549         });
6550
6551   redo_first_pass:
6552     /* we jump here if we upgrade the pattern to utf8 and have to
6553      * recompile */
6554
6555     if ((pm_flags & PMf_USE_RE_EVAL)
6556                 /* this second condition covers the non-regex literal case,
6557                  * i.e.  $foo =~ '(?{})'. */
6558                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6559     )
6560         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6561
6562     /* return old regex if pattern hasn't changed */
6563     /* XXX: note in the below we have to check the flags as well as the
6564      * pattern.
6565      *
6566      * Things get a touch tricky as we have to compare the utf8 flag
6567      * independently from the compile flags.  */
6568
6569     if (   old_re
6570         && !recompile
6571         && !!RX_UTF8(old_re) == !!RExC_utf8
6572         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6573         && RX_PRECOMP(old_re)
6574         && RX_PRELEN(old_re) == plen
6575         && memEQ(RX_PRECOMP(old_re), exp, plen)
6576         && !runtime_code /* with runtime code, always recompile */ )
6577     {
6578         Safefree(pRExC_state->code_blocks);
6579         return old_re;
6580     }
6581
6582     rx_flags = orig_rx_flags;
6583
6584     if (rx_flags & PMf_FOLD) {
6585         RExC_contains_i = 1;
6586     }
6587     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6588
6589         /* Set to use unicode semantics if the pattern is in utf8 and has the
6590          * 'depends' charset specified, as it means unicode when utf8  */
6591         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6592     }
6593
6594     RExC_precomp = exp;
6595     RExC_flags = rx_flags;
6596     RExC_pm_flags = pm_flags;
6597
6598     if (runtime_code) {
6599         if (TAINTING_get && TAINT_get)
6600             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6601
6602         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6603             /* whoops, we have a non-utf8 pattern, whilst run-time code
6604              * got compiled as utf8. Try again with a utf8 pattern */
6605             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6606                                     pRExC_state->num_code_blocks);
6607             goto redo_first_pass;
6608         }
6609     }
6610     assert(!pRExC_state->runtime_code_qr);
6611
6612     RExC_sawback = 0;
6613
6614     RExC_seen = 0;
6615     RExC_maxlen = 0;
6616     RExC_in_lookbehind = 0;
6617     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6618     RExC_extralen = 0;
6619     RExC_override_recoding = 0;
6620     RExC_in_multi_char_class = 0;
6621
6622     /* First pass: determine size, legality. */
6623     RExC_parse = exp;
6624     RExC_start = exp;
6625     RExC_end = exp + plen;
6626     RExC_naughty = 0;
6627     RExC_npar = 1;
6628     RExC_nestroot = 0;
6629     RExC_size = 0L;
6630     RExC_emit = (regnode *) &RExC_emit_dummy;
6631     RExC_whilem_seen = 0;
6632     RExC_open_parens = NULL;
6633     RExC_close_parens = NULL;
6634     RExC_opend = NULL;
6635     RExC_paren_names = NULL;
6636 #ifdef DEBUGGING
6637     RExC_paren_name_list = NULL;
6638 #endif
6639     RExC_recurse = NULL;
6640     RExC_study_chunk_recursed = NULL;
6641     RExC_study_chunk_recursed_bytes= 0;
6642     RExC_recurse_count = 0;
6643     pRExC_state->code_index = 0;
6644
6645     DEBUG_PARSE_r(
6646         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6647         RExC_lastnum=0;
6648         RExC_lastparse=NULL;
6649     );
6650     /* reg may croak on us, not giving us a chance to free
6651        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6652        need it to survive as long as the regexp (qr/(?{})/).
6653        We must check that code_blocksv is not already set, because we may
6654        have jumped back to restart the sizing pass. */
6655     if (pRExC_state->code_blocks && !code_blocksv) {
6656         code_blocksv = newSV_type(SVt_PV);
6657         SAVEFREESV(code_blocksv);
6658         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6659         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6660     }
6661     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6662         /* It's possible to write a regexp in ascii that represents Unicode
6663         codepoints outside of the byte range, such as via \x{100}. If we
6664         detect such a sequence we have to convert the entire pattern to utf8
6665         and then recompile, as our sizing calculation will have been based
6666         on 1 byte == 1 character, but we will need to use utf8 to encode
6667         at least some part of the pattern, and therefore must convert the whole
6668         thing.
6669         -- dmq */
6670         if (flags & RESTART_UTF8) {
6671             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6672                                     pRExC_state->num_code_blocks);
6673             goto redo_first_pass;
6674         }
6675         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6676     }
6677     if (code_blocksv)
6678         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6679
6680     DEBUG_PARSE_r({
6681         PerlIO_printf(Perl_debug_log,
6682             "Required size %"IVdf" nodes\n"
6683             "Starting second pass (creation)\n",
6684             (IV)RExC_size);
6685         RExC_lastnum=0;
6686         RExC_lastparse=NULL;
6687     });
6688
6689     /* The first pass could have found things that force Unicode semantics */
6690     if ((RExC_utf8 || RExC_uni_semantics)
6691          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6692     {
6693         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6694     }
6695
6696     /* Small enough for pointer-storage convention?
6697        If extralen==0, this means that we will not need long jumps. */
6698     if (RExC_size >= 0x10000L && RExC_extralen)
6699         RExC_size += RExC_extralen;
6700     else
6701         RExC_extralen = 0;
6702     if (RExC_whilem_seen > 15)
6703         RExC_whilem_seen = 15;
6704
6705     /* Allocate space and zero-initialize. Note, the two step process
6706        of zeroing when in debug mode, thus anything assigned has to
6707        happen after that */
6708     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6709     r = ReANY(rx);
6710     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6711          char, regexp_internal);
6712     if ( r == NULL || ri == NULL )
6713         FAIL("Regexp out of space");
6714 #ifdef DEBUGGING
6715     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6716     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6717          char);
6718 #else
6719     /* bulk initialize base fields with 0. */
6720     Zero(ri, sizeof(regexp_internal), char);
6721 #endif
6722
6723     /* non-zero initialization begins here */
6724     RXi_SET( r, ri );
6725     r->engine= eng;
6726     r->extflags = rx_flags;
6727     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6728
6729     if (pm_flags & PMf_IS_QR) {
6730         ri->code_blocks = pRExC_state->code_blocks;
6731         ri->num_code_blocks = pRExC_state->num_code_blocks;
6732     }
6733     else
6734     {
6735         int n;
6736         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6737             if (pRExC_state->code_blocks[n].src_regex)
6738                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6739         SAVEFREEPV(pRExC_state->code_blocks);
6740     }
6741
6742     {
6743         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6744         bool has_charset = (get_regex_charset(r->extflags)
6745                                                     != REGEX_DEPENDS_CHARSET);
6746
6747         /* The caret is output if there are any defaults: if not all the STD
6748          * flags are set, or if no character set specifier is needed */
6749         bool has_default =
6750                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6751                     || ! has_charset);
6752         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6753                                                    == REG_RUN_ON_COMMENT_SEEN);
6754         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6755                             >> RXf_PMf_STD_PMMOD_SHIFT);
6756         const char *fptr = STD_PAT_MODS;        /*"msixn"*/
6757         char *p;
6758         /* Allocate for the worst case, which is all the std flags are turned
6759          * on.  If more precision is desired, we could do a population count of
6760          * the flags set.  This could be done with a small lookup table, or by
6761          * shifting, masking and adding, or even, when available, assembly
6762          * language for a machine-language population count.
6763          * We never output a minus, as all those are defaults, so are
6764          * covered by the caret */
6765         const STRLEN wraplen = plen + has_p + has_runon
6766             + has_default       /* If needs a caret */
6767
6768                 /* If needs a character set specifier */
6769             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6770             + (sizeof(STD_PAT_MODS) - 1)
6771             + (sizeof("(?:)") - 1);
6772
6773         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6774         r->xpv_len_u.xpvlenu_pv = p;
6775         if (RExC_utf8)
6776             SvFLAGS(rx) |= SVf_UTF8;
6777         *p++='('; *p++='?';
6778
6779         /* If a default, cover it using the caret */
6780         if (has_default) {
6781             *p++= DEFAULT_PAT_MOD;
6782         }
6783         if (has_charset) {
6784             STRLEN len;
6785             const char* const name = get_regex_charset_name(r->extflags, &len);
6786             Copy(name, p, len, char);
6787             p += len;
6788         }
6789         if (has_p)
6790             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6791         {
6792             char ch;
6793             while((ch = *fptr++)) {
6794                 if(reganch & 1)
6795                     *p++ = ch;
6796                 reganch >>= 1;
6797             }
6798         }
6799
6800         *p++ = ':';
6801         Copy(RExC_precomp, p, plen, char);
6802         assert ((RX_WRAPPED(rx) - p) < 16);
6803         r->pre_prefix = p - RX_WRAPPED(rx);
6804         p += plen;
6805         if (has_runon)
6806             *p++ = '\n';
6807         *p++ = ')';
6808         *p = 0;
6809         SvCUR_set(rx, p - RX_WRAPPED(rx));
6810     }
6811
6812     r->intflags = 0;
6813     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6814
6815     /* setup various meta data about recursion, this all requires
6816      * RExC_npar to be correctly set, and a bit later on we clear it */
6817     if (RExC_seen & REG_RECURSE_SEEN) {
6818         Newxz(RExC_open_parens, RExC_npar,regnode *);
6819         SAVEFREEPV(RExC_open_parens);
6820         Newxz(RExC_close_parens,RExC_npar,regnode *);
6821         SAVEFREEPV(RExC_close_parens);
6822     }
6823     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6824         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6825          * So its 1 if there are no parens. */
6826         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6827                                          ((RExC_npar & 0x07) != 0);
6828         Newx(RExC_study_chunk_recursed,
6829              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6830         SAVEFREEPV(RExC_study_chunk_recursed);
6831     }
6832
6833     /* Useful during FAIL. */
6834 #ifdef RE_TRACK_PATTERN_OFFSETS
6835     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6836     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6837                           "%s %"UVuf" bytes for offset annotations.\n",
6838                           ri->u.offsets ? "Got" : "Couldn't get",
6839                           (UV)((2*RExC_size+1) * sizeof(U32))));
6840 #endif
6841     SetProgLen(ri,RExC_size);
6842     RExC_rx_sv = rx;
6843     RExC_rx = r;
6844     RExC_rxi = ri;
6845
6846     /* Second pass: emit code. */
6847     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6848     RExC_pm_flags = pm_flags;
6849     RExC_parse = exp;
6850     RExC_end = exp + plen;
6851     RExC_naughty = 0;
6852     RExC_npar = 1;
6853     RExC_emit_start = ri->program;
6854     RExC_emit = ri->program;
6855     RExC_emit_bound = ri->program + RExC_size + 1;
6856     pRExC_state->code_index = 0;
6857
6858     *((char*) RExC_emit++) = (char) REG_MAGIC;
6859     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6860         ReREFCNT_dec(rx);
6861         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6862     }
6863     /* XXXX To minimize changes to RE engine we always allocate
6864        3-units-long substrs field. */
6865     Newx(r->substrs, 1, struct reg_substr_data);
6866     if (RExC_recurse_count) {
6867         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6868         SAVEFREEPV(RExC_recurse);
6869     }
6870
6871 reStudy:
6872     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6873     DEBUG_r(
6874         RExC_study_chunk_recursed_count= 0;
6875     );
6876     Zero(r->substrs, 1, struct reg_substr_data);
6877     if (RExC_study_chunk_recursed) {
6878         Zero(RExC_study_chunk_recursed,
6879              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6880     }
6881
6882
6883 #ifdef TRIE_STUDY_OPT
6884     if (!restudied) {
6885         StructCopy(&zero_scan_data, &data, scan_data_t);
6886         copyRExC_state = RExC_state;
6887     } else {
6888         U32 seen=RExC_seen;
6889         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6890
6891         RExC_state = copyRExC_state;
6892         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6893             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6894         else
6895             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6896         StructCopy(&zero_scan_data, &data, scan_data_t);
6897     }
6898 #else
6899     StructCopy(&zero_scan_data, &data, scan_data_t);
6900 #endif
6901
6902     /* Dig out information for optimizations. */
6903     r->extflags = RExC_flags; /* was pm_op */
6904     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6905
6906     if (UTF)
6907         SvUTF8_on(rx);  /* Unicode in it? */
6908     ri->regstclass = NULL;
6909     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
6910         r->intflags |= PREGf_NAUGHTY;
6911     scan = ri->program + 1;             /* First BRANCH. */
6912
6913     /* testing for BRANCH here tells us whether there is "must appear"
6914        data in the pattern. If there is then we can use it for optimisations */
6915     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6916                                                   */
6917         SSize_t fake;
6918         STRLEN longest_float_length, longest_fixed_length;
6919         regnode_ssc ch_class; /* pointed to by data */
6920         int stclass_flag;
6921         SSize_t last_close = 0; /* pointed to by data */
6922         regnode *first= scan;
6923         regnode *first_next= regnext(first);
6924         /*
6925          * Skip introductions and multiplicators >= 1
6926          * so that we can extract the 'meat' of the pattern that must
6927          * match in the large if() sequence following.
6928          * NOTE that EXACT is NOT covered here, as it is normally
6929          * picked up by the optimiser separately.
6930          *
6931          * This is unfortunate as the optimiser isnt handling lookahead
6932          * properly currently.
6933          *
6934          */
6935         while ((OP(first) == OPEN && (sawopen = 1)) ||
6936                /* An OR of *one* alternative - should not happen now. */
6937             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6938             /* for now we can't handle lookbehind IFMATCH*/
6939             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6940             (OP(first) == PLUS) ||
6941             (OP(first) == MINMOD) ||
6942                /* An {n,m} with n>0 */
6943             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6944             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6945         {
6946                 /*
6947                  * the only op that could be a regnode is PLUS, all the rest
6948                  * will be regnode_1 or regnode_2.
6949                  *
6950                  * (yves doesn't think this is true)
6951                  */
6952                 if (OP(first) == PLUS)
6953                     sawplus = 1;
6954                 else {
6955                     if (OP(first) == MINMOD)
6956                         sawminmod = 1;
6957                     first += regarglen[OP(first)];
6958                 }
6959                 first = NEXTOPER(first);
6960                 first_next= regnext(first);
6961         }
6962
6963         /* Starting-point info. */
6964       again:
6965         DEBUG_PEEP("first:",first,0);
6966         /* Ignore EXACT as we deal with it later. */
6967         if (PL_regkind[OP(first)] == EXACT) {
6968             if (OP(first) == EXACT || OP(first) == EXACTL)
6969                 NOOP;   /* Empty, get anchored substr later. */
6970             else
6971                 ri->regstclass = first;
6972         }
6973 #ifdef TRIE_STCLASS
6974         else if (PL_regkind[OP(first)] == TRIE &&
6975                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6976         {
6977             /* this can happen only on restudy */
6978             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6979         }
6980 #endif
6981         else if (REGNODE_SIMPLE(OP(first)))
6982             ri->regstclass = first;
6983         else if (PL_regkind[OP(first)] == BOUND ||
6984                  PL_regkind[OP(first)] == NBOUND)
6985             ri->regstclass = first;
6986         else if (PL_regkind[OP(first)] == BOL) {
6987             r->intflags |= (OP(first) == MBOL
6988                            ? PREGf_ANCH_MBOL
6989                            : PREGf_ANCH_SBOL);
6990             first = NEXTOPER(first);
6991             goto again;
6992         }
6993         else if (OP(first) == GPOS) {
6994             r->intflags |= PREGf_ANCH_GPOS;
6995             first = NEXTOPER(first);
6996             goto again;
6997         }
6998         else if ((!sawopen || !RExC_sawback) &&
6999             !sawlookahead &&
7000             (OP(first) == STAR &&
7001             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7002             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7003         {
7004             /* turn .* into ^.* with an implied $*=1 */
7005             const int type =
7006                 (OP(NEXTOPER(first)) == REG_ANY)
7007                     ? PREGf_ANCH_MBOL
7008                     : PREGf_ANCH_SBOL;
7009             r->intflags |= (type | PREGf_IMPLICIT);
7010             first = NEXTOPER(first);
7011             goto again;
7012         }
7013         if (sawplus && !sawminmod && !sawlookahead
7014             && (!sawopen || !RExC_sawback)
7015             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7016             /* x+ must match at the 1st pos of run of x's */
7017             r->intflags |= PREGf_SKIP;
7018
7019         /* Scan is after the zeroth branch, first is atomic matcher. */
7020 #ifdef TRIE_STUDY_OPT
7021         DEBUG_PARSE_r(
7022             if (!restudied)
7023                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7024                               (IV)(first - scan + 1))
7025         );
7026 #else
7027         DEBUG_PARSE_r(
7028             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7029                 (IV)(first - scan + 1))
7030         );
7031 #endif
7032
7033
7034         /*
7035         * If there's something expensive in the r.e., find the
7036         * longest literal string that must appear and make it the
7037         * regmust.  Resolve ties in favor of later strings, since
7038         * the regstart check works with the beginning of the r.e.
7039         * and avoiding duplication strengthens checking.  Not a
7040         * strong reason, but sufficient in the absence of others.
7041         * [Now we resolve ties in favor of the earlier string if
7042         * it happens that c_offset_min has been invalidated, since the
7043         * earlier string may buy us something the later one won't.]
7044         */
7045
7046         data.longest_fixed = newSVpvs("");
7047         data.longest_float = newSVpvs("");
7048         data.last_found = newSVpvs("");
7049         data.longest = &(data.longest_fixed);
7050         ENTER_with_name("study_chunk");
7051         SAVEFREESV(data.longest_fixed);
7052         SAVEFREESV(data.longest_float);
7053         SAVEFREESV(data.last_found);
7054         first = scan;
7055         if (!ri->regstclass) {
7056             ssc_init(pRExC_state, &ch_class);
7057             data.start_class = &ch_class;
7058             stclass_flag = SCF_DO_STCLASS_AND;
7059         } else                          /* XXXX Check for BOUND? */
7060             stclass_flag = 0;
7061         data.last_closep = &last_close;
7062
7063         DEBUG_RExC_seen();
7064         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7065                              scan + RExC_size, /* Up to end */
7066             &data, -1, 0, NULL,
7067             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7068                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7069             0);
7070
7071
7072         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7073
7074
7075         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7076              && data.last_start_min == 0 && data.last_end > 0
7077              && !RExC_seen_zerolen
7078              && !(RExC_seen & REG_VERBARG_SEEN)
7079              && !(RExC_seen & REG_GPOS_SEEN)
7080         ){
7081             r->extflags |= RXf_CHECK_ALL;
7082         }
7083         scan_commit(pRExC_state, &data,&minlen,0);
7084
7085         longest_float_length = CHR_SVLEN(data.longest_float);
7086
7087         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7088                    && data.offset_fixed == data.offset_float_min
7089                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7090             && S_setup_longest (aTHX_ pRExC_state,
7091                                     data.longest_float,
7092                                     &(r->float_utf8),
7093                                     &(r->float_substr),
7094                                     &(r->float_end_shift),
7095                                     data.lookbehind_float,
7096                                     data.offset_float_min,
7097                                     data.minlen_float,
7098                                     longest_float_length,
7099                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
7100                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7101         {
7102             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7103             r->float_max_offset = data.offset_float_max;
7104             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7105                 r->float_max_offset -= data.lookbehind_float;
7106             SvREFCNT_inc_simple_void_NN(data.longest_float);
7107         }
7108         else {
7109             r->float_substr = r->float_utf8 = NULL;
7110             longest_float_length = 0;
7111         }
7112
7113         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7114
7115         if (S_setup_longest (aTHX_ pRExC_state,
7116                                 data.longest_fixed,
7117                                 &(r->anchored_utf8),
7118                                 &(r->anchored_substr),
7119                                 &(r->anchored_end_shift),
7120                                 data.lookbehind_fixed,
7121                                 data.offset_fixed,
7122                                 data.minlen_fixed,
7123                                 longest_fixed_length,
7124                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7125                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7126         {
7127             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7128             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7129         }
7130         else {
7131             r->anchored_substr = r->anchored_utf8 = NULL;
7132             longest_fixed_length = 0;
7133         }
7134         LEAVE_with_name("study_chunk");
7135
7136         if (ri->regstclass
7137             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7138             ri->regstclass = NULL;
7139
7140         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7141             && stclass_flag
7142             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7143             && is_ssc_worth_it(pRExC_state, data.start_class))
7144         {
7145             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7146
7147             ssc_finalize(pRExC_state, data.start_class);
7148
7149             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7150             StructCopy(data.start_class,
7151                        (regnode_ssc*)RExC_rxi->data->data[n],
7152                        regnode_ssc);
7153             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7154             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7155             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7156                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7157                       PerlIO_printf(Perl_debug_log,
7158                                     "synthetic stclass \"%s\".\n",
7159                                     SvPVX_const(sv));});
7160             data.start_class = NULL;
7161         }
7162
7163         /* A temporary algorithm prefers floated substr to fixed one to dig
7164          * more info. */
7165         if (longest_fixed_length > longest_float_length) {
7166             r->substrs->check_ix = 0;
7167             r->check_end_shift = r->anchored_end_shift;
7168             r->check_substr = r->anchored_substr;
7169             r->check_utf8 = r->anchored_utf8;
7170             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7171             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7172                 r->intflags |= PREGf_NOSCAN;
7173         }
7174         else {
7175             r->substrs->check_ix = 1;
7176             r->check_end_shift = r->float_end_shift;
7177             r->check_substr = r->float_substr;
7178             r->check_utf8 = r->float_utf8;
7179             r->check_offset_min = r->float_min_offset;
7180             r->check_offset_max = r->float_max_offset;
7181         }
7182         if ((r->check_substr || r->check_utf8) ) {
7183             r->extflags |= RXf_USE_INTUIT;
7184             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7185                 r->extflags |= RXf_INTUIT_TAIL;
7186         }
7187         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7188
7189         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7190         if ( (STRLEN)minlen < longest_float_length )
7191             minlen= longest_float_length;
7192         if ( (STRLEN)minlen < longest_fixed_length )
7193             minlen= longest_fixed_length;
7194         */
7195     }
7196     else {
7197         /* Several toplevels. Best we can is to set minlen. */
7198         SSize_t fake;
7199         regnode_ssc ch_class;
7200         SSize_t last_close = 0;
7201
7202         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7203
7204         scan = ri->program + 1;
7205         ssc_init(pRExC_state, &ch_class);
7206         data.start_class = &ch_class;
7207         data.last_closep = &last_close;
7208
7209         DEBUG_RExC_seen();
7210         minlen = study_chunk(pRExC_state,
7211             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7212             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7213                                                       ? SCF_TRIE_DOING_RESTUDY
7214                                                       : 0),
7215             0);
7216
7217         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7218
7219         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7220                 = r->float_substr = r->float_utf8 = NULL;
7221
7222         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7223             && is_ssc_worth_it(pRExC_state, data.start_class))
7224         {
7225             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7226
7227             ssc_finalize(pRExC_state, data.start_class);
7228
7229             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7230             StructCopy(data.start_class,
7231                        (regnode_ssc*)RExC_rxi->data->data[n],
7232                        regnode_ssc);
7233             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7234             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7235             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7236                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7237                       PerlIO_printf(Perl_debug_log,
7238                                     "synthetic stclass \"%s\".\n",
7239                                     SvPVX_const(sv));});
7240             data.start_class = NULL;
7241         }
7242     }
7243
7244     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7245         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7246         r->maxlen = REG_INFTY;
7247     }
7248     else {
7249         r->maxlen = RExC_maxlen;
7250     }
7251
7252     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7253        the "real" pattern. */
7254     DEBUG_OPTIMISE_r({
7255         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7256                       (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7257     });
7258     r->minlenret = minlen;
7259     if (r->minlen < minlen)
7260         r->minlen = minlen;
7261
7262     if (RExC_seen & REG_GPOS_SEEN)
7263         r->intflags |= PREGf_GPOS_SEEN;
7264     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7265         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7266                                                 lookbehind */
7267     if (pRExC_state->num_code_blocks)
7268         r->extflags |= RXf_EVAL_SEEN;
7269     if (RExC_seen & REG_CANY_SEEN)
7270         r->intflags |= PREGf_CANY_SEEN;
7271     if (RExC_seen & REG_VERBARG_SEEN)
7272     {
7273         r->intflags |= PREGf_VERBARG_SEEN;
7274         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7275     }
7276     if (RExC_seen & REG_CUTGROUP_SEEN)
7277         r->intflags |= PREGf_CUTGROUP_SEEN;
7278     if (pm_flags & PMf_USE_RE_EVAL)
7279         r->intflags |= PREGf_USE_RE_EVAL;
7280     if (RExC_paren_names)
7281         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7282     else
7283         RXp_PAREN_NAMES(r) = NULL;
7284
7285     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7286      * so it can be used in pp.c */
7287     if (r->intflags & PREGf_ANCH)
7288         r->extflags |= RXf_IS_ANCHORED;
7289
7290
7291     {
7292         /* this is used to identify "special" patterns that might result
7293          * in Perl NOT calling the regex engine and instead doing the match "itself",
7294          * particularly special cases in split//. By having the regex compiler
7295          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7296          * we avoid weird issues with equivalent patterns resulting in different behavior,
7297          * AND we allow non Perl engines to get the same optimizations by the setting the
7298          * flags appropriately - Yves */
7299         regnode *first = ri->program + 1;
7300         U8 fop = OP(first);
7301         regnode *next = NEXTOPER(first);
7302         U8 nop = OP(next);
7303
7304         if (PL_regkind[fop] == NOTHING && nop == END)
7305             r->extflags |= RXf_NULL;
7306         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7307             /* when fop is SBOL first->flags will be true only when it was
7308              * produced by parsing /\A/, and not when parsing /^/. This is
7309              * very important for the split code as there we want to
7310              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7311              * See rt #122761 for more details. -- Yves */
7312             r->extflags |= RXf_START_ONLY;
7313         else if (fop == PLUS
7314                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7315                  && OP(regnext(first)) == END)
7316             r->extflags |= RXf_WHITE;
7317         else if ( r->extflags & RXf_SPLIT
7318                   && (fop == EXACT || fop == EXACTL)
7319                   && STR_LEN(first) == 1
7320                   && *(STRING(first)) == ' '
7321                   && OP(regnext(first)) == END )
7322             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7323
7324     }
7325
7326     if (RExC_contains_locale) {
7327         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7328     }
7329
7330 #ifdef DEBUGGING
7331     if (RExC_paren_names) {
7332         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7333         ri->data->data[ri->name_list_idx]
7334                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7335     } else
7336 #endif
7337         ri->name_list_idx = 0;
7338
7339     if (RExC_recurse_count) {
7340         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7341             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7342             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7343         }
7344     }
7345     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7346     /* assume we don't need to swap parens around before we match */
7347     DEBUG_TEST_r({
7348         PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7349             (unsigned long)RExC_study_chunk_recursed_count);
7350     });
7351     DEBUG_DUMP_r({
7352         DEBUG_RExC_seen();
7353         PerlIO_printf(Perl_debug_log,"Final program:\n");
7354         regdump(r);
7355     });
7356 #ifdef RE_TRACK_PATTERN_OFFSETS
7357     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7358         const STRLEN len = ri->u.offsets[0];
7359         STRLEN i;
7360         GET_RE_DEBUG_FLAGS_DECL;
7361         PerlIO_printf(Perl_debug_log,
7362                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7363         for (i = 1; i <= len; i++) {
7364             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7365                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7366                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7367             }
7368         PerlIO_printf(Perl_debug_log, "\n");
7369     });
7370 #endif
7371
7372 #ifdef USE_ITHREADS
7373     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7374      * by setting the regexp SV to readonly-only instead. If the
7375      * pattern's been recompiled, the USEDness should remain. */
7376     if (old_re && SvREADONLY(old_re))
7377         SvREADONLY_on(rx);
7378 #endif
7379     return rx;
7380 }
7381
7382
7383 SV*
7384 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7385                     const U32 flags)
7386 {
7387     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7388
7389     PERL_UNUSED_ARG(value);
7390
7391     if (flags & RXapif_FETCH) {
7392         return reg_named_buff_fetch(rx, key, flags);
7393     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7394         Perl_croak_no_modify();
7395         return NULL;
7396     } else if (flags & RXapif_EXISTS) {
7397         return reg_named_buff_exists(rx, key, flags)
7398             ? &PL_sv_yes
7399             : &PL_sv_no;
7400     } else if (flags & RXapif_REGNAMES) {
7401         return reg_named_buff_all(rx, flags);
7402     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7403         return reg_named_buff_scalar(rx, flags);
7404     } else {
7405         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7406         return NULL;
7407     }
7408 }
7409
7410 SV*
7411 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7412                          const U32 flags)
7413 {
7414     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7415     PERL_UNUSED_ARG(lastkey);
7416
7417     if (flags & RXapif_FIRSTKEY)
7418         return reg_named_buff_firstkey(rx, flags);
7419     else if (flags & RXapif_NEXTKEY)
7420         return reg_named_buff_nextkey(rx, flags);
7421     else {
7422         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7423                                             (int)flags);
7424         return NULL;
7425     }
7426 }
7427
7428 SV*
7429 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7430                           const U32 flags)
7431 {
7432     AV *retarray = NULL;
7433     SV *ret;
7434     struct regexp *const rx = ReANY(r);
7435
7436     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7437
7438     if (flags & RXapif_ALL)
7439         retarray=newAV();
7440
7441     if (rx && RXp_PAREN_NAMES(rx)) {
7442         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7443         if (he_str) {
7444             IV i;
7445             SV* sv_dat=HeVAL(he_str);
7446             I32 *nums=(I32*)SvPVX(sv_dat);
7447             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7448                 if ((I32)(rx->nparens) >= nums[i]
7449                     && rx->offs[nums[i]].start != -1
7450                     && rx->offs[nums[i]].end != -1)
7451                 {
7452                     ret = newSVpvs("");
7453                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7454                     if (!retarray)
7455                         return ret;
7456                 } else {
7457                     if (retarray)
7458                         ret = newSVsv(&PL_sv_undef);
7459                 }
7460                 if (retarray)
7461                     av_push(retarray, ret);
7462             }
7463             if (retarray)
7464                 return newRV_noinc(MUTABLE_SV(retarray));
7465         }
7466     }
7467     return NULL;
7468 }
7469
7470 bool
7471 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7472                            const U32 flags)
7473 {
7474     struct regexp *const rx = ReANY(r);
7475
7476     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7477
7478     if (rx && RXp_PAREN_NAMES(rx)) {
7479         if (flags & RXapif_ALL) {
7480             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7481         } else {
7482             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7483             if (sv) {
7484                 SvREFCNT_dec_NN(sv);
7485                 return TRUE;
7486             } else {
7487                 return FALSE;
7488             }
7489         }
7490     } else {
7491         return FALSE;
7492     }
7493 }
7494
7495 SV*
7496 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7497 {
7498     struct regexp *const rx = ReANY(r);
7499
7500     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7501
7502     if ( rx && RXp_PAREN_NAMES(rx) ) {
7503         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7504
7505         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7506     } else {
7507         return FALSE;
7508     }
7509 }
7510
7511 SV*
7512 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7513 {
7514     struct regexp *const rx = ReANY(r);
7515     GET_RE_DEBUG_FLAGS_DECL;
7516
7517     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7518
7519     if (rx && RXp_PAREN_NAMES(rx)) {
7520         HV *hv = RXp_PAREN_NAMES(rx);
7521         HE *temphe;
7522         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7523             IV i;
7524             IV parno = 0;
7525             SV* sv_dat = HeVAL(temphe);
7526             I32 *nums = (I32*)SvPVX(sv_dat);
7527             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7528                 if ((I32)(rx->lastparen) >= nums[i] &&
7529                     rx->offs[nums[i]].start != -1 &&
7530                     rx->offs[nums[i]].end != -1)
7531                 {
7532                     parno = nums[i];
7533                     break;
7534                 }
7535             }
7536             if (parno || flags & RXapif_ALL) {
7537                 return newSVhek(HeKEY_hek(temphe));
7538             }
7539         }
7540     }
7541     return NULL;
7542 }
7543
7544 SV*
7545 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7546 {
7547     SV *ret;
7548     AV *av;
7549     SSize_t length;
7550     struct regexp *const rx = ReANY(r);
7551
7552     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7553
7554     if (rx && RXp_PAREN_NAMES(rx)) {
7555         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7556             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7557         } else if (flags & RXapif_ONE) {
7558             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7559             av = MUTABLE_AV(SvRV(ret));
7560             length = av_tindex(av);
7561             SvREFCNT_dec_NN(ret);
7562             return newSViv(length + 1);
7563         } else {
7564             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7565                                                 (int)flags);
7566             return NULL;
7567         }
7568     }
7569     return &PL_sv_undef;
7570 }
7571
7572 SV*
7573 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7574 {
7575     struct regexp *const rx = ReANY(r);
7576     AV *av = newAV();
7577
7578     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7579
7580     if (rx && RXp_PAREN_NAMES(rx)) {
7581         HV *hv= RXp_PAREN_NAMES(rx);
7582         HE *temphe;
7583         (void)hv_iterinit(hv);
7584         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7585             IV i;
7586             IV parno = 0;
7587             SV* sv_dat = HeVAL(temphe);
7588             I32 *nums = (I32*)SvPVX(sv_dat);
7589             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7590                 if ((I32)(rx->lastparen) >= nums[i] &&
7591                     rx->offs[nums[i]].start != -1 &&
7592                     rx->offs[nums[i]].end != -1)
7593                 {
7594                     parno = nums[i];
7595                     break;
7596                 }
7597             }
7598             if (parno || flags & RXapif_ALL) {
7599                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7600             }
7601         }
7602     }
7603
7604     return newRV_noinc(MUTABLE_SV(av));
7605 }
7606
7607 void
7608 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7609                              SV * const sv)
7610 {
7611     struct regexp *const rx = ReANY(r);
7612     char *s = NULL;
7613     SSize_t i = 0;
7614     SSize_t s1, t1;
7615     I32 n = paren;
7616
7617     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7618
7619     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7620            || n == RX_BUFF_IDX_CARET_FULLMATCH
7621            || n == RX_BUFF_IDX_CARET_POSTMATCH
7622        )
7623     {
7624         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7625         if (!keepcopy) {
7626             /* on something like
7627              *    $r = qr/.../;
7628              *    /$qr/p;
7629              * the KEEPCOPY is set on the PMOP rather than the regex */
7630             if (PL_curpm && r == PM_GETRE(PL_curpm))
7631                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7632         }
7633         if (!keepcopy)
7634             goto ret_undef;
7635     }
7636
7637     if (!rx->subbeg)
7638         goto ret_undef;
7639
7640     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7641         /* no need to distinguish between them any more */
7642         n = RX_BUFF_IDX_FULLMATCH;
7643
7644     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7645         && rx->offs[0].start != -1)
7646     {
7647         /* $`, ${^PREMATCH} */
7648         i = rx->offs[0].start;
7649         s = rx->subbeg;
7650     }
7651     else
7652     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7653         && rx->offs[0].end != -1)
7654     {
7655         /* $', ${^POSTMATCH} */
7656         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7657         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7658     }
7659     else
7660     if ( 0 <= n && n <= (I32)rx->nparens &&
7661         (s1 = rx->offs[n].start) != -1 &&
7662         (t1 = rx->offs[n].end) != -1)
7663     {
7664         /* $&, ${^MATCH},  $1 ... */
7665         i = t1 - s1;
7666         s = rx->subbeg + s1 - rx->suboffset;
7667     } else {
7668         goto ret_undef;
7669     }
7670
7671     assert(s >= rx->subbeg);
7672     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7673     if (i >= 0) {
7674 #ifdef NO_TAINT_SUPPORT
7675         sv_setpvn(sv, s, i);
7676 #else
7677         const int oldtainted = TAINT_get;
7678         TAINT_NOT;
7679         sv_setpvn(sv, s, i);
7680         TAINT_set(oldtainted);
7681 #endif
7682         if ( (rx->intflags & PREGf_CANY_SEEN)
7683             ? (RXp_MATCH_UTF8(rx)
7684                         && (!i || is_utf8_string((U8*)s, i)))
7685             : (RXp_MATCH_UTF8(rx)) )
7686         {
7687             SvUTF8_on(sv);
7688         }
7689         else
7690             SvUTF8_off(sv);
7691         if (TAINTING_get) {
7692             if (RXp_MATCH_TAINTED(rx)) {
7693                 if (SvTYPE(sv) >= SVt_PVMG) {
7694                     MAGIC* const mg = SvMAGIC(sv);
7695                     MAGIC* mgt;
7696                     TAINT;
7697                     SvMAGIC_set(sv, mg->mg_moremagic);
7698                     SvTAINT(sv);
7699                     if ((mgt = SvMAGIC(sv))) {
7700                         mg->mg_moremagic = mgt;
7701                         SvMAGIC_set(sv, mg);
7702                     }
7703                 } else {
7704                     TAINT;
7705                     SvTAINT(sv);
7706                 }
7707             } else
7708                 SvTAINTED_off(sv);
7709         }
7710     } else {
7711       ret_undef:
7712         sv_setsv(sv,&PL_sv_undef);
7713         return;
7714     }
7715 }
7716
7717 void
7718 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7719                                                          SV const * const value)
7720 {
7721     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7722
7723     PERL_UNUSED_ARG(rx);
7724     PERL_UNUSED_ARG(paren);
7725     PERL_UNUSED_ARG(value);
7726
7727     if (!PL_localizing)
7728         Perl_croak_no_modify();
7729 }
7730
7731 I32
7732 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7733                               const I32 paren)
7734 {
7735     struct regexp *const rx = ReANY(r);
7736     I32 i;
7737     I32 s1, t1;
7738
7739     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7740
7741     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7742         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7743         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7744     )
7745     {
7746         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7747         if (!keepcopy) {
7748             /* on something like
7749              *    $r = qr/.../;
7750              *    /$qr/p;
7751              * the KEEPCOPY is set on the PMOP rather than the regex */
7752             if (PL_curpm && r == PM_GETRE(PL_curpm))
7753                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7754         }
7755         if (!keepcopy)
7756             goto warn_undef;
7757     }
7758
7759     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7760     switch (paren) {
7761       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7762       case RX_BUFF_IDX_PREMATCH:       /* $` */
7763         if (rx->offs[0].start != -1) {
7764                         i = rx->offs[0].start;
7765                         if (i > 0) {
7766                                 s1 = 0;
7767                                 t1 = i;
7768                                 goto getlen;
7769                         }
7770             }
7771         return 0;
7772
7773       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7774       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7775             if (rx->offs[0].end != -1) {
7776                         i = rx->sublen - rx->offs[0].end;
7777                         if (i > 0) {
7778                                 s1 = rx->offs[0].end;
7779                                 t1 = rx->sublen;
7780                                 goto getlen;
7781                         }
7782             }
7783         return 0;
7784
7785       default: /* $& / ${^MATCH}, $1, $2, ... */
7786             if (paren <= (I32)rx->nparens &&
7787             (s1 = rx->offs[paren].start) != -1 &&
7788             (t1 = rx->offs[paren].end) != -1)
7789             {
7790             i = t1 - s1;
7791             goto getlen;
7792         } else {
7793           warn_undef:
7794             if (ckWARN(WARN_UNINITIALIZED))
7795                 report_uninit((const SV *)sv);
7796             return 0;
7797         }
7798     }
7799   getlen:
7800     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7801         const char * const s = rx->subbeg - rx->suboffset + s1;
7802         const U8 *ep;
7803         STRLEN el;
7804
7805         i = t1 - s1;
7806         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7807                         i = el;
7808     }
7809     return i;
7810 }
7811
7812 SV*
7813 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7814 {
7815     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7816         PERL_UNUSED_ARG(rx);
7817         if (0)
7818             return NULL;
7819         else
7820             return newSVpvs("Regexp");
7821 }
7822
7823 /* Scans the name of a named buffer from the pattern.
7824  * If flags is REG_RSN_RETURN_NULL returns null.
7825  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7826  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7827  * to the parsed name as looked up in the RExC_paren_names hash.
7828  * If there is an error throws a vFAIL().. type exception.
7829  */
7830
7831 #define REG_RSN_RETURN_NULL    0
7832 #define REG_RSN_RETURN_NAME    1
7833 #define REG_RSN_RETURN_DATA    2
7834
7835 STATIC SV*
7836 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7837 {
7838     char *name_start = RExC_parse;
7839
7840     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7841
7842     assert (RExC_parse <= RExC_end);
7843     if (RExC_parse == RExC_end) NOOP;
7844     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7845          /* skip IDFIRST by using do...while */
7846         if (UTF)
7847             do {
7848                 RExC_parse += UTF8SKIP(RExC_parse);
7849             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7850         else
7851             do {
7852                 RExC_parse++;
7853             } while (isWORDCHAR(*RExC_parse));
7854     } else {
7855         RExC_parse++; /* so the <- from the vFAIL is after the offending
7856                          character */
7857         vFAIL("Group name must start with a non-digit word character");
7858     }
7859     if ( flags ) {
7860         SV* sv_name
7861             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7862                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7863         if ( flags == REG_RSN_RETURN_NAME)
7864             return sv_name;
7865         else if (flags==REG_RSN_RETURN_DATA) {
7866             HE *he_str = NULL;
7867             SV *sv_dat = NULL;
7868             if ( ! sv_name )      /* should not happen*/
7869                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7870             if (RExC_paren_names)
7871                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7872             if ( he_str )
7873                 sv_dat = HeVAL(he_str);
7874             if ( ! sv_dat )
7875                 vFAIL("Reference to nonexistent named group");
7876             return sv_dat;
7877         }
7878         else {
7879             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7880                        (unsigned long) flags);
7881         }
7882         NOT_REACHED; /* NOT REACHED */
7883     }
7884     return NULL;
7885 }
7886
7887 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7888     int num;                                                    \
7889     if (RExC_lastparse!=RExC_parse) {                           \
7890         PerlIO_printf(Perl_debug_log, "%s",                     \
7891             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
7892                 RExC_end - RExC_parse, 16,                      \
7893                 "", "",                                         \
7894                 PERL_PV_ESCAPE_UNI_DETECT |                     \
7895                 PERL_PV_PRETTY_ELLIPSES   |                     \
7896                 PERL_PV_PRETTY_LTGT       |                     \
7897                 PERL_PV_ESCAPE_RE         |                     \
7898                 PERL_PV_PRETTY_EXACTSIZE                        \
7899             )                                                   \
7900         );                                                      \
7901     } else                                                      \
7902         PerlIO_printf(Perl_debug_log,"%16s","");                \
7903                                                                 \
7904     if (SIZE_ONLY)                                              \
7905        num = RExC_size + 1;                                     \
7906     else                                                        \
7907        num=REG_NODE_NUM(RExC_emit);                             \
7908     if (RExC_lastnum!=num)                                      \
7909        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7910     else                                                        \
7911        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7912     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7913         (int)((depth*2)), "",                                   \
7914         (funcname)                                              \
7915     );                                                          \
7916     RExC_lastnum=num;                                           \
7917     RExC_lastparse=RExC_parse;                                  \
7918 })
7919
7920
7921
7922 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7923     DEBUG_PARSE_MSG((funcname));                            \
7924     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7925 })
7926 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7927     DEBUG_PARSE_MSG((funcname));                            \
7928     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7929 })
7930
7931 /* This section of code defines the inversion list object and its methods.  The
7932  * interfaces are highly subject to change, so as much as possible is static to
7933  * this file.  An inversion list is here implemented as a malloc'd C UV array
7934  * as an SVt_INVLIST scalar.
7935  *
7936  * An inversion list for Unicode is an array of code points, sorted by ordinal
7937  * number.  The zeroth element is the first code point in the list.  The 1th
7938  * element is the first element beyond that not in the list.  In other words,
7939  * the first range is
7940  *  invlist[0]..(invlist[1]-1)
7941  * The other ranges follow.  Thus every element whose index is divisible by two
7942  * marks the beginning of a range that is in the list, and every element not
7943  * divisible by two marks the beginning of a range not in the list.  A single
7944  * element inversion list that contains the single code point N generally
7945  * consists of two elements
7946  *  invlist[0] == N
7947  *  invlist[1] == N+1
7948  * (The exception is when N is the highest representable value on the
7949  * machine, in which case the list containing just it would be a single
7950  * element, itself.  By extension, if the last range in the list extends to
7951  * infinity, then the first element of that range will be in the inversion list
7952  * at a position that is divisible by two, and is the final element in the
7953  * list.)
7954  * Taking the complement (inverting) an inversion list is quite simple, if the
7955  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7956  * This implementation reserves an element at the beginning of each inversion
7957  * list to always contain 0; there is an additional flag in the header which
7958  * indicates if the list begins at the 0, or is offset to begin at the next
7959  * element.
7960  *
7961  * More about inversion lists can be found in "Unicode Demystified"
7962  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7963  * More will be coming when functionality is added later.
7964  *
7965  * The inversion list data structure is currently implemented as an SV pointing
7966  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7967  * array of UV whose memory management is automatically handled by the existing
7968  * facilities for SV's.
7969  *
7970  * Some of the methods should always be private to the implementation, and some
7971  * should eventually be made public */
7972
7973 /* The header definitions are in F<inline_invlist.c> */
7974
7975 PERL_STATIC_INLINE UV*
7976 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7977 {
7978     /* Returns a pointer to the first element in the inversion list's array.
7979      * This is called upon initialization of an inversion list.  Where the
7980      * array begins depends on whether the list has the code point U+0000 in it
7981      * or not.  The other parameter tells it whether the code that follows this
7982      * call is about to put a 0 in the inversion list or not.  The first
7983      * element is either the element reserved for 0, if TRUE, or the element
7984      * after it, if FALSE */
7985
7986     bool* offset = get_invlist_offset_addr(invlist);
7987     UV* zero_addr = (UV *) SvPVX(invlist);
7988
7989     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7990
7991     /* Must be empty */
7992     assert(! _invlist_len(invlist));
7993
7994     *zero_addr = 0;
7995
7996     /* 1^1 = 0; 1^0 = 1 */
7997     *offset = 1 ^ will_have_0;
7998     return zero_addr + *offset;
7999 }
8000
8001 PERL_STATIC_INLINE UV*
8002 S_invlist_array(SV* const invlist)
8003 {
8004     /* Returns the pointer to the inversion list's array.  Every time the
8005      * length changes, this needs to be called in case malloc or realloc moved
8006      * it */
8007
8008     PERL_ARGS_ASSERT_INVLIST_ARRAY;
8009
8010     /* Must not be empty.  If these fail, you probably didn't check for <len>
8011      * being non-zero before trying to get the array */
8012     assert(_invlist_len(invlist));
8013
8014     /* The very first element always contains zero, The array begins either
8015      * there, or if the inversion list is offset, at the element after it.
8016      * The offset header field determines which; it contains 0 or 1 to indicate
8017      * how much additionally to add */
8018     assert(0 == *(SvPVX(invlist)));
8019     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8020 }
8021
8022 PERL_STATIC_INLINE void
8023 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8024 {
8025     /* Sets the current number of elements stored in the inversion list.
8026      * Updates SvCUR correspondingly */
8027     PERL_UNUSED_CONTEXT;
8028     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8029
8030     assert(SvTYPE(invlist) == SVt_INVLIST);
8031
8032     SvCUR_set(invlist,
8033               (len == 0)
8034                ? 0
8035                : TO_INTERNAL_SIZE(len + offset));
8036     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8037 }
8038
8039 #ifndef PERL_IN_XSUB_RE
8040
8041 PERL_STATIC_INLINE IV*
8042 S_get_invlist_previous_index_addr(SV* invlist)
8043 {
8044     /* Return the address of the IV that is reserved to hold the cached index
8045      * */
8046     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8047
8048     assert(SvTYPE(invlist) == SVt_INVLIST);
8049
8050     return &(((XINVLIST*) SvANY(invlist))->prev_index);
8051 }
8052
8053 PERL_STATIC_INLINE IV
8054 S_invlist_previous_index(SV* const invlist)
8055 {
8056     /* Returns cached index of previous search */
8057
8058     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8059
8060     return *get_invlist_previous_index_addr(invlist);
8061 }
8062
8063 PERL_STATIC_INLINE void
8064 S_invlist_set_previous_index(SV* const invlist, const IV index)
8065 {
8066     /* Caches <index> for later retrieval */
8067
8068     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8069
8070     assert(index == 0 || index < (int) _invlist_len(invlist));
8071
8072     *get_invlist_previous_index_addr(invlist) = index;
8073 }
8074
8075 PERL_STATIC_INLINE void
8076 S_invlist_trim(SV* const invlist)
8077 {
8078     PERL_ARGS_ASSERT_INVLIST_TRIM;
8079
8080     assert(SvTYPE(invlist) == SVt_INVLIST);
8081
8082     /* Change the length of the inversion list to how many entries it currently
8083      * has */
8084     SvPV_shrink_to_cur((SV *) invlist);
8085 }
8086
8087 PERL_STATIC_INLINE bool
8088 S_invlist_is_iterating(SV* const invlist)
8089 {
8090     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8091
8092     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8093 }
8094
8095 #endif /* ifndef PERL_IN_XSUB_RE */
8096
8097 PERL_STATIC_INLINE UV
8098 S_invlist_max(SV* const invlist)
8099 {
8100     /* Returns the maximum number of elements storable in the inversion list's
8101      * array, without having to realloc() */
8102
8103     PERL_ARGS_ASSERT_INVLIST_MAX;
8104
8105     assert(SvTYPE(invlist) == SVt_INVLIST);
8106
8107     /* Assumes worst case, in which the 0 element is not counted in the
8108      * inversion list, so subtracts 1 for that */
8109     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8110            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8111            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8112 }
8113
8114 #ifndef PERL_IN_XSUB_RE
8115 SV*
8116 Perl__new_invlist(pTHX_ IV initial_size)
8117 {
8118
8119     /* Return a pointer to a newly constructed inversion list, with enough
8120      * space to store 'initial_size' elements.  If that number is negative, a
8121      * system default is used instead */
8122
8123     SV* new_list;
8124
8125     if (initial_size < 0) {
8126         initial_size = 10;
8127     }
8128
8129     /* Allocate the initial space */
8130     new_list = newSV_type(SVt_INVLIST);
8131
8132     /* First 1 is in case the zero element isn't in the list; second 1 is for
8133      * trailing NUL */
8134     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8135     invlist_set_len(new_list, 0, 0);
8136
8137     /* Force iterinit() to be used to get iteration to work */
8138     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8139
8140     *get_invlist_previous_index_addr(new_list) = 0;
8141
8142     return new_list;
8143 }
8144
8145 SV*
8146 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8147 {
8148     /* Return a pointer to a newly constructed inversion list, initialized to
8149      * point to <list>, which has to be in the exact correct inversion list
8150      * form, including internal fields.  Thus this is a dangerous routine that
8151      * should not be used in the wrong hands.  The passed in 'list' contains
8152      * several header fields at the beginning that are not part of the
8153      * inversion list body proper */
8154
8155     const STRLEN length = (STRLEN) list[0];
8156     const UV version_id =          list[1];
8157     const bool offset   =    cBOOL(list[2]);
8158 #define HEADER_LENGTH 3
8159     /* If any of the above changes in any way, you must change HEADER_LENGTH
8160      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8161      *      perl -E 'say int(rand 2**31-1)'
8162      */
8163 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8164                                         data structure type, so that one being
8165                                         passed in can be validated to be an
8166                                         inversion list of the correct vintage.
8167                                        */
8168
8169     SV* invlist = newSV_type(SVt_INVLIST);
8170
8171     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8172
8173     if (version_id != INVLIST_VERSION_ID) {
8174         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8175     }
8176
8177     /* The generated array passed in includes header elements that aren't part
8178      * of the list proper, so start it just after them */
8179     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8180
8181     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8182                                shouldn't touch it */
8183
8184     *(get_invlist_offset_addr(invlist)) = offset;
8185
8186     /* The 'length' passed to us is the physical number of elements in the
8187      * inversion list.  But if there is an offset the logical number is one
8188      * less than that */
8189     invlist_set_len(invlist, length  - offset, offset);
8190
8191     invlist_set_previous_index(invlist, 0);
8192
8193     /* Initialize the iteration pointer. */
8194     invlist_iterfinish(invlist);
8195
8196     SvREADONLY_on(invlist);
8197
8198     return invlist;
8199 }
8200 #endif /* ifndef PERL_IN_XSUB_RE */
8201
8202 STATIC void
8203 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8204 {
8205     /* Grow the maximum size of an inversion list */
8206
8207     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8208
8209     assert(SvTYPE(invlist) == SVt_INVLIST);
8210
8211     /* Add one to account for the zero element at the beginning which may not
8212      * be counted by the calling parameters */
8213     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8214 }
8215
8216 STATIC void
8217 S__append_range_to_invlist(pTHX_ SV* const invlist,
8218                                  const UV start, const UV end)
8219 {
8220    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8221     * the end of the inversion list.  The range must be above any existing
8222     * ones. */
8223
8224     UV* array;
8225     UV max = invlist_max(invlist);
8226     UV len = _invlist_len(invlist);
8227     bool offset;
8228
8229     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8230
8231     if (len == 0) { /* Empty lists must be initialized */
8232         offset = start != 0;
8233         array = _invlist_array_init(invlist, ! offset);
8234     }
8235     else {
8236         /* Here, the existing list is non-empty. The current max entry in the
8237          * list is generally the first value not in the set, except when the
8238          * set extends to the end of permissible values, in which case it is
8239          * the first entry in that final set, and so this call is an attempt to
8240          * append out-of-order */
8241
8242         UV final_element = len - 1;
8243         array = invlist_array(invlist);
8244         if (array[final_element] > start
8245             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8246         {
8247             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8248                      array[final_element], start,
8249                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8250         }
8251
8252         /* Here, it is a legal append.  If the new range begins with the first
8253          * value not in the set, it is extending the set, so the new first
8254          * value not in the set is one greater than the newly extended range.
8255          * */
8256         offset = *get_invlist_offset_addr(invlist);
8257         if (array[final_element] == start) {
8258             if (end != UV_MAX) {
8259                 array[final_element] = end + 1;
8260             }
8261             else {
8262                 /* But if the end is the maximum representable on the machine,
8263                  * just let the range that this would extend to have no end */
8264                 invlist_set_len(invlist, len - 1, offset);
8265             }
8266             return;
8267         }
8268     }
8269
8270     /* Here the new range doesn't extend any existing set.  Add it */
8271
8272     len += 2;   /* Includes an element each for the start and end of range */
8273
8274     /* If wll overflow the existing space, extend, which may cause the array to
8275      * be moved */
8276     if (max < len) {
8277         invlist_extend(invlist, len);
8278
8279         /* Have to set len here to avoid assert failure in invlist_array() */
8280         invlist_set_len(invlist, len, offset);
8281
8282         array = invlist_array(invlist);
8283     }
8284     else {
8285         invlist_set_len(invlist, len, offset);
8286     }
8287
8288     /* The next item on the list starts the range, the one after that is
8289      * one past the new range.  */
8290     array[len - 2] = start;
8291     if (end != UV_MAX) {
8292         array[len - 1] = end + 1;
8293     }
8294     else {
8295         /* But if the end is the maximum representable on the machine, just let
8296          * the range have no end */
8297         invlist_set_len(invlist, len - 1, offset);
8298     }
8299 }
8300
8301 #ifndef PERL_IN_XSUB_RE
8302
8303 IV
8304 Perl__invlist_search(SV* const invlist, const UV cp)
8305 {
8306     /* Searches the inversion list for the entry that contains the input code
8307      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8308      * return value is the index into the list's array of the range that
8309      * contains <cp> */
8310
8311     IV low = 0;
8312     IV mid;
8313     IV high = _invlist_len(invlist);
8314     const IV highest_element = high - 1;
8315     const UV* array;
8316
8317     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8318
8319     /* If list is empty, return failure. */
8320     if (high == 0) {
8321         return -1;
8322     }
8323
8324     /* (We can't get the array unless we know the list is non-empty) */
8325     array = invlist_array(invlist);
8326
8327     mid = invlist_previous_index(invlist);
8328     assert(mid >=0 && mid <= highest_element);
8329
8330     /* <mid> contains the cache of the result of the previous call to this
8331      * function (0 the first time).  See if this call is for the same result,
8332      * or if it is for mid-1.  This is under the theory that calls to this
8333      * function will often be for related code points that are near each other.
8334      * And benchmarks show that caching gives better results.  We also test
8335      * here if the code point is within the bounds of the list.  These tests
8336      * replace others that would have had to be made anyway to make sure that
8337      * the array bounds were not exceeded, and these give us extra information
8338      * at the same time */
8339     if (cp >= array[mid]) {
8340         if (cp >= array[highest_element]) {
8341             return highest_element;
8342         }
8343
8344         /* Here, array[mid] <= cp < array[highest_element].  This means that
8345          * the final element is not the answer, so can exclude it; it also
8346          * means that <mid> is not the final element, so can refer to 'mid + 1'
8347          * safely */
8348         if (cp < array[mid + 1]) {
8349             return mid;
8350         }
8351         high--;
8352         low = mid + 1;
8353     }
8354     else { /* cp < aray[mid] */
8355         if (cp < array[0]) { /* Fail if outside the array */
8356             return -1;
8357         }
8358         high = mid;
8359         if (cp >= array[mid - 1]) {
8360             goto found_entry;
8361         }
8362     }
8363
8364     /* Binary search.  What we are looking for is <i> such that
8365      *  array[i] <= cp < array[i+1]
8366      * The loop below converges on the i+1.  Note that there may not be an
8367      * (i+1)th element in the array, and things work nonetheless */
8368     while (low < high) {
8369         mid = (low + high) / 2;
8370         assert(mid <= highest_element);
8371         if (array[mid] <= cp) { /* cp >= array[mid] */
8372             low = mid + 1;
8373
8374             /* We could do this extra test to exit the loop early.
8375             if (cp < array[low]) {
8376                 return mid;
8377             }
8378             */
8379         }
8380         else { /* cp < array[mid] */
8381             high = mid;
8382         }
8383     }
8384
8385   found_entry:
8386     high--;
8387     invlist_set_previous_index(invlist, high);
8388     return high;
8389 }
8390
8391 void
8392 Perl__invlist_populate_swatch(SV* const invlist,
8393                               const UV start, const UV end, U8* swatch)
8394 {
8395     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8396      * but is used when the swash has an inversion list.  This makes this much
8397      * faster, as it uses a binary search instead of a linear one.  This is
8398      * intimately tied to that function, and perhaps should be in utf8.c,
8399      * except it is intimately tied to inversion lists as well.  It assumes
8400      * that <swatch> is all 0's on input */
8401
8402     UV current = start;
8403     const IV len = _invlist_len(invlist);
8404     IV i;
8405     const UV * array;
8406
8407     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8408
8409     if (len == 0) { /* Empty inversion list */
8410         return;
8411     }
8412
8413     array = invlist_array(invlist);
8414
8415     /* Find which element it is */
8416     i = _invlist_search(invlist, start);
8417
8418     /* We populate from <start> to <end> */
8419     while (current < end) {
8420         UV upper;
8421
8422         /* The inversion list gives the results for every possible code point
8423          * after the first one in the list.  Only those ranges whose index is
8424          * even are ones that the inversion list matches.  For the odd ones,
8425          * and if the initial code point is not in the list, we have to skip
8426          * forward to the next element */
8427         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8428             i++;
8429             if (i >= len) { /* Finished if beyond the end of the array */
8430                 return;
8431             }
8432             current = array[i];
8433             if (current >= end) {   /* Finished if beyond the end of what we
8434                                        are populating */
8435                 if (LIKELY(end < UV_MAX)) {
8436                     return;
8437                 }
8438
8439                 /* We get here when the upper bound is the maximum
8440                  * representable on the machine, and we are looking for just
8441                  * that code point.  Have to special case it */
8442                 i = len;
8443                 goto join_end_of_list;
8444             }
8445         }
8446         assert(current >= start);
8447
8448         /* The current range ends one below the next one, except don't go past
8449          * <end> */
8450         i++;
8451         upper = (i < len && array[i] < end) ? array[i] : end;
8452
8453         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8454          * for each code point in it */
8455         for (; current < upper; current++) {
8456             const STRLEN offset = (STRLEN)(current - start);
8457             swatch[offset >> 3] |= 1 << (offset & 7);
8458         }
8459
8460     join_end_of_list:
8461
8462         /* Quit if at the end of the list */
8463         if (i >= len) {
8464
8465             /* But first, have to deal with the highest possible code point on
8466              * the platform.  The previous code assumes that <end> is one
8467              * beyond where we want to populate, but that is impossible at the
8468              * platform's infinity, so have to handle it specially */
8469             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8470             {
8471                 const STRLEN offset = (STRLEN)(end - start);
8472                 swatch[offset >> 3] |= 1 << (offset & 7);
8473             }
8474             return;
8475         }
8476
8477         /* Advance to the next range, which will be for code points not in the
8478          * inversion list */
8479         current = array[i];
8480     }
8481
8482     return;
8483 }
8484
8485 void
8486 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8487                                          const bool complement_b, SV** output)
8488 {
8489     /* Take the union of two inversion lists and point <output> to it.  *output
8490      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8491      * the reference count to that list will be decremented if not already a
8492      * temporary (mortal); otherwise *output will be made correspondingly
8493      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8494      * second list is returned.  If <complement_b> is TRUE, the union is taken
8495      * of the complement (inversion) of <b> instead of b itself.
8496      *
8497      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8498      * Richard Gillam, published by Addison-Wesley, and explained at some
8499      * length there.  The preface says to incorporate its examples into your
8500      * code at your own risk.
8501      *
8502      * The algorithm is like a merge sort.
8503      *
8504      * XXX A potential performance improvement is to keep track as we go along
8505      * if only one of the inputs contributes to the result, meaning the other
8506      * is a subset of that one.  In that case, we can skip the final copy and
8507      * return the larger of the input lists, but then outside code might need
8508      * to keep track of whether to free the input list or not */
8509
8510     const UV* array_a;    /* a's array */
8511     const UV* array_b;
8512     UV len_a;       /* length of a's array */
8513     UV len_b;
8514
8515     SV* u;                      /* the resulting union */
8516     UV* array_u;
8517     UV len_u;
8518
8519     UV i_a = 0;             /* current index into a's array */
8520     UV i_b = 0;
8521     UV i_u = 0;
8522
8523     /* running count, as explained in the algorithm source book; items are
8524      * stopped accumulating and are output when the count changes to/from 0.
8525      * The count is incremented when we start a range that's in the set, and
8526      * decremented when we start a range that's not in the set.  So its range
8527      * is 0 to 2.  Only when the count is zero is something not in the set.
8528      */
8529     UV count = 0;
8530
8531     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8532     assert(a != b);
8533
8534     /* If either one is empty, the union is the other one */
8535     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8536         bool make_temp = FALSE; /* Should we mortalize the result? */
8537
8538         if (*output == a) {
8539             if (a != NULL) {
8540                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8541                     SvREFCNT_dec_NN(a);
8542                 }
8543             }
8544         }
8545         if (*output != b) {
8546             *output = invlist_clone(b);
8547             if (complement_b) {
8548                 _invlist_invert(*output);
8549             }
8550         } /* else *output already = b; */
8551
8552         if (make_temp) {
8553             sv_2mortal(*output);
8554         }
8555         return;
8556     }
8557     else if ((len_b = _invlist_len(b)) == 0) {
8558         bool make_temp = FALSE;
8559         if (*output == b) {
8560             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8561                 SvREFCNT_dec_NN(b);
8562             }
8563         }
8564
8565         /* The complement of an empty list is a list that has everything in it,
8566          * so the union with <a> includes everything too */
8567         if (complement_b) {
8568             if (a == *output) {
8569                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8570                     SvREFCNT_dec_NN(a);
8571                 }
8572             }
8573             *output = _new_invlist(1);
8574             _append_range_to_invlist(*output, 0, UV_MAX);
8575         }
8576         else if (*output != a) {
8577             *output = invlist_clone(a);
8578         }
8579         /* else *output already = a; */
8580
8581         if (make_temp) {
8582             sv_2mortal(*output);
8583         }
8584         return;
8585     }
8586
8587     /* Here both lists exist and are non-empty */
8588     array_a = invlist_array(a);
8589     array_b = invlist_array(b);
8590
8591     /* If are to take the union of 'a' with the complement of b, set it
8592      * up so are looking at b's complement. */
8593     if (complement_b) {
8594
8595         /* To complement, we invert: if the first element is 0, remove it.  To
8596          * do this, we just pretend the array starts one later */
8597         if (array_b[0] == 0) {
8598             array_b++;
8599             len_b--;
8600         }
8601         else {
8602
8603             /* But if the first element is not zero, we pretend the list starts
8604              * at the 0 that is always stored immediately before the array. */
8605             array_b--;
8606             len_b++;
8607         }
8608     }
8609
8610     /* Size the union for the worst case: that the sets are completely
8611      * disjoint */
8612     u = _new_invlist(len_a + len_b);
8613
8614     /* Will contain U+0000 if either component does */
8615     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8616                                       || (len_b > 0 && array_b[0] == 0));
8617
8618     /* Go through each list item by item, stopping when exhausted one of
8619      * them */
8620     while (i_a < len_a && i_b < len_b) {
8621         UV cp;      /* The element to potentially add to the union's array */
8622         bool cp_in_set;   /* is it in the the input list's set or not */
8623
8624         /* We need to take one or the other of the two inputs for the union.
8625          * Since we are merging two sorted lists, we take the smaller of the
8626          * next items.  In case of a tie, we take the one that is in its set
8627          * first.  If we took one not in the set first, it would decrement the
8628          * count, possibly to 0 which would cause it to be output as ending the
8629          * range, and the next time through we would take the same number, and
8630          * output it again as beginning the next range.  By doing it the
8631          * opposite way, there is no possibility that the count will be
8632          * momentarily decremented to 0, and thus the two adjoining ranges will
8633          * be seamlessly merged.  (In a tie and both are in the set or both not
8634          * in the set, it doesn't matter which we take first.) */
8635         if (array_a[i_a] < array_b[i_b]
8636             || (array_a[i_a] == array_b[i_b]
8637                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8638         {
8639             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8640             cp= array_a[i_a++];
8641         }
8642         else {
8643             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8644             cp = array_b[i_b++];
8645         }
8646
8647         /* Here, have chosen which of the two inputs to look at.  Only output
8648          * if the running count changes to/from 0, which marks the
8649          * beginning/end of a range in that's in the set */
8650         if (cp_in_set) {
8651             if (count == 0) {
8652                 array_u[i_u++] = cp;
8653             }
8654             count++;
8655         }
8656         else {
8657             count--;
8658             if (count == 0) {
8659                 array_u[i_u++] = cp;
8660             }
8661         }
8662     }
8663
8664     /* Here, we are finished going through at least one of the lists, which
8665      * means there is something remaining in at most one.  We check if the list
8666      * that hasn't been exhausted is positioned such that we are in the middle
8667      * of a range in its set or not.  (i_a and i_b point to the element beyond
8668      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8669      * is potentially more to output.
8670      * There are four cases:
8671      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8672      *     in the union is entirely from the non-exhausted set.
8673      *  2) Both were in their sets, count is 2.  Nothing further should
8674      *     be output, as everything that remains will be in the exhausted
8675      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8676      *     that
8677      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8678      *     Nothing further should be output because the union includes
8679      *     everything from the exhausted set.  Not decrementing ensures that.
8680      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8681      *     decrementing to 0 insures that we look at the remainder of the
8682      *     non-exhausted set */
8683     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8684         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8685     {
8686         count--;
8687     }
8688
8689     /* The final length is what we've output so far, plus what else is about to
8690      * be output.  (If 'count' is non-zero, then the input list we exhausted
8691      * has everything remaining up to the machine's limit in its set, and hence
8692      * in the union, so there will be no further output. */
8693     len_u = i_u;
8694     if (count == 0) {
8695         /* At most one of the subexpressions will be non-zero */
8696         len_u += (len_a - i_a) + (len_b - i_b);
8697     }
8698
8699     /* Set result to final length, which can change the pointer to array_u, so
8700      * re-find it */
8701     if (len_u != _invlist_len(u)) {
8702         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8703         invlist_trim(u);
8704         array_u = invlist_array(u);
8705     }
8706
8707     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8708      * the other) ended with everything above it not in its set.  That means
8709      * that the remaining part of the union is precisely the same as the
8710      * non-exhausted list, so can just copy it unchanged.  (If both list were
8711      * exhausted at the same time, then the operations below will be both 0.)
8712      */
8713     if (count == 0) {
8714         IV copy_count; /* At most one will have a non-zero copy count */
8715         if ((copy_count = len_a - i_a) > 0) {
8716             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8717         }
8718         else if ((copy_count = len_b - i_b) > 0) {
8719             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8720         }
8721     }
8722
8723     /*  We may be removing a reference to one of the inputs.  If so, the output
8724      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8725      *  count decremented) */
8726     if (a == *output || b == *output) {
8727         assert(! invlist_is_iterating(*output));
8728         if ((SvTEMP(*output))) {
8729             sv_2mortal(u);
8730         }
8731         else {
8732             SvREFCNT_dec_NN(*output);
8733         }
8734     }
8735
8736     *output = u;
8737
8738     return;
8739 }
8740
8741 void
8742 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8743                                                const bool complement_b, SV** i)
8744 {
8745     /* Take the intersection of two inversion lists and point <i> to it.  *i
8746      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8747      * the reference count to that list will be decremented if not already a
8748      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8749      * The first list, <a>, may be NULL, in which case an empty list is
8750      * returned.  If <complement_b> is TRUE, the result will be the
8751      * intersection of <a> and the complement (or inversion) of <b> instead of
8752      * <b> directly.
8753      *
8754      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8755      * Richard Gillam, published by Addison-Wesley, and explained at some
8756      * length there.  The preface says to incorporate its examples into your
8757      * code at your own risk.  In fact, it had bugs
8758      *
8759      * The algorithm is like a merge sort, and is essentially the same as the
8760      * union above
8761      */
8762
8763     const UV* array_a;          /* a's array */
8764     const UV* array_b;
8765     UV len_a;   /* length of a's array */
8766     UV len_b;
8767
8768     SV* r;                   /* the resulting intersection */
8769     UV* array_r;
8770     UV len_r;
8771
8772     UV i_a = 0;             /* current index into a's array */
8773     UV i_b = 0;
8774     UV i_r = 0;
8775
8776     /* running count, as explained in the algorithm source book; items are
8777      * stopped accumulating and are output when the count changes to/from 2.
8778      * The count is incremented when we start a range that's in the set, and
8779      * decremented when we start a range that's not in the set.  So its range
8780      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8781      */
8782     UV count = 0;
8783
8784     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8785     assert(a != b);
8786
8787     /* Special case if either one is empty */
8788     len_a = (a == NULL) ? 0 : _invlist_len(a);
8789     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8790         bool make_temp = FALSE;
8791
8792         if (len_a != 0 && complement_b) {
8793
8794             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8795              * be empty.  Here, also we are using 'b's complement, which hence
8796              * must be every possible code point.  Thus the intersection is
8797              * simply 'a'. */
8798             if (*i != a) {
8799                 if (*i == b) {
8800                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8801                         SvREFCNT_dec_NN(b);
8802                     }
8803                 }
8804
8805                 *i = invlist_clone(a);
8806             }
8807             /* else *i is already 'a' */
8808
8809             if (make_temp) {
8810                 sv_2mortal(*i);
8811             }
8812             return;
8813         }
8814
8815         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8816          * intersection must be empty */
8817         if (*i == a) {
8818             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8819                 SvREFCNT_dec_NN(a);
8820             }
8821         }
8822         else if (*i == b) {
8823             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8824                 SvREFCNT_dec_NN(b);
8825             }
8826         }
8827         *i = _new_invlist(0);
8828         if (make_temp) {
8829             sv_2mortal(*i);
8830         }
8831
8832         return;
8833     }
8834
8835     /* Here both lists exist and are non-empty */
8836     array_a = invlist_array(a);
8837     array_b = invlist_array(b);
8838
8839     /* If are to take the intersection of 'a' with the complement of b, set it
8840      * up so are looking at b's complement. */
8841     if (complement_b) {
8842
8843         /* To complement, we invert: if the first element is 0, remove it.  To
8844          * do this, we just pretend the array starts one later */
8845         if (array_b[0] == 0) {
8846             array_b++;
8847             len_b--;
8848         }
8849         else {
8850
8851             /* But if the first element is not zero, we pretend the list starts
8852              * at the 0 that is always stored immediately before the array. */
8853             array_b--;
8854             len_b++;
8855         }
8856     }
8857
8858     /* Size the intersection for the worst case: that the intersection ends up
8859      * fragmenting everything to be completely disjoint */
8860     r= _new_invlist(len_a + len_b);
8861
8862     /* Will contain U+0000 iff both components do */
8863     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8864                                      && len_b > 0 && array_b[0] == 0);
8865
8866     /* Go through each list item by item, stopping when exhausted one of
8867      * them */
8868     while (i_a < len_a && i_b < len_b) {
8869         UV cp;      /* The element to potentially add to the intersection's
8870                        array */
8871         bool cp_in_set; /* Is it in the input list's set or not */
8872
8873         /* We need to take one or the other of the two inputs for the
8874          * intersection.  Since we are merging two sorted lists, we take the
8875          * smaller of the next items.  In case of a tie, we take the one that
8876          * is not in its set first (a difference from the union algorithm).  If
8877          * we took one in the set first, it would increment the count, possibly
8878          * to 2 which would cause it to be output as starting a range in the
8879          * intersection, and the next time through we would take that same
8880          * number, and output it again as ending the set.  By doing it the
8881          * opposite of this, there is no possibility that the count will be
8882          * momentarily incremented to 2.  (In a tie and both are in the set or
8883          * both not in the set, it doesn't matter which we take first.) */
8884         if (array_a[i_a] < array_b[i_b]
8885             || (array_a[i_a] == array_b[i_b]
8886                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8887         {
8888             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8889             cp= array_a[i_a++];
8890         }
8891         else {
8892             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8893             cp= array_b[i_b++];
8894         }
8895
8896         /* Here, have chosen which of the two inputs to look at.  Only output
8897          * if the running count changes to/from 2, which marks the
8898          * beginning/end of a range that's in the intersection */
8899         if (cp_in_set) {
8900             count++;
8901             if (count == 2) {
8902                 array_r[i_r++] = cp;
8903             }
8904         }
8905         else {
8906             if (count == 2) {
8907                 array_r[i_r++] = cp;
8908             }
8909             count--;
8910         }
8911     }
8912
8913     /* Here, we are finished going through at least one of the lists, which
8914      * means there is something remaining in at most one.  We check if the list
8915      * that has been exhausted is positioned such that we are in the middle
8916      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8917      * the ones we care about.)  There are four cases:
8918      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8919      *     nothing left in the intersection.
8920      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8921      *     above 2.  What should be output is exactly that which is in the
8922      *     non-exhausted set, as everything it has is also in the intersection
8923      *     set, and everything it doesn't have can't be in the intersection
8924      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8925      *     gets incremented to 2.  Like the previous case, the intersection is
8926      *     everything that remains in the non-exhausted set.
8927      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8928      *     remains 1.  And the intersection has nothing more. */
8929     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8930         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8931     {
8932         count++;
8933     }
8934
8935     /* The final length is what we've output so far plus what else is in the
8936      * intersection.  At most one of the subexpressions below will be non-zero
8937      * */
8938     len_r = i_r;
8939     if (count >= 2) {
8940         len_r += (len_a - i_a) + (len_b - i_b);
8941     }
8942
8943     /* Set result to final length, which can change the pointer to array_r, so
8944      * re-find it */
8945     if (len_r != _invlist_len(r)) {
8946         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8947         invlist_trim(r);
8948         array_r = invlist_array(r);
8949     }
8950
8951     /* Finish outputting any remaining */
8952     if (count >= 2) { /* At most one will have a non-zero copy count */
8953         IV copy_count;
8954         if ((copy_count = len_a - i_a) > 0) {
8955             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8956         }
8957         else if ((copy_count = len_b - i_b) > 0) {
8958             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8959         }
8960     }
8961
8962     /*  We may be removing a reference to one of the inputs.  If so, the output
8963      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8964      *  count decremented) */
8965     if (a == *i || b == *i) {
8966         assert(! invlist_is_iterating(*i));
8967         if (SvTEMP(*i)) {
8968             sv_2mortal(r);
8969         }
8970         else {
8971             SvREFCNT_dec_NN(*i);
8972         }
8973     }
8974
8975     *i = r;
8976
8977     return;
8978 }
8979
8980 SV*
8981 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8982 {
8983     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8984      * set.  A pointer to the inversion list is returned.  This may actually be
8985      * a new list, in which case the passed in one has been destroyed.  The
8986      * passed-in inversion list can be NULL, in which case a new one is created
8987      * with just the one range in it */
8988
8989     SV* range_invlist;
8990     UV len;
8991
8992     if (invlist == NULL) {
8993         invlist = _new_invlist(2);
8994         len = 0;
8995     }
8996     else {
8997         len = _invlist_len(invlist);
8998     }
8999
9000     /* If comes after the final entry actually in the list, can just append it
9001      * to the end, */
9002     if (len == 0
9003         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9004             && start >= invlist_array(invlist)[len - 1]))
9005     {
9006         _append_range_to_invlist(invlist, start, end);
9007         return invlist;
9008     }
9009
9010     /* Here, can't just append things, create and return a new inversion list
9011      * which is the union of this range and the existing inversion list */
9012     range_invlist = _new_invlist(2);
9013     _append_range_to_invlist(range_invlist, start, end);
9014
9015     _invlist_union(invlist, range_invlist, &invlist);
9016
9017     /* The temporary can be freed */
9018     SvREFCNT_dec_NN(range_invlist);
9019
9020     return invlist;
9021 }
9022
9023 SV*
9024 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9025                                  UV** other_elements_ptr)
9026 {
9027     /* Create and return an inversion list whose contents are to be populated
9028      * by the caller.  The caller gives the number of elements (in 'size') and
9029      * the very first element ('element0').  This function will set
9030      * '*other_elements_ptr' to an array of UVs, where the remaining elements
9031      * are to be placed.
9032      *
9033      * Obviously there is some trust involved that the caller will properly
9034      * fill in the other elements of the array.
9035      *
9036      * (The first element needs to be passed in, as the underlying code does
9037      * things differently depending on whether it is zero or non-zero) */
9038
9039     SV* invlist = _new_invlist(size);
9040     bool offset;
9041
9042     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9043
9044     _append_range_to_invlist(invlist, element0, element0);
9045     offset = *get_invlist_offset_addr(invlist);
9046
9047     invlist_set_len(invlist, size, offset);
9048     *other_elements_ptr = invlist_array(invlist) + 1;
9049     return invlist;
9050 }
9051
9052 #endif
9053
9054 PERL_STATIC_INLINE SV*
9055 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9056     return _add_range_to_invlist(invlist, cp, cp);
9057 }
9058
9059 #ifndef PERL_IN_XSUB_RE
9060 void
9061 Perl__invlist_invert(pTHX_ SV* const invlist)
9062 {
9063     /* Complement the input inversion list.  This adds a 0 if the list didn't
9064      * have a zero; removes it otherwise.  As described above, the data
9065      * structure is set up so that this is very efficient */
9066
9067     PERL_ARGS_ASSERT__INVLIST_INVERT;
9068
9069     assert(! invlist_is_iterating(invlist));
9070
9071     /* The inverse of matching nothing is matching everything */
9072     if (_invlist_len(invlist) == 0) {
9073         _append_range_to_invlist(invlist, 0, UV_MAX);
9074         return;
9075     }
9076
9077     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9078 }
9079
9080 #endif
9081
9082 PERL_STATIC_INLINE SV*
9083 S_invlist_clone(pTHX_ SV* const invlist)
9084 {
9085
9086     /* Return a new inversion list that is a copy of the input one, which is
9087      * unchanged.  The new list will not be mortal even if the old one was. */
9088
9089     /* Need to allocate extra space to accommodate Perl's addition of a
9090      * trailing NUL to SvPV's, since it thinks they are always strings */
9091     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9092     STRLEN physical_length = SvCUR(invlist);
9093     bool offset = *(get_invlist_offset_addr(invlist));
9094
9095     PERL_ARGS_ASSERT_INVLIST_CLONE;
9096
9097     *(get_invlist_offset_addr(new_invlist)) = offset;
9098     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9099     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9100
9101     return new_invlist;
9102 }
9103
9104 PERL_STATIC_INLINE STRLEN*
9105 S_get_invlist_iter_addr(SV* invlist)
9106 {
9107     /* Return the address of the UV that contains the current iteration
9108      * position */
9109
9110     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9111
9112     assert(SvTYPE(invlist) == SVt_INVLIST);
9113
9114     return &(((XINVLIST*) SvANY(invlist))->iterator);
9115 }
9116
9117 PERL_STATIC_INLINE void
9118 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9119 {
9120     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9121
9122     *get_invlist_iter_addr(invlist) = 0;
9123 }
9124
9125 PERL_STATIC_INLINE void
9126 S_invlist_iterfinish(SV* invlist)
9127 {
9128     /* Terminate iterator for invlist.  This is to catch development errors.
9129      * Any iteration that is interrupted before completed should call this
9130      * function.  Functions that add code points anywhere else but to the end
9131      * of an inversion list assert that they are not in the middle of an
9132      * iteration.  If they were, the addition would make the iteration
9133      * problematical: if the iteration hadn't reached the place where things
9134      * were being added, it would be ok */
9135
9136     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9137
9138     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9139 }
9140
9141 STATIC bool
9142 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9143 {
9144     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9145      * This call sets in <*start> and <*end>, the next range in <invlist>.
9146      * Returns <TRUE> if successful and the next call will return the next
9147      * range; <FALSE> if was already at the end of the list.  If the latter,
9148      * <*start> and <*end> are unchanged, and the next call to this function
9149      * will start over at the beginning of the list */
9150
9151     STRLEN* pos = get_invlist_iter_addr(invlist);
9152     UV len = _invlist_len(invlist);
9153     UV *array;
9154
9155     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9156
9157     if (*pos >= len) {
9158         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9159         return FALSE;
9160     }
9161
9162     array = invlist_array(invlist);
9163
9164     *start = array[(*pos)++];
9165
9166     if (*pos >= len) {
9167         *end = UV_MAX;
9168     }
9169     else {
9170         *end = array[(*pos)++] - 1;
9171     }
9172
9173     return TRUE;
9174 }
9175
9176 PERL_STATIC_INLINE UV
9177 S_invlist_highest(SV* const invlist)
9178 {
9179     /* Returns the highest code point that matches an inversion list.  This API
9180      * has an ambiguity, as it returns 0 under either the highest is actually
9181      * 0, or if the list is empty.  If this distinction matters to you, check
9182      * for emptiness before calling this function */
9183
9184     UV len = _invlist_len(invlist);
9185     UV *array;
9186
9187     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9188
9189     if (len == 0) {
9190         return 0;
9191     }
9192
9193     array = invlist_array(invlist);
9194
9195     /* The last element in the array in the inversion list always starts a
9196      * range that goes to infinity.  That range may be for code points that are
9197      * matched in the inversion list, or it may be for ones that aren't
9198      * matched.  In the latter case, the highest code point in the set is one
9199      * less than the beginning of this range; otherwise it is the final element
9200      * of this range: infinity */
9201     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9202            ? UV_MAX
9203            : array[len - 1] - 1;
9204 }
9205
9206 #ifndef PERL_IN_XSUB_RE
9207 SV *
9208 Perl__invlist_contents(pTHX_ SV* const invlist)
9209 {
9210     /* Get the contents of an inversion list into a string SV so that they can
9211      * be printed out.  It uses the format traditionally done for debug tracing
9212      */
9213
9214     UV start, end;
9215     SV* output = newSVpvs("\n");
9216
9217     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9218
9219     assert(! invlist_is_iterating(invlist));
9220
9221     invlist_iterinit(invlist);
9222     while (invlist_iternext(invlist, &start, &end)) {
9223         if (end == UV_MAX) {
9224             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9225         }
9226         else if (end != start) {
9227             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9228                     start,       end);
9229         }
9230         else {
9231             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9232         }
9233     }
9234
9235     return output;
9236 }
9237 #endif
9238
9239 #ifndef PERL_IN_XSUB_RE
9240 void
9241 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9242                          const char * const indent, SV* const invlist)
9243 {
9244     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9245      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9246      * the string 'indent'.  The output looks like this:
9247          [0] 0x000A .. 0x000D
9248          [2] 0x0085
9249          [4] 0x2028 .. 0x2029
9250          [6] 0x3104 .. INFINITY
9251      * This means that the first range of code points matched by the list are
9252      * 0xA through 0xD; the second range contains only the single code point
9253      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9254      * are used to define each range (except if the final range extends to
9255      * infinity, only a single element is needed).  The array index of the
9256      * first element for the corresponding range is given in brackets. */
9257
9258     UV start, end;
9259     STRLEN count = 0;
9260
9261     PERL_ARGS_ASSERT__INVLIST_DUMP;
9262
9263     if (invlist_is_iterating(invlist)) {
9264         Perl_dump_indent(aTHX_ level, file,
9265              "%sCan't dump inversion list because is in middle of iterating\n",
9266              indent);
9267         return;
9268     }
9269
9270     invlist_iterinit(invlist);
9271     while (invlist_iternext(invlist, &start, &end)) {
9272         if (end == UV_MAX) {
9273             Perl_dump_indent(aTHX_ level, file,
9274                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9275                                    indent, (UV)count, start);
9276         }
9277         else if (end != start) {
9278             Perl_dump_indent(aTHX_ level, file,
9279                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9280                                 indent, (UV)count, start,         end);
9281         }
9282         else {
9283             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9284                                             indent, (UV)count, start);
9285         }
9286         count += 2;
9287     }
9288 }
9289
9290 void
9291 Perl__load_PL_utf8_foldclosures (pTHX)
9292 {
9293     assert(! PL_utf8_foldclosures);
9294
9295     /* If the folds haven't been read in, call a fold function
9296      * to force that */
9297     if (! PL_utf8_tofold) {
9298         U8 dummy[UTF8_MAXBYTES_CASE+1];
9299
9300         /* This string is just a short named one above \xff */
9301         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9302         assert(PL_utf8_tofold); /* Verify that worked */
9303     }
9304     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9305 }
9306 #endif
9307
9308 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9309 bool
9310 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9311 {
9312     /* Return a boolean as to if the two passed in inversion lists are
9313      * identical.  The final argument, if TRUE, says to take the complement of
9314      * the second inversion list before doing the comparison */
9315
9316     const UV* array_a = invlist_array(a);
9317     const UV* array_b = invlist_array(b);
9318     UV len_a = _invlist_len(a);
9319     UV len_b = _invlist_len(b);
9320
9321     UV i = 0;               /* current index into the arrays */
9322     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9323
9324     PERL_ARGS_ASSERT__INVLISTEQ;
9325
9326     /* If are to compare 'a' with the complement of b, set it
9327      * up so are looking at b's complement. */
9328     if (complement_b) {
9329
9330         /* The complement of nothing is everything, so <a> would have to have
9331          * just one element, starting at zero (ending at infinity) */
9332         if (len_b == 0) {
9333             return (len_a == 1 && array_a[0] == 0);
9334         }
9335         else if (array_b[0] == 0) {
9336
9337             /* Otherwise, to complement, we invert.  Here, the first element is
9338              * 0, just remove it.  To do this, we just pretend the array starts
9339              * one later */
9340
9341             array_b++;
9342             len_b--;
9343         }
9344         else {
9345
9346             /* But if the first element is not zero, we pretend the list starts
9347              * at the 0 that is always stored immediately before the array. */
9348             array_b--;
9349             len_b++;
9350         }
9351     }
9352
9353     /* Make sure that the lengths are the same, as well as the final element
9354      * before looping through the remainder.  (Thus we test the length, final,
9355      * and first elements right off the bat) */
9356     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9357         retval = FALSE;
9358     }
9359     else for (i = 0; i < len_a - 1; i++) {
9360         if (array_a[i] != array_b[i]) {
9361             retval = FALSE;
9362             break;
9363         }
9364     }
9365
9366     return retval;
9367 }
9368 #endif
9369
9370 /*
9371  * As best we can, determine the characters that can match the start of
9372  * the given EXACTF-ish node.
9373  *
9374  * Returns the invlist as a new SV*; it is the caller's responsibility to
9375  * call SvREFCNT_dec() when done with it.
9376  */
9377 STATIC SV*
9378 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9379 {
9380     const U8 * s = (U8*)STRING(node);
9381     SSize_t bytelen = STR_LEN(node);
9382     UV uc;
9383     /* Start out big enough for 2 separate code points */
9384     SV* invlist = _new_invlist(4);
9385
9386     PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9387
9388     if (! UTF) {
9389         uc = *s;
9390
9391         /* We punt and assume can match anything if the node begins
9392          * with a multi-character fold.  Things are complicated.  For
9393          * example, /ffi/i could match any of:
9394          *  "\N{LATIN SMALL LIGATURE FFI}"
9395          *  "\N{LATIN SMALL LIGATURE FF}I"
9396          *  "F\N{LATIN SMALL LIGATURE FI}"
9397          *  plus several other things; and making sure we have all the
9398          *  possibilities is hard. */
9399         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9400             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9401         }
9402         else {
9403             /* Any Latin1 range character can potentially match any
9404              * other depending on the locale */
9405             if (OP(node) == EXACTFL) {
9406                 _invlist_union(invlist, PL_Latin1, &invlist);
9407             }
9408             else {
9409                 /* But otherwise, it matches at least itself.  We can
9410                  * quickly tell if it has a distinct fold, and if so,
9411                  * it matches that as well */
9412                 invlist = add_cp_to_invlist(invlist, uc);
9413                 if (IS_IN_SOME_FOLD_L1(uc))
9414                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9415             }
9416
9417             /* Some characters match above-Latin1 ones under /i.  This
9418              * is true of EXACTFL ones when the locale is UTF-8 */
9419             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9420                 && (! isASCII(uc) || (OP(node) != EXACTFA
9421                                     && OP(node) != EXACTFA_NO_TRIE)))
9422             {
9423                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9424             }
9425         }
9426     }
9427     else {  /* Pattern is UTF-8 */
9428         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9429         STRLEN foldlen = UTF8SKIP(s);
9430         const U8* e = s + bytelen;
9431         SV** listp;
9432
9433         uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9434
9435         /* The only code points that aren't folded in a UTF EXACTFish
9436          * node are are the problematic ones in EXACTFL nodes */
9437         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9438             /* We need to check for the possibility that this EXACTFL
9439              * node begins with a multi-char fold.  Therefore we fold
9440              * the first few characters of it so that we can make that
9441              * check */
9442             U8 *d = folded;
9443             int i;
9444
9445             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9446                 if (isASCII(*s)) {
9447                     *(d++) = (U8) toFOLD(*s);
9448                     s++;
9449                 }
9450                 else {
9451                     STRLEN len;
9452                     to_utf8_fold(s, d, &len);
9453                     d += len;
9454                     s += UTF8SKIP(s);
9455                 }
9456             }
9457
9458             /* And set up so the code below that looks in this folded
9459              * buffer instead of the node's string */
9460             e = d;
9461             foldlen = UTF8SKIP(folded);
9462             s = folded;
9463         }
9464
9465         /* When we reach here 's' points to the fold of the first
9466          * character(s) of the node; and 'e' points to far enough along
9467          * the folded string to be just past any possible multi-char
9468          * fold. 'foldlen' is the length in bytes of the first
9469          * character in 's'
9470          *
9471          * Unlike the non-UTF-8 case, the macro for determining if a
9472          * string is a multi-char fold requires all the characters to
9473          * already be folded.  This is because of all the complications
9474          * if not.  Note that they are folded anyway, except in EXACTFL
9475          * nodes.  Like the non-UTF case above, we punt if the node
9476          * begins with a multi-char fold  */
9477
9478         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9479             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9480         }
9481         else {  /* Single char fold */
9482
9483             /* It matches all the things that fold to it, which are
9484              * found in PL_utf8_foldclosures (including itself) */
9485             invlist = add_cp_to_invlist(invlist, uc);
9486             if (! PL_utf8_foldclosures)
9487                 _load_PL_utf8_foldclosures();
9488             if ((listp = hv_fetch(PL_utf8_foldclosures,
9489                                 (char *) s, foldlen, FALSE)))
9490             {
9491                 AV* list = (AV*) *listp;
9492                 IV k;
9493                 for (k = 0; k <= av_tindex(list); k++) {
9494                     SV** c_p = av_fetch(list, k, FALSE);
9495                     UV c;
9496                     assert(c_p);
9497
9498                     c = SvUV(*c_p);
9499
9500                     /* /aa doesn't allow folds between ASCII and non- */
9501                     if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9502                         && isASCII(c) != isASCII(uc))
9503                     {
9504                         continue;
9505                     }
9506
9507                     invlist = add_cp_to_invlist(invlist, c);
9508                 }
9509             }
9510         }
9511     }
9512
9513     return invlist;
9514 }
9515
9516 #undef HEADER_LENGTH
9517 #undef TO_INTERNAL_SIZE
9518 #undef FROM_INTERNAL_SIZE
9519 #undef INVLIST_VERSION_ID
9520
9521 /* End of inversion list object */
9522
9523 STATIC void
9524 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9525 {
9526     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9527      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9528      * should point to the first flag; it is updated on output to point to the
9529      * final ')' or ':'.  There needs to be at least one flag, or this will
9530      * abort */
9531
9532     /* for (?g), (?gc), and (?o) warnings; warning
9533        about (?c) will warn about (?g) -- japhy    */
9534
9535 #define WASTED_O  0x01
9536 #define WASTED_G  0x02
9537 #define WASTED_C  0x04
9538 #define WASTED_GC (WASTED_G|WASTED_C)
9539     I32 wastedflags = 0x00;
9540     U32 posflags = 0, negflags = 0;
9541     U32 *flagsp = &posflags;
9542     char has_charset_modifier = '\0';
9543     regex_charset cs;
9544     bool has_use_defaults = FALSE;
9545     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9546     int x_mod_count = 0;
9547
9548     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9549
9550     /* '^' as an initial flag sets certain defaults */
9551     if (UCHARAT(RExC_parse) == '^') {
9552         RExC_parse++;
9553         has_use_defaults = TRUE;
9554         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9555         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9556                                         ? REGEX_UNICODE_CHARSET
9557                                         : REGEX_DEPENDS_CHARSET);
9558     }
9559
9560     cs = get_regex_charset(RExC_flags);
9561     if (cs == REGEX_DEPENDS_CHARSET
9562         && (RExC_utf8 || RExC_uni_semantics))
9563     {
9564         cs = REGEX_UNICODE_CHARSET;
9565     }
9566
9567     while (*RExC_parse) {
9568         /* && strchr("iogcmsx", *RExC_parse) */
9569         /* (?g), (?gc) and (?o) are useless here
9570            and must be globally applied -- japhy */
9571         switch (*RExC_parse) {
9572
9573             /* Code for the imsxn flags */
9574             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9575
9576             case LOCALE_PAT_MOD:
9577                 if (has_charset_modifier) {
9578                     goto excess_modifier;
9579                 }
9580                 else if (flagsp == &negflags) {
9581                     goto neg_modifier;
9582                 }
9583                 cs = REGEX_LOCALE_CHARSET;
9584                 has_charset_modifier = LOCALE_PAT_MOD;
9585                 break;
9586             case UNICODE_PAT_MOD:
9587                 if (has_charset_modifier) {
9588                     goto excess_modifier;
9589                 }
9590                 else if (flagsp == &negflags) {
9591                     goto neg_modifier;
9592                 }
9593                 cs = REGEX_UNICODE_CHARSET;
9594                 has_charset_modifier = UNICODE_PAT_MOD;
9595                 break;
9596             case ASCII_RESTRICT_PAT_MOD:
9597                 if (flagsp == &negflags) {
9598                     goto neg_modifier;
9599                 }
9600                 if (has_charset_modifier) {
9601                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9602                         goto excess_modifier;
9603                     }
9604                     /* Doubled modifier implies more restricted */
9605                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9606                 }
9607                 else {
9608                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9609                 }
9610                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9611                 break;
9612             case DEPENDS_PAT_MOD:
9613                 if (has_use_defaults) {
9614                     goto fail_modifiers;
9615                 }
9616                 else if (flagsp == &negflags) {
9617                     goto neg_modifier;
9618                 }
9619                 else if (has_charset_modifier) {
9620                     goto excess_modifier;
9621                 }
9622
9623                 /* The dual charset means unicode semantics if the
9624                  * pattern (or target, not known until runtime) are
9625                  * utf8, or something in the pattern indicates unicode
9626                  * semantics */
9627                 cs = (RExC_utf8 || RExC_uni_semantics)
9628                      ? REGEX_UNICODE_CHARSET
9629                      : REGEX_DEPENDS_CHARSET;
9630                 has_charset_modifier = DEPENDS_PAT_MOD;
9631                 break;
9632             excess_modifier:
9633                 RExC_parse++;
9634                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9635                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9636                 }
9637                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9638                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9639                                         *(RExC_parse - 1));
9640                 }
9641                 else {
9642                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9643                 }
9644                 NOT_REACHED; /*NOTREACHED*/
9645             neg_modifier:
9646                 RExC_parse++;
9647                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9648                                     *(RExC_parse - 1));
9649                 NOT_REACHED; /*NOTREACHED*/
9650             case ONCE_PAT_MOD: /* 'o' */
9651             case GLOBAL_PAT_MOD: /* 'g' */
9652                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9653                     const I32 wflagbit = *RExC_parse == 'o'
9654                                          ? WASTED_O
9655                                          : WASTED_G;
9656                     if (! (wastedflags & wflagbit) ) {
9657                         wastedflags |= wflagbit;
9658                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9659                         vWARN5(
9660                             RExC_parse + 1,
9661                             "Useless (%s%c) - %suse /%c modifier",
9662                             flagsp == &negflags ? "?-" : "?",
9663                             *RExC_parse,
9664                             flagsp == &negflags ? "don't " : "",
9665                             *RExC_parse
9666                         );
9667                     }
9668                 }
9669                 break;
9670
9671             case CONTINUE_PAT_MOD: /* 'c' */
9672                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9673                     if (! (wastedflags & WASTED_C) ) {
9674                         wastedflags |= WASTED_GC;
9675                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9676                         vWARN3(
9677                             RExC_parse + 1,
9678                             "Useless (%sc) - %suse /gc modifier",
9679                             flagsp == &negflags ? "?-" : "?",
9680                             flagsp == &negflags ? "don't " : ""
9681                         );
9682                     }
9683                 }
9684                 break;
9685             case KEEPCOPY_PAT_MOD: /* 'p' */
9686                 if (flagsp == &negflags) {
9687                     if (PASS2)
9688                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9689                 } else {
9690                     *flagsp |= RXf_PMf_KEEPCOPY;
9691                 }
9692                 break;
9693             case '-':
9694                 /* A flag is a default iff it is following a minus, so
9695                  * if there is a minus, it means will be trying to
9696                  * re-specify a default which is an error */
9697                 if (has_use_defaults || flagsp == &negflags) {
9698                     goto fail_modifiers;
9699                 }
9700                 flagsp = &negflags;
9701                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9702                 break;
9703             case ':':
9704             case ')':
9705                 RExC_flags |= posflags;
9706                 RExC_flags &= ~negflags;
9707                 set_regex_charset(&RExC_flags, cs);
9708                 if (RExC_flags & RXf_PMf_FOLD) {
9709                     RExC_contains_i = 1;
9710                 }
9711                 if (PASS2) {
9712                     STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9713                 }
9714                 return;
9715                 /*NOTREACHED*/
9716             default:
9717             fail_modifiers:
9718                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9719                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9720                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9721                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9722                 NOT_REACHED; /*NOTREACHED*/
9723         }
9724
9725         ++RExC_parse;
9726     }
9727
9728     if (PASS2) {
9729         STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9730     }
9731 }
9732
9733 /*
9734  - reg - regular expression, i.e. main body or parenthesized thing
9735  *
9736  * Caller must absorb opening parenthesis.
9737  *
9738  * Combining parenthesis handling with the base level of regular expression
9739  * is a trifle forced, but the need to tie the tails of the branches to what
9740  * follows makes it hard to avoid.
9741  */
9742 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9743 #ifdef DEBUGGING
9744 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9745 #else
9746 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9747 #endif
9748
9749 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9750    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9751    needs to be restarted.
9752    Otherwise would only return NULL if regbranch() returns NULL, which
9753    cannot happen.  */
9754 STATIC regnode *
9755 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9756     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9757      * 2 is like 1, but indicates that nextchar() has been called to advance
9758      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9759      * this flag alerts us to the need to check for that */
9760 {
9761     regnode *ret;               /* Will be the head of the group. */
9762     regnode *br;
9763     regnode *lastbr;
9764     regnode *ender = NULL;
9765     I32 parno = 0;
9766     I32 flags;
9767     U32 oregflags = RExC_flags;
9768     bool have_branch = 0;
9769     bool is_open = 0;
9770     I32 freeze_paren = 0;
9771     I32 after_freeze = 0;
9772     I32 num; /* numeric backreferences */
9773
9774     char * parse_start = RExC_parse; /* MJD */
9775     char * const oregcomp_parse = RExC_parse;
9776
9777     GET_RE_DEBUG_FLAGS_DECL;
9778
9779     PERL_ARGS_ASSERT_REG;
9780     DEBUG_PARSE("reg ");
9781
9782     *flagp = 0;                         /* Tentatively. */
9783
9784
9785     /* Make an OPEN node, if parenthesized. */
9786     if (paren) {
9787
9788         /* Under /x, space and comments can be gobbled up between the '(' and
9789          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9790          * intervening space, as the sequence is a token, and a token should be
9791          * indivisible */
9792         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9793
9794         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9795             char *start_verb = RExC_parse;
9796             STRLEN verb_len = 0;
9797             char *start_arg = NULL;
9798             unsigned char op = 0;
9799             int argok = 1;
9800             int internal_argval = 0; /* internal_argval is only useful if
9801                                         !argok */
9802
9803             if (has_intervening_patws) {
9804                 RExC_parse++;
9805                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9806             }
9807             while ( *RExC_parse && *RExC_parse != ')' ) {
9808                 if ( *RExC_parse == ':' ) {
9809                     start_arg = RExC_parse + 1;
9810                     break;
9811                 }
9812                 RExC_parse++;
9813             }
9814             ++start_verb;
9815             verb_len = RExC_parse - start_verb;
9816             if ( start_arg ) {
9817                 RExC_parse++;
9818                 while ( *RExC_parse && *RExC_parse != ')' )
9819                     RExC_parse++;
9820                 if ( *RExC_parse != ')' )
9821                     vFAIL("Unterminated verb pattern argument");
9822                 if ( RExC_parse == start_arg )
9823                     start_arg = NULL;
9824             } else {
9825                 if ( *RExC_parse != ')' )
9826                     vFAIL("Unterminated verb pattern");
9827             }
9828
9829             switch ( *start_verb ) {
9830             case 'A':  /* (*ACCEPT) */
9831                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9832                     op = ACCEPT;
9833                     internal_argval = RExC_nestroot;
9834                 }
9835                 break;
9836             case 'C':  /* (*COMMIT) */
9837                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9838                     op = COMMIT;
9839                 break;
9840             case 'F':  /* (*FAIL) */
9841                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9842                     op = OPFAIL;
9843                     argok = 0;
9844                 }
9845                 break;
9846             case ':':  /* (*:NAME) */
9847             case 'M':  /* (*MARK:NAME) */
9848                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9849                     op = MARKPOINT;
9850                     argok = -1;
9851                 }
9852                 break;
9853             case 'P':  /* (*PRUNE) */
9854                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9855                     op = PRUNE;
9856                 break;
9857             case 'S':   /* (*SKIP) */
9858                 if ( memEQs(start_verb,verb_len,"SKIP") )
9859                     op = SKIP;
9860                 break;
9861             case 'T':  /* (*THEN) */
9862                 /* [19:06] <TimToady> :: is then */
9863                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9864                     op = CUTGROUP;
9865                     RExC_seen |= REG_CUTGROUP_SEEN;
9866                 }
9867                 break;
9868             }
9869             if ( ! op ) {
9870                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9871                 vFAIL2utf8f(
9872                     "Unknown verb pattern '%"UTF8f"'",
9873                     UTF8fARG(UTF, verb_len, start_verb));
9874             }
9875             if ( argok ) {
9876                 if ( start_arg && internal_argval ) {
9877                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9878                         verb_len, start_verb);
9879                 } else if ( argok < 0 && !start_arg ) {
9880                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9881                         verb_len, start_verb);
9882                 } else {
9883                     ret = reganode(pRExC_state, op, internal_argval);
9884                     if ( ! internal_argval && ! SIZE_ONLY ) {
9885                         if (start_arg) {
9886                             SV *sv = newSVpvn( start_arg,
9887                                                RExC_parse - start_arg);
9888                             ARG(ret) = add_data( pRExC_state,
9889                                                  STR_WITH_LEN("S"));
9890                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9891                             ret->flags = 0;
9892                         } else {
9893                             ret->flags = 1;
9894                         }
9895                     }
9896                 }
9897                 if (!internal_argval)
9898                     RExC_seen |= REG_VERBARG_SEEN;
9899             } else if ( start_arg ) {
9900                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9901                         verb_len, start_verb);
9902             } else {
9903                 ret = reg_node(pRExC_state, op);
9904             }
9905             nextchar(pRExC_state);
9906             return ret;
9907         }
9908         else if (*RExC_parse == '?') { /* (?...) */
9909             bool is_logical = 0;
9910             const char * const seqstart = RExC_parse;
9911             const char * endptr;
9912             if (has_intervening_patws) {
9913                 RExC_parse++;
9914                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9915             }
9916
9917             RExC_parse++;
9918             paren = *RExC_parse++;
9919             ret = NULL;                 /* For look-ahead/behind. */
9920             switch (paren) {
9921
9922             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9923                 paren = *RExC_parse++;
9924                 if ( paren == '<')         /* (?P<...>) named capture */
9925                     goto named_capture;
9926                 else if (paren == '>') {   /* (?P>name) named recursion */
9927                     goto named_recursion;
9928                 }
9929                 else if (paren == '=') {   /* (?P=...)  named backref */
9930                     /* this pretty much dupes the code for \k<NAME> in
9931                      * regatom(), if you change this make sure you change that
9932                      * */
9933                     char* name_start = RExC_parse;
9934                     U32 num = 0;
9935                     SV *sv_dat = reg_scan_name(pRExC_state,
9936                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9937                     if (RExC_parse == name_start || *RExC_parse != ')')
9938                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9939                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9940
9941                     if (!SIZE_ONLY) {
9942                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9943                         RExC_rxi->data->data[num]=(void*)sv_dat;
9944                         SvREFCNT_inc_simple_void(sv_dat);
9945                     }
9946                     RExC_sawback = 1;
9947                     ret = reganode(pRExC_state,
9948                                    ((! FOLD)
9949                                      ? NREF
9950                                      : (ASCII_FOLD_RESTRICTED)
9951                                        ? NREFFA
9952                                        : (AT_LEAST_UNI_SEMANTICS)
9953                                          ? NREFFU
9954                                          : (LOC)
9955                                            ? NREFFL
9956                                            : NREFF),
9957                                     num);
9958                     *flagp |= HASWIDTH;
9959
9960                     Set_Node_Offset(ret, parse_start+1);
9961                     Set_Node_Cur_Length(ret, parse_start);
9962
9963                     nextchar(pRExC_state);
9964                     return ret;
9965                 }
9966                 RExC_parse++;
9967                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9968                 vFAIL3("Sequence (%.*s...) not recognized",
9969                                 RExC_parse-seqstart, seqstart);
9970                 NOT_REACHED; /*NOTREACHED*/
9971             case '<':           /* (?<...) */
9972                 if (*RExC_parse == '!')
9973                     paren = ',';
9974                 else if (*RExC_parse != '=')
9975               named_capture:
9976                 {               /* (?<...>) */
9977                     char *name_start;
9978                     SV *svname;
9979                     paren= '>';
9980             case '\'':          /* (?'...') */
9981                     name_start= RExC_parse;
9982                     svname = reg_scan_name(pRExC_state,
9983                         SIZE_ONLY    /* reverse test from the others */
9984                         ? REG_RSN_RETURN_NAME
9985                         : REG_RSN_RETURN_NULL);
9986                     if (RExC_parse == name_start || *RExC_parse != paren)
9987                         vFAIL2("Sequence (?%c... not terminated",
9988                             paren=='>' ? '<' : paren);
9989                     if (SIZE_ONLY) {
9990                         HE *he_str;
9991                         SV *sv_dat = NULL;
9992                         if (!svname) /* shouldn't happen */
9993                             Perl_croak(aTHX_
9994                                 "panic: reg_scan_name returned NULL");
9995                         if (!RExC_paren_names) {
9996                             RExC_paren_names= newHV();
9997                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9998 #ifdef DEBUGGING
9999                             RExC_paren_name_list= newAV();
10000                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10001 #endif
10002                         }
10003                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10004                         if ( he_str )
10005                             sv_dat = HeVAL(he_str);
10006                         if ( ! sv_dat ) {
10007                             /* croak baby croak */
10008                             Perl_croak(aTHX_
10009                                 "panic: paren_name hash element allocation failed");
10010                         } else if ( SvPOK(sv_dat) ) {
10011                             /* (?|...) can mean we have dupes so scan to check
10012                                its already been stored. Maybe a flag indicating
10013                                we are inside such a construct would be useful,
10014                                but the arrays are likely to be quite small, so
10015                                for now we punt -- dmq */
10016                             IV count = SvIV(sv_dat);
10017                             I32 *pv = (I32*)SvPVX(sv_dat);
10018                             IV i;
10019                             for ( i = 0 ; i < count ; i++ ) {
10020                                 if ( pv[i] == RExC_npar ) {
10021                                     count = 0;
10022                                     break;
10023                                 }
10024                             }
10025                             if ( count ) {
10026                                 pv = (I32*)SvGROW(sv_dat,
10027                                                 SvCUR(sv_dat) + sizeof(I32)+1);
10028                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10029                                 pv[count] = RExC_npar;
10030                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10031                             }
10032                         } else {
10033                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
10034                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
10035                                                                 sizeof(I32));
10036                             SvIOK_on(sv_dat);
10037                             SvIV_set(sv_dat, 1);
10038                         }
10039 #ifdef DEBUGGING
10040                         /* Yes this does cause a memory leak in debugging Perls
10041                          * */
10042                         if (!av_store(RExC_paren_name_list,
10043                                       RExC_npar, SvREFCNT_inc(svname)))
10044                             SvREFCNT_dec_NN(svname);
10045 #endif
10046
10047                         /*sv_dump(sv_dat);*/
10048                     }
10049                     nextchar(pRExC_state);
10050                     paren = 1;
10051                     goto capturing_parens;
10052                 }
10053                 RExC_seen |= REG_LOOKBEHIND_SEEN;
10054                 RExC_in_lookbehind++;
10055                 RExC_parse++;
10056                 /* FALLTHROUGH */
10057             case '=':           /* (?=...) */
10058                 RExC_seen_zerolen++;
10059                 break;
10060             case '!':           /* (?!...) */
10061                 RExC_seen_zerolen++;
10062                 /* check if we're really just a "FAIL" assertion */
10063                 --RExC_parse;
10064                 nextchar(pRExC_state);
10065                 if (*RExC_parse == ')') {
10066                     ret=reg_node(pRExC_state, OPFAIL);
10067                     nextchar(pRExC_state);
10068                     return ret;
10069                 }
10070                 break;
10071             case '|':           /* (?|...) */
10072                 /* branch reset, behave like a (?:...) except that
10073                    buffers in alternations share the same numbers */
10074                 paren = ':';
10075                 after_freeze = freeze_paren = RExC_npar;
10076                 break;
10077             case ':':           /* (?:...) */
10078             case '>':           /* (?>...) */
10079                 break;
10080             case '$':           /* (?$...) */
10081             case '@':           /* (?@...) */
10082                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10083                 break;
10084             case '0' :           /* (?0) */
10085             case 'R' :           /* (?R) */
10086                 if (*RExC_parse != ')')
10087                     FAIL("Sequence (?R) not terminated");
10088                 ret = reg_node(pRExC_state, GOSTART);
10089                     RExC_seen |= REG_GOSTART_SEEN;
10090                 *flagp |= POSTPONED;
10091                 nextchar(pRExC_state);
10092                 return ret;
10093                 /*notreached*/
10094             /* named and numeric backreferences */
10095             case '&':            /* (?&NAME) */
10096                 parse_start = RExC_parse - 1;
10097               named_recursion:
10098                 {
10099                     SV *sv_dat = reg_scan_name(pRExC_state,
10100                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10101                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10102                 }
10103                 if (RExC_parse == RExC_end || *RExC_parse != ')')
10104                     vFAIL("Sequence (?&... not terminated");
10105                 goto gen_recurse_regop;
10106                 /* NOT REACHED */
10107             case '+':
10108                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10109                     RExC_parse++;
10110                     vFAIL("Illegal pattern");
10111                 }
10112                 goto parse_recursion;
10113                 /* NOT REACHED*/
10114             case '-': /* (?-1) */
10115                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10116                     RExC_parse--; /* rewind to let it be handled later */
10117                     goto parse_flags;
10118                 }
10119                 /* FALLTHROUGH */
10120             case '1': case '2': case '3': case '4': /* (?1) */
10121             case '5': case '6': case '7': case '8': case '9':
10122                 RExC_parse--;
10123               parse_recursion:
10124                 {
10125                     bool is_neg = FALSE;
10126                     parse_start = RExC_parse - 1; /* MJD */
10127                     if (*RExC_parse == '-') {
10128                         RExC_parse++;
10129                         is_neg = TRUE;
10130                     }
10131                     num = grok_atou(RExC_parse, &endptr);
10132                     if (endptr)
10133                         RExC_parse = (char*)endptr;
10134                     if (is_neg) {
10135                         /* Some limit for num? */
10136                         num = -num;
10137                     }
10138                 }
10139                 if (*RExC_parse!=')')
10140                     vFAIL("Expecting close bracket");
10141
10142               gen_recurse_regop:
10143                 if ( paren == '-' ) {
10144                     /*
10145                     Diagram of capture buffer numbering.
10146                     Top line is the normal capture buffer numbers
10147                     Bottom line is the negative indexing as from
10148                     the X (the (?-2))
10149
10150                     +   1 2    3 4 5 X          6 7
10151                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10152                     -   5 4    3 2 1 X          x x
10153
10154                     */
10155                     num = RExC_npar + num;
10156                     if (num < 1)  {
10157                         RExC_parse++;
10158                         vFAIL("Reference to nonexistent group");
10159                     }
10160                 } else if ( paren == '+' ) {
10161                     num = RExC_npar + num - 1;
10162                 }
10163
10164                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10165                 if (!SIZE_ONLY) {
10166                     if (num > (I32)RExC_rx->nparens) {
10167                         RExC_parse++;
10168                         vFAIL("Reference to nonexistent group");
10169                     }
10170                     RExC_recurse_count++;
10171                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10172                         "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10173                               22, "|    |", (int)(depth * 2 + 1), "",
10174                               (UV)ARG(ret), (IV)ARG2L(ret)));
10175                 }
10176                 RExC_seen |= REG_RECURSE_SEEN;
10177                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10178                 Set_Node_Offset(ret, parse_start); /* MJD */
10179
10180                 *flagp |= POSTPONED;
10181                 nextchar(pRExC_state);
10182                 return ret;
10183
10184             /* NOT REACHED */
10185
10186             case '?':           /* (??...) */
10187                 is_logical = 1;
10188                 if (*RExC_parse != '{') {
10189                     RExC_parse++;
10190                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10191                     vFAIL2utf8f(
10192                         "Sequence (%"UTF8f"...) not recognized",
10193                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10194                     NOT_REACHED; /*NOTREACHED*/
10195                 }
10196                 *flagp |= POSTPONED;
10197                 paren = *RExC_parse++;
10198                 /* FALLTHROUGH */
10199             case '{':           /* (?{...}) */
10200             {
10201                 U32 n = 0;
10202                 struct reg_code_block *cb;
10203
10204                 RExC_seen_zerolen++;
10205
10206                 if (   !pRExC_state->num_code_blocks
10207                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
10208                     || pRExC_state->code_blocks[pRExC_state->code_index].start
10209                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10210                             - RExC_start)
10211                 ) {
10212                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
10213                         FAIL("panic: Sequence (?{...}): no code block found\n");
10214                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
10215                 }
10216                 /* this is a pre-compiled code block (?{...}) */
10217                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10218                 RExC_parse = RExC_start + cb->end;
10219                 if (!SIZE_ONLY) {
10220                     OP *o = cb->block;
10221                     if (cb->src_regex) {
10222                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10223                         RExC_rxi->data->data[n] =
10224                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
10225                         RExC_rxi->data->data[n+1] = (void*)o;
10226                     }
10227                     else {
10228                         n = add_data(pRExC_state,
10229                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10230                         RExC_rxi->data->data[n] = (void*)o;
10231                     }
10232                 }
10233                 pRExC_state->code_index++;
10234                 nextchar(pRExC_state);
10235
10236                 if (is_logical) {
10237                     regnode *eval;
10238                     ret = reg_node(pRExC_state, LOGICAL);
10239
10240                     eval = reg2Lanode(pRExC_state, EVAL,
10241                                        n,
10242
10243                                        /* for later propagation into (??{})
10244                                         * return value */
10245                                        RExC_flags & RXf_PMf_COMPILETIME
10246                                       );
10247                     if (!SIZE_ONLY) {
10248                         ret->flags = 2;
10249                     }
10250                     REGTAIL(pRExC_state, ret, eval);
10251                     /* deal with the length of this later - MJD */
10252                     return ret;
10253                 }
10254                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10255                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10256                 Set_Node_Offset(ret, parse_start);
10257                 return ret;
10258             }
10259             case '(':           /* (?(?{...})...) and (?(?=...)...) */
10260             {
10261                 int is_define= 0;
10262                 const int DEFINE_len = sizeof("DEFINE") - 1;
10263                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
10264                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10265                         || RExC_parse[1] == '<'
10266                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
10267                         I32 flag;
10268                         regnode *tail;
10269
10270                         ret = reg_node(pRExC_state, LOGICAL);
10271                         if (!SIZE_ONLY)
10272                             ret->flags = 1;
10273
10274                         tail = reg(pRExC_state, 1, &flag, depth+1);
10275                         if (flag & RESTART_UTF8) {
10276                             *flagp = RESTART_UTF8;
10277                             return NULL;
10278                         }
10279                         REGTAIL(pRExC_state, ret, tail);
10280                         goto insert_if;
10281                     }
10282                     /* Fall through to ‘Unknown switch condition’ at the
10283                        end of the if/else chain. */
10284                 }
10285                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
10286                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10287                 {
10288                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
10289                     char *name_start= RExC_parse++;
10290                     U32 num = 0;
10291                     SV *sv_dat=reg_scan_name(pRExC_state,
10292                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10293                     if (RExC_parse == name_start || *RExC_parse != ch)
10294                         vFAIL2("Sequence (?(%c... not terminated",
10295                             (ch == '>' ? '<' : ch));
10296                     RExC_parse++;
10297                     if (!SIZE_ONLY) {
10298                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10299                         RExC_rxi->data->data[num]=(void*)sv_dat;
10300                         SvREFCNT_inc_simple_void(sv_dat);
10301                     }
10302                     ret = reganode(pRExC_state,NGROUPP,num);
10303                     goto insert_if_check_paren;
10304                 }
10305                 else if (strnEQ(RExC_parse, "DEFINE",
10306                                        MIN(DEFINE_len, RExC_end - RExC_parse)))
10307                 {
10308                     ret = reganode(pRExC_state,DEFINEP,0);
10309                     RExC_parse += DEFINE_len;
10310                     is_define = 1;
10311                     goto insert_if_check_paren;
10312                 }
10313                 else if (RExC_parse[0] == 'R') {
10314                     RExC_parse++;
10315                     parno = 0;
10316                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10317                         parno = grok_atou(RExC_parse, &endptr);
10318                         if (endptr)
10319                             RExC_parse = (char*)endptr;
10320                     } else if (RExC_parse[0] == '&') {
10321                         SV *sv_dat;
10322                         RExC_parse++;
10323                         sv_dat = reg_scan_name(pRExC_state,
10324                             SIZE_ONLY
10325                             ? REG_RSN_RETURN_NULL
10326                             : REG_RSN_RETURN_DATA);
10327                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10328                     }
10329                     ret = reganode(pRExC_state,INSUBP,parno);
10330                     goto insert_if_check_paren;
10331                 }
10332                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10333                     /* (?(1)...) */
10334                     char c;
10335                     char *tmp;
10336                     parno = grok_atou(RExC_parse, &endptr);
10337                     if (endptr)
10338                         RExC_parse = (char*)endptr;
10339                     ret = reganode(pRExC_state, GROUPP, parno);
10340
10341                  insert_if_check_paren:
10342                     if (*(tmp = nextchar(pRExC_state)) != ')') {
10343                         /* nextchar also skips comments, so undo its work
10344                          * and skip over the the next character.
10345                          */
10346                         RExC_parse = tmp;
10347                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10348                         vFAIL("Switch condition not recognized");
10349                     }
10350                   insert_if:
10351                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10352                     br = regbranch(pRExC_state, &flags, 1,depth+1);
10353                     if (br == NULL) {
10354                         if (flags & RESTART_UTF8) {
10355                             *flagp = RESTART_UTF8;
10356                             return NULL;
10357                         }
10358                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10359                               (UV) flags);
10360                     } else
10361                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
10362                                                           LONGJMP, 0));
10363                     c = *nextchar(pRExC_state);
10364                     if (flags&HASWIDTH)
10365                         *flagp |= HASWIDTH;
10366                     if (c == '|') {
10367                         if (is_define)
10368                             vFAIL("(?(DEFINE)....) does not allow branches");
10369
10370                         /* Fake one for optimizer.  */
10371                         lastbr = reganode(pRExC_state, IFTHEN, 0);
10372
10373                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10374                             if (flags & RESTART_UTF8) {
10375                                 *flagp = RESTART_UTF8;
10376                                 return NULL;
10377                             }
10378                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10379                                   (UV) flags);
10380                         }
10381                         REGTAIL(pRExC_state, ret, lastbr);
10382                         if (flags&HASWIDTH)
10383                             *flagp |= HASWIDTH;
10384                         c = *nextchar(pRExC_state);
10385                     }
10386                     else
10387                         lastbr = NULL;
10388                     if (c != ')') {
10389                         if (RExC_parse>RExC_end)
10390                             vFAIL("Switch (?(condition)... not terminated");
10391                         else
10392                             vFAIL("Switch (?(condition)... contains too many branches");
10393                     }
10394                     ender = reg_node(pRExC_state, TAIL);
10395                     REGTAIL(pRExC_state, br, ender);
10396                     if (lastbr) {
10397                         REGTAIL(pRExC_state, lastbr, ender);
10398                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10399                     }
10400                     else
10401                         REGTAIL(pRExC_state, ret, ender);
10402                     RExC_size++; /* XXX WHY do we need this?!!
10403                                     For large programs it seems to be required
10404                                     but I can't figure out why. -- dmq*/
10405                     return ret;
10406                 }
10407                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10408                 vFAIL("Unknown switch condition (?(...))");
10409             }
10410             case '[':           /* (?[ ... ]) */
10411                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10412                                          oregcomp_parse);
10413             case 0:
10414                 RExC_parse--; /* for vFAIL to print correctly */
10415                 vFAIL("Sequence (? incomplete");
10416                 break;
10417             default: /* e.g., (?i) */
10418                 --RExC_parse;
10419               parse_flags:
10420                 parse_lparen_question_flags(pRExC_state);
10421                 if (UCHARAT(RExC_parse) != ':') {
10422                     nextchar(pRExC_state);
10423                     *flagp = TRYAGAIN;
10424                     return NULL;
10425                 }
10426                 paren = ':';
10427                 nextchar(pRExC_state);
10428                 ret = NULL;
10429                 goto parse_rest;
10430             } /* end switch */
10431         }
10432         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
10433           capturing_parens:
10434             parno = RExC_npar;
10435             RExC_npar++;
10436
10437             ret = reganode(pRExC_state, OPEN, parno);
10438             if (!SIZE_ONLY ){
10439                 if (!RExC_nestroot)
10440                     RExC_nestroot = parno;
10441                 if (RExC_seen & REG_RECURSE_SEEN
10442                     && !RExC_open_parens[parno-1])
10443                 {
10444                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10445                         "%*s%*s Setting open paren #%"IVdf" to %d\n",
10446                         22, "|    |", (int)(depth * 2 + 1), "",
10447                         (IV)parno, REG_NODE_NUM(ret)));
10448                     RExC_open_parens[parno-1]= ret;
10449                 }
10450             }
10451             Set_Node_Length(ret, 1); /* MJD */
10452             Set_Node_Offset(ret, RExC_parse); /* MJD */
10453             is_open = 1;
10454         } else {
10455             ret = NULL;
10456         }
10457     }
10458     else                        /* ! paren */
10459         ret = NULL;
10460
10461    parse_rest:
10462     /* Pick up the branches, linking them together. */
10463     parse_start = RExC_parse;   /* MJD */
10464     br = regbranch(pRExC_state, &flags, 1,depth+1);
10465
10466     /*     branch_len = (paren != 0); */
10467
10468     if (br == NULL) {
10469         if (flags & RESTART_UTF8) {
10470             *flagp = RESTART_UTF8;
10471             return NULL;
10472         }
10473         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10474     }
10475     if (*RExC_parse == '|') {
10476         if (!SIZE_ONLY && RExC_extralen) {
10477             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10478         }
10479         else {                  /* MJD */
10480             reginsert(pRExC_state, BRANCH, br, depth+1);
10481             Set_Node_Length(br, paren != 0);
10482             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10483         }
10484         have_branch = 1;
10485         if (SIZE_ONLY)
10486             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10487     }
10488     else if (paren == ':') {
10489         *flagp |= flags&SIMPLE;
10490     }
10491     if (is_open) {                              /* Starts with OPEN. */
10492         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10493     }
10494     else if (paren != '?')              /* Not Conditional */
10495         ret = br;
10496     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10497     lastbr = br;
10498     while (*RExC_parse == '|') {
10499         if (!SIZE_ONLY && RExC_extralen) {
10500             ender = reganode(pRExC_state, LONGJMP,0);
10501
10502             /* Append to the previous. */
10503             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10504         }
10505         if (SIZE_ONLY)
10506             RExC_extralen += 2;         /* Account for LONGJMP. */
10507         nextchar(pRExC_state);
10508         if (freeze_paren) {
10509             if (RExC_npar > after_freeze)
10510                 after_freeze = RExC_npar;
10511             RExC_npar = freeze_paren;
10512         }
10513         br = regbranch(pRExC_state, &flags, 0, depth+1);
10514
10515         if (br == NULL) {
10516             if (flags & RESTART_UTF8) {
10517                 *flagp = RESTART_UTF8;
10518                 return NULL;
10519             }
10520             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10521         }
10522         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10523         lastbr = br;
10524         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10525     }
10526
10527     if (have_branch || paren != ':') {
10528         /* Make a closing node, and hook it on the end. */
10529         switch (paren) {
10530         case ':':
10531             ender = reg_node(pRExC_state, TAIL);
10532             break;
10533         case 1: case 2:
10534             ender = reganode(pRExC_state, CLOSE, parno);
10535             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10536                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10537                         "%*s%*s Setting close paren #%"IVdf" to %d\n",
10538                         22, "|    |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10539                 RExC_close_parens[parno-1]= ender;
10540                 if (RExC_nestroot == parno)
10541                     RExC_nestroot = 0;
10542             }
10543             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10544             Set_Node_Length(ender,1); /* MJD */
10545             break;
10546         case '<':
10547         case ',':
10548         case '=':
10549         case '!':
10550             *flagp &= ~HASWIDTH;
10551             /* FALLTHROUGH */
10552         case '>':
10553             ender = reg_node(pRExC_state, SUCCEED);
10554             break;
10555         case 0:
10556             ender = reg_node(pRExC_state, END);
10557             if (!SIZE_ONLY) {
10558                 assert(!RExC_opend); /* there can only be one! */
10559                 RExC_opend = ender;
10560             }
10561             break;
10562         }
10563         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10564             DEBUG_PARSE_MSG("lsbr");
10565             regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10566             regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10567             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10568                           SvPV_nolen_const(RExC_mysv1),
10569                           (IV)REG_NODE_NUM(lastbr),
10570                           SvPV_nolen_const(RExC_mysv2),
10571                           (IV)REG_NODE_NUM(ender),
10572                           (IV)(ender - lastbr)
10573             );
10574         });
10575         REGTAIL(pRExC_state, lastbr, ender);
10576
10577         if (have_branch && !SIZE_ONLY) {
10578             char is_nothing= 1;
10579             if (depth==1)
10580                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10581
10582             /* Hook the tails of the branches to the closing node. */
10583             for (br = ret; br; br = regnext(br)) {
10584                 const U8 op = PL_regkind[OP(br)];
10585                 if (op == BRANCH) {
10586                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10587                     if ( OP(NEXTOPER(br)) != NOTHING
10588                          || regnext(NEXTOPER(br)) != ender)
10589                         is_nothing= 0;
10590                 }
10591                 else if (op == BRANCHJ) {
10592                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10593                     /* for now we always disable this optimisation * /
10594                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10595                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10596                     */
10597                         is_nothing= 0;
10598                 }
10599             }
10600             if (is_nothing) {
10601                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10602                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10603                     DEBUG_PARSE_MSG("NADA");
10604                     regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10605                     regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10606                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10607                                   SvPV_nolen_const(RExC_mysv1),
10608                                   (IV)REG_NODE_NUM(ret),
10609                                   SvPV_nolen_const(RExC_mysv2),
10610                                   (IV)REG_NODE_NUM(ender),
10611                                   (IV)(ender - ret)
10612                     );
10613                 });
10614                 OP(br)= NOTHING;
10615                 if (OP(ender) == TAIL) {
10616                     NEXT_OFF(br)= 0;
10617                     RExC_emit= br + 1;
10618                 } else {
10619                     regnode *opt;
10620                     for ( opt= br + 1; opt < ender ; opt++ )
10621                         OP(opt)= OPTIMIZED;
10622                     NEXT_OFF(br)= ender - br;
10623                 }
10624             }
10625         }
10626     }
10627
10628     {
10629         const char *p;
10630         static const char parens[] = "=!<,>";
10631
10632         if (paren && (p = strchr(parens, paren))) {
10633             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10634             int flag = (p - parens) > 1;
10635
10636             if (paren == '>')
10637                 node = SUSPEND, flag = 0;
10638             reginsert(pRExC_state, node,ret, depth+1);
10639             Set_Node_Cur_Length(ret, parse_start);
10640             Set_Node_Offset(ret, parse_start + 1);
10641             ret->flags = flag;
10642             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10643         }
10644     }
10645
10646     /* Check for proper termination. */
10647     if (paren) {
10648         /* restore original flags, but keep (?p) */
10649         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10650         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10651             RExC_parse = oregcomp_parse;
10652             vFAIL("Unmatched (");
10653         }
10654     }
10655     else if (!paren && RExC_parse < RExC_end) {
10656         if (*RExC_parse == ')') {
10657             RExC_parse++;
10658             vFAIL("Unmatched )");
10659         }
10660         else
10661             FAIL("Junk on end of regexp");      /* "Can't happen". */
10662         NOT_REACHED; /* NOTREACHED */
10663     }
10664
10665     if (RExC_in_lookbehind) {
10666         RExC_in_lookbehind--;
10667     }
10668     if (after_freeze > RExC_npar)
10669         RExC_npar = after_freeze;
10670     return(ret);
10671 }
10672
10673 /*
10674  - regbranch - one alternative of an | operator
10675  *
10676  * Implements the concatenation operator.
10677  *
10678  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10679  * restarted.
10680  */
10681 STATIC regnode *
10682 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10683 {
10684     regnode *ret;
10685     regnode *chain = NULL;
10686     regnode *latest;
10687     I32 flags = 0, c = 0;
10688     GET_RE_DEBUG_FLAGS_DECL;
10689
10690     PERL_ARGS_ASSERT_REGBRANCH;
10691
10692     DEBUG_PARSE("brnc");
10693
10694     if (first)
10695         ret = NULL;
10696     else {
10697         if (!SIZE_ONLY && RExC_extralen)
10698             ret = reganode(pRExC_state, BRANCHJ,0);
10699         else {
10700             ret = reg_node(pRExC_state, BRANCH);
10701             Set_Node_Length(ret, 1);
10702         }
10703     }
10704
10705     if (!first && SIZE_ONLY)
10706         RExC_extralen += 1;                     /* BRANCHJ */
10707
10708     *flagp = WORST;                     /* Tentatively. */
10709
10710     RExC_parse--;
10711     nextchar(pRExC_state);
10712     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10713         flags &= ~TRYAGAIN;
10714         latest = regpiece(pRExC_state, &flags,depth+1);
10715         if (latest == NULL) {
10716             if (flags & TRYAGAIN)
10717                 continue;
10718             if (flags & RESTART_UTF8) {
10719                 *flagp = RESTART_UTF8;
10720                 return NULL;
10721             }
10722             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10723         }
10724         else if (ret == NULL)
10725             ret = latest;
10726         *flagp |= flags&(HASWIDTH|POSTPONED);
10727         if (chain == NULL)      /* First piece. */
10728             *flagp |= flags&SPSTART;
10729         else {
10730             /* FIXME adding one for every branch after the first is probably
10731              * excessive now we have TRIE support. (hv) */
10732             MARK_NAUGHTY(1);
10733             REGTAIL(pRExC_state, chain, latest);
10734         }
10735         chain = latest;
10736         c++;
10737     }
10738     if (chain == NULL) {        /* Loop ran zero times. */
10739         chain = reg_node(pRExC_state, NOTHING);
10740         if (ret == NULL)
10741             ret = chain;
10742     }
10743     if (c == 1) {
10744         *flagp |= flags&SIMPLE;
10745     }
10746
10747     return ret;
10748 }
10749
10750 /*
10751  - regpiece - something followed by possible [*+?]
10752  *
10753  * Note that the branching code sequences used for ? and the general cases
10754  * of * and + are somewhat optimized:  they use the same NOTHING node as
10755  * both the endmarker for their branch list and the body of the last branch.
10756  * It might seem that this node could be dispensed with entirely, but the
10757  * endmarker role is not redundant.
10758  *
10759  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10760  * TRYAGAIN.
10761  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10762  * restarted.
10763  */
10764 STATIC regnode *
10765 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10766 {
10767     regnode *ret;
10768     char op;
10769     char *next;
10770     I32 flags;
10771     const char * const origparse = RExC_parse;
10772     I32 min;
10773     I32 max = REG_INFTY;
10774 #ifdef RE_TRACK_PATTERN_OFFSETS
10775     char *parse_start;
10776 #endif
10777     const char *maxpos = NULL;
10778
10779     /* Save the original in case we change the emitted regop to a FAIL. */
10780     regnode * const orig_emit = RExC_emit;
10781
10782     GET_RE_DEBUG_FLAGS_DECL;
10783
10784     PERL_ARGS_ASSERT_REGPIECE;
10785
10786     DEBUG_PARSE("piec");
10787
10788     ret = regatom(pRExC_state, &flags,depth+1);
10789     if (ret == NULL) {
10790         if (flags & (TRYAGAIN|RESTART_UTF8))
10791             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10792         else
10793             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10794         return(NULL);
10795     }
10796
10797     op = *RExC_parse;
10798
10799     if (op == '{' && regcurly(RExC_parse)) {
10800         maxpos = NULL;
10801 #ifdef RE_TRACK_PATTERN_OFFSETS
10802         parse_start = RExC_parse; /* MJD */
10803 #endif
10804         next = RExC_parse + 1;
10805         while (isDIGIT(*next) || *next == ',') {
10806             if (*next == ',') {
10807                 if (maxpos)
10808                     break;
10809                 else
10810                     maxpos = next;
10811             }
10812             next++;
10813         }
10814         if (*next == '}') {             /* got one */
10815             const char* endptr;
10816             if (!maxpos)
10817                 maxpos = next;
10818             RExC_parse++;
10819             min = grok_atou(RExC_parse, &endptr);
10820             if (*maxpos == ',')
10821                 maxpos++;
10822             else
10823                 maxpos = RExC_parse;
10824             max = grok_atou(maxpos, &endptr);
10825             if (!max && *maxpos != '0')
10826                 max = REG_INFTY;                /* meaning "infinity" */
10827             else if (max >= REG_INFTY)
10828                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10829             RExC_parse = next;
10830             nextchar(pRExC_state);
10831             if (max < min) {    /* If can't match, warn and optimize to fail
10832                                    unconditionally */
10833                 if (SIZE_ONLY) {
10834
10835                     /* We can't back off the size because we have to reserve
10836                      * enough space for all the things we are about to throw
10837                      * away, but we can shrink it by the ammount we are about
10838                      * to re-use here */
10839                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10840                 }
10841                 else {
10842                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10843                     RExC_emit = orig_emit;
10844                 }
10845                 ret = reg_node(pRExC_state, OPFAIL);
10846                 return ret;
10847             }
10848             else if (min == max
10849                      && RExC_parse < RExC_end
10850                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10851             {
10852                 if (PASS2) {
10853                     ckWARN2reg(RExC_parse + 1,
10854                                "Useless use of greediness modifier '%c'",
10855                                *RExC_parse);
10856                 }
10857                 /* Absorb the modifier, so later code doesn't see nor use
10858                     * it */
10859                 nextchar(pRExC_state);
10860             }
10861
10862         do_curly:
10863             if ((flags&SIMPLE)) {
10864                 MARK_NAUGHTY_EXP(2, 2);
10865                 reginsert(pRExC_state, CURLY, ret, depth+1);
10866                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10867                 Set_Node_Cur_Length(ret, parse_start);
10868             }
10869             else {
10870                 regnode * const w = reg_node(pRExC_state, WHILEM);
10871
10872                 w->flags = 0;
10873                 REGTAIL(pRExC_state, ret, w);
10874                 if (!SIZE_ONLY && RExC_extralen) {
10875                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10876                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10877                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10878                 }
10879                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10880                                 /* MJD hk */
10881                 Set_Node_Offset(ret, parse_start+1);
10882                 Set_Node_Length(ret,
10883                                 op == '{' ? (RExC_parse - parse_start) : 1);
10884
10885                 if (!SIZE_ONLY && RExC_extralen)
10886                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10887                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10888                 if (SIZE_ONLY)
10889                     RExC_whilem_seen++, RExC_extralen += 3;
10890                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
10891             }
10892             ret->flags = 0;
10893
10894             if (min > 0)
10895                 *flagp = WORST;
10896             if (max > 0)
10897                 *flagp |= HASWIDTH;
10898             if (!SIZE_ONLY) {
10899                 ARG1_SET(ret, (U16)min);
10900                 ARG2_SET(ret, (U16)max);
10901             }
10902             if (max == REG_INFTY)
10903                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10904
10905             goto nest_check;
10906         }
10907     }
10908
10909     if (!ISMULT1(op)) {
10910         *flagp = flags;
10911         return(ret);
10912     }
10913
10914 #if 0                           /* Now runtime fix should be reliable. */
10915
10916     /* if this is reinstated, don't forget to put this back into perldiag:
10917
10918             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10919
10920            (F) The part of the regexp subject to either the * or + quantifier
10921            could match an empty string. The {#} shows in the regular
10922            expression about where the problem was discovered.
10923
10924     */
10925
10926     if (!(flags&HASWIDTH) && op != '?')
10927       vFAIL("Regexp *+ operand could be empty");
10928 #endif
10929
10930 #ifdef RE_TRACK_PATTERN_OFFSETS
10931     parse_start = RExC_parse;
10932 #endif
10933     nextchar(pRExC_state);
10934
10935     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10936
10937     if (op == '*' && (flags&SIMPLE)) {
10938         reginsert(pRExC_state, STAR, ret, depth+1);
10939         ret->flags = 0;
10940         MARK_NAUGHTY(4);
10941         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10942     }
10943     else if (op == '*') {
10944         min = 0;
10945         goto do_curly;
10946     }
10947     else if (op == '+' && (flags&SIMPLE)) {
10948         reginsert(pRExC_state, PLUS, ret, depth+1);
10949         ret->flags = 0;
10950         MARK_NAUGHTY(3);
10951         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10952     }
10953     else if (op == '+') {
10954         min = 1;
10955         goto do_curly;
10956     }
10957     else if (op == '?') {
10958         min = 0; max = 1;
10959         goto do_curly;
10960     }
10961   nest_check:
10962     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10963         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10964         ckWARN2reg(RExC_parse,
10965                    "%"UTF8f" matches null string many times",
10966                    UTF8fARG(UTF, (RExC_parse >= origparse
10967                                  ? RExC_parse - origparse
10968                                  : 0),
10969                    origparse));
10970         (void)ReREFCNT_inc(RExC_rx_sv);
10971     }
10972
10973     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10974         nextchar(pRExC_state);
10975         reginsert(pRExC_state, MINMOD, ret, depth+1);
10976         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10977     }
10978     else
10979     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10980         regnode *ender;
10981         nextchar(pRExC_state);
10982         ender = reg_node(pRExC_state, SUCCEED);
10983         REGTAIL(pRExC_state, ret, ender);
10984         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10985         ret->flags = 0;
10986         ender = reg_node(pRExC_state, TAIL);
10987         REGTAIL(pRExC_state, ret, ender);
10988     }
10989
10990     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10991         RExC_parse++;
10992         vFAIL("Nested quantifiers");
10993     }
10994
10995     return(ret);
10996 }
10997
10998 STATIC STRLEN
10999 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
11000                       UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
11001     )
11002 {
11003
11004  /* This is expected to be called by a parser routine that has recognized '\N'
11005    and needs to handle the rest. RExC_parse is expected to point at the first
11006    char following the N at the time of the call.  On successful return,
11007    RExC_parse has been updated to point to just after the sequence identified
11008    by this routine, <*flagp> has been updated, and the non-NULL input pointers
11009    have been set appropriately.
11010
11011    The typical case for this is \N{some character name}.  This is usually
11012    called while parsing the input, filling in or ready to fill in an EXACTish
11013    node, and the code point for the character should be returned, so that it
11014    can be added to the node, and parsing continued with the next input
11015    character.  But it may be that instead of a single character the \N{}
11016    expands to more than one, a named sequence.  In this case any following
11017    quantifier applies to the whole sequence, and it is easier, given the code
11018    structure that calls this, to handle it from a different area of the code.
11019    For this reason, the input parameters can be set so that it returns valid
11020    only on one or the other of these cases.
11021
11022    Another possibility is for the input to be an empty \N{}, which for
11023    backwards compatibility we accept, but generate a NOTHING node which should
11024    later get optimized out.  This is handled from the area of code which can
11025    handle a named sequence, so if called with the parameters for the other, it
11026    fails.
11027
11028    Still another possibility is for the \N to mean [^\n], and not a single
11029    character or explicit sequence at all.  This is determined by context.
11030    Again, this is handled from the area of code which can handle a named
11031    sequence, so if called with the parameters for the other, it also fails.
11032
11033    And the final possibility is for the \N to be called from within a bracketed
11034    character class.  In this case the [^\n] meaning makes no sense, and so is
11035    an error.  Other anomalous situations are left to the calling code to handle.
11036
11037    For non-single-quoted regexes, the tokenizer has attempted to decide which
11038    of the above applies, and in the case of a named sequence, has converted it
11039    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
11040    where c1... are the characters in the sequence.  For single-quoted regexes,
11041    the tokenizer passes the \N sequence through unchanged; this code will not
11042    attempt to determine this nor expand those, instead raising a syntax error.
11043    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
11044    or there is no '}', it signals that this \N occurrence means to match a
11045    non-newline. (This mostly was done because of [perl #56444].)
11046
11047    The API is somewhat convoluted due to historical and the above reasons.
11048
11049    The function raises an error (via vFAIL), and doesn't return for various
11050    syntax errors.  For other failures, it returns (STRLEN) -1.  For successes,
11051    it returns a count of how many characters were accounted for by it.  (This
11052    can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
11053    points in the sequence.  It sets <node_p>, <valuep>, and/or
11054    <substitute_parse> on success.
11055
11056    If <valuep> is non-null, it means the caller can accept an input sequence
11057    consisting of just a single code point; <*valuep> is set to the value of the
11058    only or first code point in the input.
11059
11060    If <substitute_parse> is non-null, it means the caller can accept an input
11061    sequence consisting of one or more code points; <*substitute_parse> is a
11062    newly created mortal SV* in this case, containing \x{} escapes representing
11063    those code points.
11064
11065    Both <valuep> and <substitute_parse> can be non-NULL.
11066
11067    If <node_p> is non-null, <substitute_parse> must be NULL.  This signifies
11068    that the caller can accept any legal sequence other than a single code
11069    point.  To wit, <*node_p> is set as follows:
11070     1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
11071     2) \N{}:              points to a new NOTHING node; return is 0
11072     3) otherwise:         points to a new EXACT node containing the resolved
11073                           string; return is the number of code points in the
11074                           string.  This will never be 1.
11075    Note that failure is returned for single code point sequences if <valuep> is
11076    null and <node_p> is not.
11077  */
11078
11079     char * endbrace;    /* '}' following the name */
11080     char* p;
11081     char *endchar;      /* Points to '.' or '}' ending cur char in the input
11082                            stream */
11083     bool has_multiple_chars; /* true if the input stream contains a sequence of
11084                                 more than one character */
11085     bool in_char_class = substitute_parse != NULL;
11086     STRLEN count = 0;   /* Number of characters in this sequence */
11087
11088     GET_RE_DEBUG_FLAGS_DECL;
11089
11090     PERL_ARGS_ASSERT_GROK_BSLASH_N;
11091
11092     GET_RE_DEBUG_FLAGS;
11093
11094     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
11095     assert(! (node_p && substitute_parse)); /* At most 1 should be set */
11096
11097     /* The [^\n] meaning of \N ignores spaces and comments under the /x
11098      * modifier.  The other meaning does not, so use a temporary until we find
11099      * out which we are being called with */
11100     p = (RExC_flags & RXf_PMf_EXTENDED)
11101         ? regpatws(pRExC_state, RExC_parse,
11102                                 TRUE) /* means recognize comments */
11103         : RExC_parse;
11104
11105     /* Disambiguate between \N meaning a named character versus \N meaning
11106      * [^\n].  The former is assumed when it can't be the latter. */
11107     if (*p != '{' || regcurly(p)) {
11108         RExC_parse = p;
11109         if (! node_p) {
11110             /* no bare \N allowed in a charclass */
11111             if (in_char_class) {
11112                 vFAIL("\\N in a character class must be a named character: \\N{...}");
11113             }
11114             return (STRLEN) -1;
11115         }
11116         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
11117                            current char */
11118         nextchar(pRExC_state);
11119         *node_p = reg_node(pRExC_state, REG_ANY);
11120         *flagp |= HASWIDTH|SIMPLE;
11121         MARK_NAUGHTY(1);
11122         Set_Node_Length(*node_p, 1); /* MJD */
11123         return 1;
11124     }
11125
11126     /* Here, we have decided it should be a named character or sequence */
11127
11128     /* The test above made sure that the next real character is a '{', but
11129      * under the /x modifier, it could be separated by space (or a comment and
11130      * \n) and this is not allowed (for consistency with \x{...} and the
11131      * tokenizer handling of \N{NAME}). */
11132     if (*RExC_parse != '{') {
11133         vFAIL("Missing braces on \\N{}");
11134     }
11135
11136     RExC_parse++;       /* Skip past the '{' */
11137
11138     if (! (endbrace = strchr(RExC_parse, '}'))  /* no trailing brace */
11139         || ! (endbrace == RExC_parse            /* nothing between the {} */
11140               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked... */
11141                   && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11142                                                        error msg) */
11143     {
11144         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
11145         vFAIL("\\N{NAME} must be resolved by the lexer");
11146     }
11147
11148     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11149
11150     if (endbrace == RExC_parse) {   /* empty: \N{} */
11151         if (node_p) {
11152             *node_p = reg_node(pRExC_state,NOTHING);
11153         }
11154         else if (! in_char_class) {
11155             return (STRLEN) -1;
11156         }
11157         nextchar(pRExC_state);
11158         return 0;
11159     }
11160
11161     RExC_parse += 2;    /* Skip past the 'U+' */
11162
11163     endchar = RExC_parse + strcspn(RExC_parse, ".}");
11164
11165     /* Code points are separated by dots.  If none, there is only one code
11166      * point, and is terminated by the brace */
11167     has_multiple_chars = (endchar < endbrace);
11168
11169     /* We get the first code point if we want it, and either there is only one,
11170      * or we can accept both cases of one and there is more than one */
11171     if (valuep && (substitute_parse || ! has_multiple_chars)) {
11172         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11173         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11174                            | PERL_SCAN_DISALLOW_PREFIX
11175
11176                              /* No errors in the first pass (See [perl
11177                               * #122671].)  We let the code below find the
11178                               * errors when there are multiple chars. */
11179                            | ((SIZE_ONLY || has_multiple_chars)
11180                               ? PERL_SCAN_SILENT_ILLDIGIT
11181                               : 0);
11182
11183         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11184
11185         /* The tokenizer should have guaranteed validity, but it's possible to
11186          * bypass it by using single quoting, so check.  Don't do the check
11187          * here when there are multiple chars; we do it below anyway. */
11188         if (! has_multiple_chars) {
11189             if (length_of_hex == 0
11190                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11191             {
11192                 RExC_parse += length_of_hex;    /* Includes all the valid */
11193                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
11194                                 ? UTF8SKIP(RExC_parse)
11195                                 : 1;
11196                 /* Guard against malformed utf8 */
11197                 if (RExC_parse >= endchar) {
11198                     RExC_parse = endchar;
11199                 }
11200                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11201             }
11202
11203             RExC_parse = endbrace + 1;
11204             return 1;
11205         }
11206     }
11207
11208     /* Here, we should have already handled the case where a single character
11209      * is expected and found.  So it is a failure if we aren't expecting
11210      * multiple chars and got them; or didn't get them but wanted them.  We
11211      * fail without advancing the parse, so that the caller can try again with
11212      * different acceptance criteria */
11213     if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11214         RExC_parse = p;
11215         return (STRLEN) -1;
11216     }
11217
11218     {
11219         /* What is done here is to convert this to a sub-pattern of the form
11220          * \x{char1}\x{char2}...
11221          * and then either return it in <*substitute_parse> if non-null; or
11222          * call reg recursively to parse it (enclosing in "(?: ... )" ).  That
11223          * way, it retains its atomicness, while not having to worry about
11224          * special handling that some code points may have.  toke.c has
11225          * converted the original Unicode values to native, so that we can just
11226          * pass on the hex values unchanged.  We do have to set a flag to keep
11227          * recoding from happening in the recursion */
11228
11229         SV * dummy = NULL;
11230         STRLEN len;
11231         char *orig_end = RExC_end;
11232         I32 flags;
11233
11234         if (substitute_parse) {
11235             *substitute_parse = newSVpvs("");
11236         }
11237         else {
11238             substitute_parse = &dummy;
11239             *substitute_parse = newSVpvs("?:");
11240         }
11241         *substitute_parse = sv_2mortal(*substitute_parse);
11242
11243         while (RExC_parse < endbrace) {
11244
11245             /* Convert to notation the rest of the code understands */
11246             sv_catpv(*substitute_parse, "\\x{");
11247             sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11248             sv_catpv(*substitute_parse, "}");
11249
11250             /* Point to the beginning of the next character in the sequence. */
11251             RExC_parse = endchar + 1;
11252             endchar = RExC_parse + strcspn(RExC_parse, ".}");
11253
11254             count++;
11255         }
11256         if (! in_char_class) {
11257             sv_catpv(*substitute_parse, ")");
11258         }
11259
11260         RExC_parse = SvPV(*substitute_parse, len);
11261
11262         /* Don't allow empty number */
11263         if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11264             RExC_parse = endbrace;
11265             vFAIL("Invalid hexadecimal number in \\N{U+...}");
11266         }
11267         RExC_end = RExC_parse + len;
11268
11269         /* The values are Unicode, and therefore not subject to recoding */
11270         RExC_override_recoding = 1;
11271
11272         if (node_p) {
11273             if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11274                 if (flags & RESTART_UTF8) {
11275                     *flagp = RESTART_UTF8;
11276                     return (STRLEN) -1;
11277                 }
11278                 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11279                     (UV) flags);
11280             }
11281             *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11282         }
11283
11284         RExC_parse = endbrace;
11285         RExC_end = orig_end;
11286         RExC_override_recoding = 0;
11287
11288         nextchar(pRExC_state);
11289     }
11290
11291     return count;
11292 }
11293
11294
11295 /*
11296  * reg_recode
11297  *
11298  * It returns the code point in utf8 for the value in *encp.
11299  *    value: a code value in the source encoding
11300  *    encp:  a pointer to an Encode object
11301  *
11302  * If the result from Encode is not a single character,
11303  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11304  */
11305 STATIC UV
11306 S_reg_recode(pTHX_ const char value, SV **encp)
11307 {
11308     STRLEN numlen = 1;
11309     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11310     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11311     const STRLEN newlen = SvCUR(sv);
11312     UV uv = UNICODE_REPLACEMENT;
11313
11314     PERL_ARGS_ASSERT_REG_RECODE;
11315
11316     if (newlen)
11317         uv = SvUTF8(sv)
11318              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11319              : *(U8*)s;
11320
11321     if (!newlen || numlen != newlen) {
11322         uv = UNICODE_REPLACEMENT;
11323         *encp = NULL;
11324     }
11325     return uv;
11326 }
11327
11328 PERL_STATIC_INLINE U8
11329 S_compute_EXACTish(RExC_state_t *pRExC_state)
11330 {
11331     U8 op;
11332
11333     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11334
11335     if (! FOLD) {
11336         return (LOC)
11337                 ? EXACTL
11338                 : EXACT;
11339     }
11340
11341     op = get_regex_charset(RExC_flags);
11342     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11343         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11344                  been, so there is no hole */
11345     }
11346
11347     return op + EXACTF;
11348 }
11349
11350 PERL_STATIC_INLINE void
11351 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11352                          regnode *node, I32* flagp, STRLEN len, UV code_point,
11353                          bool downgradable)
11354 {
11355     /* This knows the details about sizing an EXACTish node, setting flags for
11356      * it (by setting <*flagp>, and potentially populating it with a single
11357      * character.
11358      *
11359      * If <len> (the length in bytes) is non-zero, this function assumes that
11360      * the node has already been populated, and just does the sizing.  In this
11361      * case <code_point> should be the final code point that has already been
11362      * placed into the node.  This value will be ignored except that under some
11363      * circumstances <*flagp> is set based on it.
11364      *
11365      * If <len> is zero, the function assumes that the node is to contain only
11366      * the single character given by <code_point> and calculates what <len>
11367      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11368      * additionally will populate the node's STRING with <code_point> or its
11369      * fold if folding.
11370      *
11371      * In both cases <*flagp> is appropriately set
11372      *
11373      * It knows that under FOLD, the Latin Sharp S and UTF characters above
11374      * 255, must be folded (the former only when the rules indicate it can
11375      * match 'ss')
11376      *
11377      * When it does the populating, it looks at the flag 'downgradable'.  If
11378      * true with a node that folds, it checks if the single code point
11379      * participates in a fold, and if not downgrades the node to an EXACT.
11380      * This helps the optimizer */
11381
11382     bool len_passed_in = cBOOL(len != 0);
11383     U8 character[UTF8_MAXBYTES_CASE+1];
11384
11385     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11386
11387     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11388      * sizing difference, and is extra work that is thrown away */
11389     if (downgradable && ! PASS2) {
11390         downgradable = FALSE;
11391     }
11392
11393     if (! len_passed_in) {
11394         if (UTF) {
11395             if (UVCHR_IS_INVARIANT(code_point)) {
11396                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11397                     *character = (U8) code_point;
11398                 }
11399                 else { /* Here is /i and not /l. (toFOLD() is defined on just
11400                           ASCII, which isn't the same thing as INVARIANT on
11401                           EBCDIC, but it works there, as the extra invariants
11402                           fold to themselves) */
11403                     *character = toFOLD((U8) code_point);
11404
11405                     /* We can downgrade to an EXACT node if this character
11406                      * isn't a folding one.  Note that this assumes that
11407                      * nothing above Latin1 folds to some other invariant than
11408                      * one of these alphabetics; otherwise we would also have
11409                      * to check:
11410                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11411                      *      || ASCII_FOLD_RESTRICTED))
11412                      */
11413                     if (downgradable && PL_fold[code_point] == code_point) {
11414                         OP(node) = EXACT;
11415                     }
11416                 }
11417                 len = 1;
11418             }
11419             else if (FOLD && (! LOC
11420                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11421             {   /* Folding, and ok to do so now */
11422                 UV folded = _to_uni_fold_flags(
11423                                    code_point,
11424                                    character,
11425                                    &len,
11426                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11427                                                       ? FOLD_FLAGS_NOMIX_ASCII
11428                                                       : 0));
11429                 if (downgradable
11430                     && folded == code_point /* This quickly rules out many
11431                                                cases, avoiding the
11432                                                _invlist_contains_cp() overhead
11433                                                for those.  */
11434                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11435                 {
11436                     OP(node) = (LOC)
11437                                ? EXACTL
11438                                : EXACT;
11439                 }
11440             }
11441             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11442
11443                 /* Not folding this cp, and can output it directly */
11444                 *character = UTF8_TWO_BYTE_HI(code_point);
11445                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11446                 len = 2;
11447             }
11448             else {
11449                 uvchr_to_utf8( character, code_point);
11450                 len = UTF8SKIP(character);
11451             }
11452         } /* Else pattern isn't UTF8.  */
11453         else if (! FOLD) {
11454             *character = (U8) code_point;
11455             len = 1;
11456         } /* Else is folded non-UTF8 */
11457         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11458
11459             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11460              * comments at join_exact()); */
11461             *character = (U8) code_point;
11462             len = 1;
11463
11464             /* Can turn into an EXACT node if we know the fold at compile time,
11465              * and it folds to itself and doesn't particpate in other folds */
11466             if (downgradable
11467                 && ! LOC
11468                 && PL_fold_latin1[code_point] == code_point
11469                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11470                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11471             {
11472                 OP(node) = EXACT;
11473             }
11474         } /* else is Sharp s.  May need to fold it */
11475         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11476             *character = 's';
11477             *(character + 1) = 's';
11478             len = 2;
11479         }
11480         else {
11481             *character = LATIN_SMALL_LETTER_SHARP_S;
11482             len = 1;
11483         }
11484     }
11485
11486     if (SIZE_ONLY) {
11487         RExC_size += STR_SZ(len);
11488     }
11489     else {
11490         RExC_emit += STR_SZ(len);
11491         STR_LEN(node) = len;
11492         if (! len_passed_in) {
11493             Copy((char *) character, STRING(node), len, char);
11494         }
11495     }
11496
11497     *flagp |= HASWIDTH;
11498
11499     /* A single character node is SIMPLE, except for the special-cased SHARP S
11500      * under /di. */
11501     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11502         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11503             || ! FOLD || ! DEPENDS_SEMANTICS))
11504     {
11505         *flagp |= SIMPLE;
11506     }
11507
11508     /* The OP may not be well defined in PASS1 */
11509     if (PASS2 && OP(node) == EXACTFL) {
11510         RExC_contains_locale = 1;
11511     }
11512 }
11513
11514
11515 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11516  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11517
11518 static I32
11519 S_backref_value(char *p)
11520 {
11521     const char* endptr;
11522     UV val = grok_atou(p, &endptr);
11523     if (endptr == p || endptr == NULL || val > I32_MAX)
11524         return I32_MAX;
11525     return (I32)val;
11526 }
11527
11528
11529 /*
11530  - regatom - the lowest level
11531
11532    Try to identify anything special at the start of the pattern. If there
11533    is, then handle it as required. This may involve generating a single regop,
11534    such as for an assertion; or it may involve recursing, such as to
11535    handle a () structure.
11536
11537    If the string doesn't start with something special then we gobble up
11538    as much literal text as we can.
11539
11540    Once we have been able to handle whatever type of thing started the
11541    sequence, we return.
11542
11543    Note: we have to be careful with escapes, as they can be both literal
11544    and special, and in the case of \10 and friends, context determines which.
11545
11546    A summary of the code structure is:
11547
11548    switch (first_byte) {
11549         cases for each special:
11550             handle this special;
11551             break;
11552         case '\\':
11553             switch (2nd byte) {
11554                 cases for each unambiguous special:
11555                     handle this special;
11556                     break;
11557                 cases for each ambigous special/literal:
11558                     disambiguate;
11559                     if (special)  handle here
11560                     else goto defchar;
11561                 default: // unambiguously literal:
11562                     goto defchar;
11563             }
11564         default:  // is a literal char
11565             // FALL THROUGH
11566         defchar:
11567             create EXACTish node for literal;
11568             while (more input and node isn't full) {
11569                 switch (input_byte) {
11570                    cases for each special;
11571                        make sure parse pointer is set so that the next call to
11572                            regatom will see this special first
11573                        goto loopdone; // EXACTish node terminated by prev. char
11574                    default:
11575                        append char to EXACTISH node;
11576                 }
11577                 get next input byte;
11578             }
11579         loopdone:
11580    }
11581    return the generated node;
11582
11583    Specifically there are two separate switches for handling
11584    escape sequences, with the one for handling literal escapes requiring
11585    a dummy entry for all of the special escapes that are actually handled
11586    by the other.
11587
11588    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11589    TRYAGAIN.
11590    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11591    restarted.
11592    Otherwise does not return NULL.
11593 */
11594
11595 STATIC regnode *
11596 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11597 {
11598     regnode *ret = NULL;
11599     I32 flags = 0;
11600     char *parse_start = RExC_parse;
11601     U8 op;
11602     int invert = 0;
11603     U8 arg;
11604
11605     GET_RE_DEBUG_FLAGS_DECL;
11606
11607     *flagp = WORST;             /* Tentatively. */
11608
11609     DEBUG_PARSE("atom");
11610
11611     PERL_ARGS_ASSERT_REGATOM;
11612
11613 tryagain:
11614     switch ((U8)*RExC_parse) {
11615     case '^':
11616         RExC_seen_zerolen++;
11617         nextchar(pRExC_state);
11618         if (RExC_flags & RXf_PMf_MULTILINE)
11619             ret = reg_node(pRExC_state, MBOL);
11620         else
11621             ret = reg_node(pRExC_state, SBOL);
11622         Set_Node_Length(ret, 1); /* MJD */
11623         break;
11624     case '$':
11625         nextchar(pRExC_state);
11626         if (*RExC_parse)
11627             RExC_seen_zerolen++;
11628         if (RExC_flags & RXf_PMf_MULTILINE)
11629             ret = reg_node(pRExC_state, MEOL);
11630         else
11631             ret = reg_node(pRExC_state, SEOL);
11632         Set_Node_Length(ret, 1); /* MJD */
11633         break;
11634     case '.':
11635         nextchar(pRExC_state);
11636         if (RExC_flags & RXf_PMf_SINGLELINE)
11637             ret = reg_node(pRExC_state, SANY);
11638         else
11639             ret = reg_node(pRExC_state, REG_ANY);
11640         *flagp |= HASWIDTH|SIMPLE;
11641         MARK_NAUGHTY(1);
11642         Set_Node_Length(ret, 1); /* MJD */
11643         break;
11644     case '[':
11645     {
11646         char * const oregcomp_parse = ++RExC_parse;
11647         ret = regclass(pRExC_state, flagp,depth+1,
11648                        FALSE, /* means parse the whole char class */
11649                        TRUE, /* allow multi-char folds */
11650                        FALSE, /* don't silence non-portable warnings. */
11651                        NULL);
11652         if (*RExC_parse != ']') {
11653             RExC_parse = oregcomp_parse;
11654             vFAIL("Unmatched [");
11655         }
11656         if (ret == NULL) {
11657             if (*flagp & RESTART_UTF8)
11658                 return NULL;
11659             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11660                   (UV) *flagp);
11661         }
11662         nextchar(pRExC_state);
11663         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11664         break;
11665     }
11666     case '(':
11667         nextchar(pRExC_state);
11668         ret = reg(pRExC_state, 2, &flags,depth+1);
11669         if (ret == NULL) {
11670                 if (flags & TRYAGAIN) {
11671                     if (RExC_parse == RExC_end) {
11672                          /* Make parent create an empty node if needed. */
11673                         *flagp |= TRYAGAIN;
11674                         return(NULL);
11675                     }
11676                     goto tryagain;
11677                 }
11678                 if (flags & RESTART_UTF8) {
11679                     *flagp = RESTART_UTF8;
11680                     return NULL;
11681                 }
11682                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11683                                                                  (UV) flags);
11684         }
11685         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11686         break;
11687     case '|':
11688     case ')':
11689         if (flags & TRYAGAIN) {
11690             *flagp |= TRYAGAIN;
11691             return NULL;
11692         }
11693         vFAIL("Internal urp");
11694                                 /* Supposed to be caught earlier. */
11695         break;
11696     case '?':
11697     case '+':
11698     case '*':
11699         RExC_parse++;
11700         vFAIL("Quantifier follows nothing");
11701         break;
11702     case '\\':
11703         /* Special Escapes
11704
11705            This switch handles escape sequences that resolve to some kind
11706            of special regop and not to literal text. Escape sequnces that
11707            resolve to literal text are handled below in the switch marked
11708            "Literal Escapes".
11709
11710            Every entry in this switch *must* have a corresponding entry
11711            in the literal escape switch. However, the opposite is not
11712            required, as the default for this switch is to jump to the
11713            literal text handling code.
11714         */
11715         switch ((U8)*++RExC_parse) {
11716         /* Special Escapes */
11717         case 'A':
11718             RExC_seen_zerolen++;
11719             ret = reg_node(pRExC_state, SBOL);
11720             /* SBOL is shared with /^/ so we set the flags so we can tell
11721              * /\A/ from /^/ in split. We check ret because first pass we
11722              * have no regop struct to set the flags on. */
11723             if (PASS2)
11724                 ret->flags = 1;
11725             *flagp |= SIMPLE;
11726             goto finish_meta_pat;
11727         case 'G':
11728             ret = reg_node(pRExC_state, GPOS);
11729             RExC_seen |= REG_GPOS_SEEN;
11730             *flagp |= SIMPLE;
11731             goto finish_meta_pat;
11732         case 'K':
11733             RExC_seen_zerolen++;
11734             ret = reg_node(pRExC_state, KEEPS);
11735             *flagp |= SIMPLE;
11736             /* XXX:dmq : disabling in-place substitution seems to
11737              * be necessary here to avoid cases of memory corruption, as
11738              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11739              */
11740             RExC_seen |= REG_LOOKBEHIND_SEEN;
11741             goto finish_meta_pat;
11742         case 'Z':
11743             ret = reg_node(pRExC_state, SEOL);
11744             *flagp |= SIMPLE;
11745             RExC_seen_zerolen++;                /* Do not optimize RE away */
11746             goto finish_meta_pat;
11747         case 'z':
11748             ret = reg_node(pRExC_state, EOS);
11749             *flagp |= SIMPLE;
11750             RExC_seen_zerolen++;                /* Do not optimize RE away */
11751             goto finish_meta_pat;
11752         case 'C':
11753             ret = reg_node(pRExC_state, CANY);
11754             RExC_seen |= REG_CANY_SEEN;
11755             *flagp |= HASWIDTH|SIMPLE;
11756             if (PASS2) {
11757                 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11758             }
11759             goto finish_meta_pat;
11760         case 'X':
11761             ret = reg_node(pRExC_state, CLUMP);
11762             *flagp |= HASWIDTH;
11763             goto finish_meta_pat;
11764
11765         case 'W':
11766             invert = 1;
11767             /* FALLTHROUGH */
11768         case 'w':
11769             arg = ANYOF_WORDCHAR;
11770             goto join_posix;
11771
11772         case 'b':
11773             RExC_seen_zerolen++;
11774             RExC_seen |= REG_LOOKBEHIND_SEEN;
11775             op = BOUND + get_regex_charset(RExC_flags);
11776             if (op > BOUNDA) {  /* /aa is same as /a */
11777                 op = BOUNDA;
11778             }
11779             else if (op == BOUNDL) {
11780                 RExC_contains_locale = 1;
11781             }
11782             ret = reg_node(pRExC_state, op);
11783             FLAGS(ret) = get_regex_charset(RExC_flags);
11784             *flagp |= SIMPLE;
11785             if ((U8) *(RExC_parse + 1) == '{') {
11786                 /* diag_listed_as: Use "%s" instead of "%s" */
11787                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11788             }
11789             goto finish_meta_pat;
11790         case 'B':
11791             RExC_seen_zerolen++;
11792             RExC_seen |= REG_LOOKBEHIND_SEEN;
11793             op = NBOUND + get_regex_charset(RExC_flags);
11794             if (op > NBOUNDA) { /* /aa is same as /a */
11795                 op = NBOUNDA;
11796             }
11797             else if (op == NBOUNDL) {
11798                 RExC_contains_locale = 1;
11799             }
11800             ret = reg_node(pRExC_state, op);
11801             FLAGS(ret) = get_regex_charset(RExC_flags);
11802             *flagp |= SIMPLE;
11803             if ((U8) *(RExC_parse + 1) == '{') {
11804                 /* diag_listed_as: Use "%s" instead of "%s" */
11805                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11806             }
11807             goto finish_meta_pat;
11808
11809         case 'D':
11810             invert = 1;
11811             /* FALLTHROUGH */
11812         case 'd':
11813             arg = ANYOF_DIGIT;
11814             goto join_posix;
11815
11816         case 'R':
11817             ret = reg_node(pRExC_state, LNBREAK);
11818             *flagp |= HASWIDTH|SIMPLE;
11819             goto finish_meta_pat;
11820
11821         case 'H':
11822             invert = 1;
11823             /* FALLTHROUGH */
11824         case 'h':
11825             arg = ANYOF_BLANK;
11826             op = POSIXU;
11827             goto join_posix_op_known;
11828
11829         case 'V':
11830             invert = 1;
11831             /* FALLTHROUGH */
11832         case 'v':
11833             arg = ANYOF_VERTWS;
11834             op = POSIXU;
11835             goto join_posix_op_known;
11836
11837         case 'S':
11838             invert = 1;
11839             /* FALLTHROUGH */
11840         case 's':
11841             arg = ANYOF_SPACE;
11842
11843         join_posix:
11844
11845             op = POSIXD + get_regex_charset(RExC_flags);
11846             if (op > POSIXA) {  /* /aa is same as /a */
11847                 op = POSIXA;
11848             }
11849             else if (op == POSIXL) {
11850                 RExC_contains_locale = 1;
11851             }
11852
11853         join_posix_op_known:
11854
11855             if (invert) {
11856                 op += NPOSIXD - POSIXD;
11857             }
11858
11859             ret = reg_node(pRExC_state, op);
11860             if (! SIZE_ONLY) {
11861                 FLAGS(ret) = namedclass_to_classnum(arg);
11862             }
11863
11864             *flagp |= HASWIDTH|SIMPLE;
11865             /* FALLTHROUGH */
11866
11867          finish_meta_pat:
11868             nextchar(pRExC_state);
11869             Set_Node_Length(ret, 2); /* MJD */
11870             break;
11871         case 'p':
11872         case 'P':
11873             {
11874 #ifdef DEBUGGING
11875                 char* parse_start = RExC_parse - 2;
11876 #endif
11877
11878                 RExC_parse--;
11879
11880                 ret = regclass(pRExC_state, flagp,depth+1,
11881                                TRUE, /* means just parse this element */
11882                                FALSE, /* don't allow multi-char folds */
11883                                FALSE, /* don't silence non-portable warnings.
11884                                          It would be a bug if these returned
11885                                          non-portables */
11886                                NULL);
11887                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11888                    are allowed.  */
11889                 if (!ret)
11890                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11891                           (UV) *flagp);
11892
11893                 RExC_parse--;
11894
11895                 Set_Node_Offset(ret, parse_start + 2);
11896                 Set_Node_Cur_Length(ret, parse_start);
11897                 nextchar(pRExC_state);
11898             }
11899             break;
11900         case 'N':
11901             /* Handle \N and \N{NAME} with multiple code points here and not
11902              * below because it can be multicharacter. join_exact() will join
11903              * them up later on.  Also this makes sure that things like
11904              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11905              * The options to the grok function call causes it to fail if the
11906              * sequence is just a single code point.  We then go treat it as
11907              * just another character in the current EXACT node, and hence it
11908              * gets uniform treatment with all the other characters.  The
11909              * special treatment for quantifiers is not needed for such single
11910              * character sequences */
11911             ++RExC_parse;
11912             if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11913                                              depth, FALSE))
11914             {
11915                 if (*flagp & RESTART_UTF8)
11916                     return NULL;
11917                 RExC_parse--;
11918                 goto defchar;
11919             }
11920             break;
11921         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11922         parse_named_seq:
11923         {
11924             char ch= RExC_parse[1];
11925             if (ch != '<' && ch != '\'' && ch != '{') {
11926                 RExC_parse++;
11927                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11928                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11929             } else {
11930                 /* this pretty much dupes the code for (?P=...) in reg(), if
11931                    you change this make sure you change that */
11932                 char* name_start = (RExC_parse += 2);
11933                 U32 num = 0;
11934                 SV *sv_dat = reg_scan_name(pRExC_state,
11935                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11936                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11937                 if (RExC_parse == name_start || *RExC_parse != ch)
11938                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11939                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11940
11941                 if (!SIZE_ONLY) {
11942                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11943                     RExC_rxi->data->data[num]=(void*)sv_dat;
11944                     SvREFCNT_inc_simple_void(sv_dat);
11945                 }
11946
11947                 RExC_sawback = 1;
11948                 ret = reganode(pRExC_state,
11949                                ((! FOLD)
11950                                  ? NREF
11951                                  : (ASCII_FOLD_RESTRICTED)
11952                                    ? NREFFA
11953                                    : (AT_LEAST_UNI_SEMANTICS)
11954                                      ? NREFFU
11955                                      : (LOC)
11956                                        ? NREFFL
11957                                        : NREFF),
11958                                 num);
11959                 *flagp |= HASWIDTH;
11960
11961                 /* override incorrect value set in reganode MJD */
11962                 Set_Node_Offset(ret, parse_start+1);
11963                 Set_Node_Cur_Length(ret, parse_start);
11964                 nextchar(pRExC_state);
11965
11966             }
11967             break;
11968         }
11969         case 'g':
11970         case '1': case '2': case '3': case '4':
11971         case '5': case '6': case '7': case '8': case '9':
11972             {
11973                 I32 num;
11974                 bool hasbrace = 0;
11975
11976                 if (*RExC_parse == 'g') {
11977                     bool isrel = 0;
11978
11979                     RExC_parse++;
11980                     if (*RExC_parse == '{') {
11981                         RExC_parse++;
11982                         hasbrace = 1;
11983                     }
11984                     if (*RExC_parse == '-') {
11985                         RExC_parse++;
11986                         isrel = 1;
11987                     }
11988                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11989                         if (isrel) RExC_parse--;
11990                         RExC_parse -= 2;
11991                         goto parse_named_seq;
11992                     }
11993
11994                     num = S_backref_value(RExC_parse);
11995                     if (num == 0)
11996                         vFAIL("Reference to invalid group 0");
11997                     else if (num == I32_MAX) {
11998                          if (isDIGIT(*RExC_parse))
11999                             vFAIL("Reference to nonexistent group");
12000                         else
12001                             vFAIL("Unterminated \\g... pattern");
12002                     }
12003
12004                     if (isrel) {
12005                         num = RExC_npar - num;
12006                         if (num < 1)
12007                             vFAIL("Reference to nonexistent or unclosed group");
12008                     }
12009                 }
12010                 else {
12011                     num = S_backref_value(RExC_parse);
12012                     /* bare \NNN might be backref or octal - if it is larger than or equal
12013                      * RExC_npar then it is assumed to be and octal escape.
12014                      * Note RExC_npar is +1 from the actual number of parens*/
12015                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
12016                             && *RExC_parse != '8' && *RExC_parse != '9'))
12017                     {
12018                         /* Probably a character specified in octal, e.g. \35 */
12019                         goto defchar;
12020                     }
12021                 }
12022
12023                 /* at this point RExC_parse definitely points to a backref
12024                  * number */
12025                 {
12026 #ifdef RE_TRACK_PATTERN_OFFSETS
12027                     char * const parse_start = RExC_parse - 1; /* MJD */
12028 #endif
12029                     while (isDIGIT(*RExC_parse))
12030                         RExC_parse++;
12031                     if (hasbrace) {
12032                         if (*RExC_parse != '}')
12033                             vFAIL("Unterminated \\g{...} pattern");
12034                         RExC_parse++;
12035                     }
12036                     if (!SIZE_ONLY) {
12037                         if (num > (I32)RExC_rx->nparens)
12038                             vFAIL("Reference to nonexistent group");
12039                     }
12040                     RExC_sawback = 1;
12041                     ret = reganode(pRExC_state,
12042                                    ((! FOLD)
12043                                      ? REF
12044                                      : (ASCII_FOLD_RESTRICTED)
12045                                        ? REFFA
12046                                        : (AT_LEAST_UNI_SEMANTICS)
12047                                          ? REFFU
12048                                          : (LOC)
12049                                            ? REFFL
12050                                            : REFF),
12051                                     num);
12052                     *flagp |= HASWIDTH;
12053
12054                     /* override incorrect value set in reganode MJD */
12055                     Set_Node_Offset(ret, parse_start+1);
12056                     Set_Node_Cur_Length(ret, parse_start);
12057                     RExC_parse--;
12058                     nextchar(pRExC_state);
12059                 }
12060             }
12061             break;
12062         case '\0':
12063             if (RExC_parse >= RExC_end)
12064                 FAIL("Trailing \\");
12065             /* FALLTHROUGH */
12066         default:
12067             /* Do not generate "unrecognized" warnings here, we fall
12068                back into the quick-grab loop below */
12069             parse_start--;
12070             goto defchar;
12071         }
12072         break;
12073
12074     case '#':
12075         if (RExC_flags & RXf_PMf_EXTENDED) {
12076             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12077             if (RExC_parse < RExC_end)
12078                 goto tryagain;
12079         }
12080         /* FALLTHROUGH */
12081
12082     default:
12083
12084             parse_start = RExC_parse - 1;
12085
12086             RExC_parse++;
12087
12088         defchar: {
12089             STRLEN len = 0;
12090             UV ender = 0;
12091             char *p;
12092             char *s;
12093 #define MAX_NODE_STRING_SIZE 127
12094             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12095             char *s0;
12096             U8 upper_parse = MAX_NODE_STRING_SIZE;
12097             U8 node_type = compute_EXACTish(pRExC_state);
12098             bool next_is_quantifier;
12099             char * oldp = NULL;
12100
12101             /* We can convert EXACTF nodes to EXACTFU if they contain only
12102              * characters that match identically regardless of the target
12103              * string's UTF8ness.  The reason to do this is that EXACTF is not
12104              * trie-able, EXACTFU is.
12105              *
12106              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12107              * contain only above-Latin1 characters (hence must be in UTF8),
12108              * which don't participate in folds with Latin1-range characters,
12109              * as the latter's folds aren't known until runtime.  (We don't
12110              * need to figure this out until pass 2) */
12111             bool maybe_exactfu = PASS2
12112                                && (node_type == EXACTF || node_type == EXACTFL);
12113
12114             /* If a folding node contains only code points that don't
12115              * participate in folds, it can be changed into an EXACT node,
12116              * which allows the optimizer more things to look for */
12117             bool maybe_exact;
12118
12119             ret = reg_node(pRExC_state, node_type);
12120
12121             /* In pass1, folded, we use a temporary buffer instead of the
12122              * actual node, as the node doesn't exist yet */
12123             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12124
12125             s0 = s;
12126
12127         reparse:
12128
12129             /* We do the EXACTFish to EXACT node only if folding.  (And we
12130              * don't need to figure this out until pass 2) */
12131             maybe_exact = FOLD && PASS2;
12132
12133             /* XXX The node can hold up to 255 bytes, yet this only goes to
12134              * 127.  I (khw) do not know why.  Keeping it somewhat less than
12135              * 255 allows us to not have to worry about overflow due to
12136              * converting to utf8 and fold expansion, but that value is
12137              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
12138              * split up by this limit into a single one using the real max of
12139              * 255.  Even at 127, this breaks under rare circumstances.  If
12140              * folding, we do not want to split a node at a character that is a
12141              * non-final in a multi-char fold, as an input string could just
12142              * happen to want to match across the node boundary.  The join
12143              * would solve that problem if the join actually happens.  But a
12144              * series of more than two nodes in a row each of 127 would cause
12145              * the first join to succeed to get to 254, but then there wouldn't
12146              * be room for the next one, which could at be one of those split
12147              * multi-char folds.  I don't know of any fool-proof solution.  One
12148              * could back off to end with only a code point that isn't such a
12149              * non-final, but it is possible for there not to be any in the
12150              * entire node. */
12151             for (p = RExC_parse - 1;
12152                  len < upper_parse && p < RExC_end;
12153                  len++)
12154             {
12155                 oldp = p;
12156
12157                 if (RExC_flags & RXf_PMf_EXTENDED)
12158                     p = regpatws(pRExC_state, p,
12159                                           TRUE); /* means recognize comments */
12160                 switch ((U8)*p) {
12161                 case '^':
12162                 case '$':
12163                 case '.':
12164                 case '[':
12165                 case '(':
12166                 case ')':
12167                 case '|':
12168                     goto loopdone;
12169                 case '\\':
12170                     /* Literal Escapes Switch
12171
12172                        This switch is meant to handle escape sequences that
12173                        resolve to a literal character.
12174
12175                        Every escape sequence that represents something
12176                        else, like an assertion or a char class, is handled
12177                        in the switch marked 'Special Escapes' above in this
12178                        routine, but also has an entry here as anything that
12179                        isn't explicitly mentioned here will be treated as
12180                        an unescaped equivalent literal.
12181                     */
12182
12183                     switch ((U8)*++p) {
12184                     /* These are all the special escapes. */
12185                     case 'A':             /* Start assertion */
12186                     case 'b': case 'B':   /* Word-boundary assertion*/
12187                     case 'C':             /* Single char !DANGEROUS! */
12188                     case 'd': case 'D':   /* digit class */
12189                     case 'g': case 'G':   /* generic-backref, pos assertion */
12190                     case 'h': case 'H':   /* HORIZWS */
12191                     case 'k': case 'K':   /* named backref, keep marker */
12192                     case 'p': case 'P':   /* Unicode property */
12193                               case 'R':   /* LNBREAK */
12194                     case 's': case 'S':   /* space class */
12195                     case 'v': case 'V':   /* VERTWS */
12196                     case 'w': case 'W':   /* word class */
12197                     case 'X':             /* eXtended Unicode "combining
12198                                              character sequence" */
12199                     case 'z': case 'Z':   /* End of line/string assertion */
12200                         --p;
12201                         goto loopdone;
12202
12203                     /* Anything after here is an escape that resolves to a
12204                        literal. (Except digits, which may or may not)
12205                      */
12206                     case 'n':
12207                         ender = '\n';
12208                         p++;
12209                         break;
12210                     case 'N': /* Handle a single-code point named character. */
12211                         /* The options cause it to fail if a multiple code
12212                          * point sequence.  Handle those in the switch() above
12213                          * */
12214                         RExC_parse = p + 1;
12215                         if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12216                                                          &ender,
12217                                                          flagp,
12218                                                          depth,
12219                                                          FALSE
12220                         )) {
12221                             if (*flagp & RESTART_UTF8)
12222                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12223                             RExC_parse = p = oldp;
12224                             goto loopdone;
12225                         }
12226                         p = RExC_parse;
12227                         if (ender > 0xff) {
12228                             REQUIRE_UTF8;
12229                         }
12230                         break;
12231                     case 'r':
12232                         ender = '\r';
12233                         p++;
12234                         break;
12235                     case 't':
12236                         ender = '\t';
12237                         p++;
12238                         break;
12239                     case 'f':
12240                         ender = '\f';
12241                         p++;
12242                         break;
12243                     case 'e':
12244                         ender = ESC_NATIVE;
12245                         p++;
12246                         break;
12247                     case 'a':
12248                         ender = '\a';
12249                         p++;
12250                         break;
12251                     case 'o':
12252                         {
12253                             UV result;
12254                             const char* error_msg;
12255
12256                             bool valid = grok_bslash_o(&p,
12257                                                        &result,
12258                                                        &error_msg,
12259                                                        PASS2, /* out warnings */
12260                                                        FALSE, /* not strict */
12261                                                        TRUE, /* Output warnings
12262                                                                 for non-
12263                                                                 portables */
12264                                                        UTF);
12265                             if (! valid) {
12266                                 RExC_parse = p; /* going to die anyway; point
12267                                                    to exact spot of failure */
12268                                 vFAIL(error_msg);
12269                             }
12270                             ender = result;
12271                             if (IN_ENCODING && ender < 0x100) {
12272                                 goto recode_encoding;
12273                             }
12274                             if (ender > 0xff) {
12275                                 REQUIRE_UTF8;
12276                             }
12277                             break;
12278                         }
12279                     case 'x':
12280                         {
12281                             UV result = UV_MAX; /* initialize to erroneous
12282                                                    value */
12283                             const char* error_msg;
12284
12285                             bool valid = grok_bslash_x(&p,
12286                                                        &result,
12287                                                        &error_msg,
12288                                                        PASS2, /* out warnings */
12289                                                        FALSE, /* not strict */
12290                                                        TRUE, /* Output warnings
12291                                                                 for non-
12292                                                                 portables */
12293                                                        UTF);
12294                             if (! valid) {
12295                                 RExC_parse = p; /* going to die anyway; point
12296                                                    to exact spot of failure */
12297                                 vFAIL(error_msg);
12298                             }
12299                             ender = result;
12300
12301                             if (IN_ENCODING && ender < 0x100) {
12302                                 goto recode_encoding;
12303                             }
12304                             if (ender > 0xff) {
12305                                 REQUIRE_UTF8;
12306                             }
12307                             break;
12308                         }
12309                     case 'c':
12310                         p++;
12311                         ender = grok_bslash_c(*p++, PASS2);
12312                         break;
12313                     case '8': case '9': /* must be a backreference */
12314                         --p;
12315                         goto loopdone;
12316                     case '1': case '2': case '3':case '4':
12317                     case '5': case '6': case '7':
12318                         /* When we parse backslash escapes there is ambiguity
12319                          * between backreferences and octal escapes. Any escape
12320                          * from \1 - \9 is a backreference, any multi-digit
12321                          * escape which does not start with 0 and which when
12322                          * evaluated as decimal could refer to an already
12323                          * parsed capture buffer is a backslash. Anything else
12324                          * is octal.
12325                          *
12326                          * Note this implies that \118 could be interpreted as
12327                          * 118 OR as "\11" . "8" depending on whether there
12328                          * were 118 capture buffers defined already in the
12329                          * pattern.  */
12330
12331                         /* NOTE, RExC_npar is 1 more than the actual number of
12332                          * parens we have seen so far, hence the < RExC_npar below. */
12333
12334                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12335                         {  /* Not to be treated as an octal constant, go
12336                                    find backref */
12337                             --p;
12338                             goto loopdone;
12339                         }
12340                         /* FALLTHROUGH */
12341                     case '0':
12342                         {
12343                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12344                             STRLEN numlen = 3;
12345                             ender = grok_oct(p, &numlen, &flags, NULL);
12346                             if (ender > 0xff) {
12347                                 REQUIRE_UTF8;
12348                             }
12349                             p += numlen;
12350                             if (PASS2   /* like \08, \178 */
12351                                 && numlen < 3
12352                                 && p < RExC_end
12353                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12354                             {
12355                                 reg_warn_non_literal_string(
12356                                          p + 1,
12357                                          form_short_octal_warning(p, numlen));
12358                             }
12359                         }
12360                         if (IN_ENCODING && ender < 0x100)
12361                             goto recode_encoding;
12362                         break;
12363                     recode_encoding:
12364                         if (! RExC_override_recoding) {
12365                             SV* enc = _get_encoding();
12366                             ender = reg_recode((const char)(U8)ender, &enc);
12367                             if (!enc && PASS2)
12368                                 ckWARNreg(p, "Invalid escape in the specified encoding");
12369                             REQUIRE_UTF8;
12370                         }
12371                         break;
12372                     case '\0':
12373                         if (p >= RExC_end)
12374                             FAIL("Trailing \\");
12375                         /* FALLTHROUGH */
12376                     default:
12377                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12378                             /* Include any { following the alpha to emphasize
12379                              * that it could be part of an escape at some point
12380                              * in the future */
12381                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12382                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12383                         }
12384                         goto normal_default;
12385                     } /* End of switch on '\' */
12386                     break;
12387                 case '{':
12388                     /* Currently we don't warn when the lbrace is at the start
12389                      * of a construct.  This catches it in the middle of a
12390                      * literal string, or when its the first thing after
12391                      * something like "\b" */
12392                     if (! SIZE_ONLY
12393                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12394                     {
12395                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12396                     }
12397                     /*FALLTHROUGH*/
12398                 default:    /* A literal character */
12399                   normal_default:
12400                     if (UTF8_IS_START(*p) && UTF) {
12401                         STRLEN numlen;
12402                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12403                                                &numlen, UTF8_ALLOW_DEFAULT);
12404                         p += numlen;
12405                     }
12406                     else
12407                         ender = (U8) *p++;
12408                     break;
12409                 } /* End of switch on the literal */
12410
12411                 /* Here, have looked at the literal character and <ender>
12412                  * contains its ordinal, <p> points to the character after it
12413                  */
12414
12415                 if ( RExC_flags & RXf_PMf_EXTENDED)
12416                     p = regpatws(pRExC_state, p,
12417                                           TRUE); /* means recognize comments */
12418
12419                 /* If the next thing is a quantifier, it applies to this
12420                  * character only, which means that this character has to be in
12421                  * its own node and can't just be appended to the string in an
12422                  * existing node, so if there are already other characters in
12423                  * the node, close the node with just them, and set up to do
12424                  * this character again next time through, when it will be the
12425                  * only thing in its new node */
12426                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12427                 {
12428                     p = oldp;
12429                     goto loopdone;
12430                 }
12431
12432                 if (! FOLD) {  /* The simple case, just append the literal */
12433
12434                     /* In the sizing pass, we need only the size of the
12435                      * character we are appending, hence we can delay getting
12436                      * its representation until PASS2. */
12437                     if (SIZE_ONLY) {
12438                         if (UTF) {
12439                             const STRLEN unilen = UNISKIP(ender);
12440                             s += unilen;
12441
12442                             /* We have to subtract 1 just below (and again in
12443                              * the corresponding PASS2 code) because the loop
12444                              * increments <len> each time, as all but this path
12445                              * (and one other) through it add a single byte to
12446                              * the EXACTish node.  But these paths would change
12447                              * len to be the correct final value, so cancel out
12448                              * the increment that follows */
12449                             len += unilen - 1;
12450                         }
12451                         else {
12452                             s++;
12453                         }
12454                     } else { /* PASS2 */
12455                       not_fold_common:
12456                         if (UTF) {
12457                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
12458                             len += (char *) new_s - s - 1;
12459                             s = (char *) new_s;
12460                         }
12461                         else {
12462                             *(s++) = (char) ender;
12463                         }
12464                     }
12465                 }
12466                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
12467
12468                     /* Here are folding under /l, and the code point is
12469                      * problematic.  First, we know we can't simplify things */
12470                     maybe_exact = FALSE;
12471                     maybe_exactfu = FALSE;
12472
12473                     /* A problematic code point in this context means that its
12474                      * fold isn't known until runtime, so we can't fold it now.
12475                      * (The non-problematic code points are the above-Latin1
12476                      * ones that fold to also all above-Latin1.  Their folds
12477                      * don't vary no matter what the locale is.) But here we
12478                      * have characters whose fold depends on the locale.
12479                      * Unlike the non-folding case above, we have to keep track
12480                      * of these in the sizing pass, so that we can make sure we
12481                      * don't split too-long nodes in the middle of a potential
12482                      * multi-char fold.  And unlike the regular fold case
12483                      * handled in the else clauses below, we don't actually
12484                      * fold and don't have special cases to consider.  What we
12485                      * do for both passes is the PASS2 code for non-folding */
12486                     goto not_fold_common;
12487                 }
12488                 else /* A regular FOLD code point */
12489                     if (! ( UTF
12490                         /* See comments for join_exact() as to why we fold this
12491                          * non-UTF at compile time */
12492                         || (node_type == EXACTFU
12493                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12494                 {
12495                     /* Here, are folding and are not UTF-8 encoded; therefore
12496                      * the character must be in the range 0-255, and is not /l
12497                      * (Not /l because we already handled these under /l in
12498                      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12499                     if (IS_IN_SOME_FOLD_L1(ender)) {
12500                         maybe_exact = FALSE;
12501
12502                         /* See if the character's fold differs between /d and
12503                          * /u.  This includes the multi-char fold SHARP S to
12504                          * 'ss' */
12505                         if (maybe_exactfu
12506                             && (PL_fold[ender] != PL_fold_latin1[ender]
12507                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12508                                 || (len > 0
12509                                    && isALPHA_FOLD_EQ(ender, 's')
12510                                    && isALPHA_FOLD_EQ(*(s-1), 's'))))
12511                         {
12512                             maybe_exactfu = FALSE;
12513                         }
12514                     }
12515
12516                     /* Even when folding, we store just the input character, as
12517                      * we have an array that finds its fold quickly */
12518                     *(s++) = (char) ender;
12519                 }
12520                 else {  /* FOLD and UTF */
12521                     /* Unlike the non-fold case, we do actually have to
12522                      * calculate the results here in pass 1.  This is for two
12523                      * reasons, the folded length may be longer than the
12524                      * unfolded, and we have to calculate how many EXACTish
12525                      * nodes it will take; and we may run out of room in a node
12526                      * in the middle of a potential multi-char fold, and have
12527                      * to back off accordingly.  */
12528
12529                     UV folded;
12530                     if (isASCII_uni(ender)) {
12531                         folded = toFOLD(ender);
12532                         *(s)++ = (U8) folded;
12533                     }
12534                     else {
12535                         STRLEN foldlen;
12536
12537                         folded = _to_uni_fold_flags(
12538                                      ender,
12539                                      (U8 *) s,
12540                                      &foldlen,
12541                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12542                                                         ? FOLD_FLAGS_NOMIX_ASCII
12543                                                         : 0));
12544                         s += foldlen;
12545
12546                         /* The loop increments <len> each time, as all but this
12547                          * path (and one other) through it add a single byte to
12548                          * the EXACTish node.  But this one has changed len to
12549                          * be the correct final value, so subtract one to
12550                          * cancel out the increment that follows */
12551                         len += foldlen - 1;
12552                     }
12553                     /* If this node only contains non-folding code points so
12554                      * far, see if this new one is also non-folding */
12555                     if (maybe_exact) {
12556                         if (folded != ender) {
12557                             maybe_exact = FALSE;
12558                         }
12559                         else {
12560                             /* Here the fold is the original; we have to check
12561                              * further to see if anything folds to it */
12562                             if (_invlist_contains_cp(PL_utf8_foldable,
12563                                                         ender))
12564                             {
12565                                 maybe_exact = FALSE;
12566                             }
12567                         }
12568                     }
12569                     ender = folded;
12570                 }
12571
12572                 if (next_is_quantifier) {
12573
12574                     /* Here, the next input is a quantifier, and to get here,
12575                      * the current character is the only one in the node.
12576                      * Also, here <len> doesn't include the final byte for this
12577                      * character */
12578                     len++;
12579                     goto loopdone;
12580                 }
12581
12582             } /* End of loop through literal characters */
12583
12584             /* Here we have either exhausted the input or ran out of room in
12585              * the node.  (If we encountered a character that can't be in the
12586              * node, transfer is made directly to <loopdone>, and so we
12587              * wouldn't have fallen off the end of the loop.)  In the latter
12588              * case, we artificially have to split the node into two, because
12589              * we just don't have enough space to hold everything.  This
12590              * creates a problem if the final character participates in a
12591              * multi-character fold in the non-final position, as a match that
12592              * should have occurred won't, due to the way nodes are matched,
12593              * and our artificial boundary.  So back off until we find a non-
12594              * problematic character -- one that isn't at the beginning or
12595              * middle of such a fold.  (Either it doesn't participate in any
12596              * folds, or appears only in the final position of all the folds it
12597              * does participate in.)  A better solution with far fewer false
12598              * positives, and that would fill the nodes more completely, would
12599              * be to actually have available all the multi-character folds to
12600              * test against, and to back-off only far enough to be sure that
12601              * this node isn't ending with a partial one.  <upper_parse> is set
12602              * further below (if we need to reparse the node) to include just
12603              * up through that final non-problematic character that this code
12604              * identifies, so when it is set to less than the full node, we can
12605              * skip the rest of this */
12606             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12607
12608                 const STRLEN full_len = len;
12609
12610                 assert(len >= MAX_NODE_STRING_SIZE);
12611
12612                 /* Here, <s> points to the final byte of the final character.
12613                  * Look backwards through the string until find a non-
12614                  * problematic character */
12615
12616                 if (! UTF) {
12617
12618                     /* This has no multi-char folds to non-UTF characters */
12619                     if (ASCII_FOLD_RESTRICTED) {
12620                         goto loopdone;
12621                     }
12622
12623                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12624                     len = s - s0 + 1;
12625                 }
12626                 else {
12627                     if (!  PL_NonL1NonFinalFold) {
12628                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12629                                         NonL1_Perl_Non_Final_Folds_invlist);
12630                     }
12631
12632                     /* Point to the first byte of the final character */
12633                     s = (char *) utf8_hop((U8 *) s, -1);
12634
12635                     while (s >= s0) {   /* Search backwards until find
12636                                            non-problematic char */
12637                         if (UTF8_IS_INVARIANT(*s)) {
12638
12639                             /* There are no ascii characters that participate
12640                              * in multi-char folds under /aa.  In EBCDIC, the
12641                              * non-ascii invariants are all control characters,
12642                              * so don't ever participate in any folds. */
12643                             if (ASCII_FOLD_RESTRICTED
12644                                 || ! IS_NON_FINAL_FOLD(*s))
12645                             {
12646                                 break;
12647                             }
12648                         }
12649                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12650                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12651                                                                   *s, *(s+1))))
12652                             {
12653                                 break;
12654                             }
12655                         }
12656                         else if (! _invlist_contains_cp(
12657                                         PL_NonL1NonFinalFold,
12658                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12659                         {
12660                             break;
12661                         }
12662
12663                         /* Here, the current character is problematic in that
12664                          * it does occur in the non-final position of some
12665                          * fold, so try the character before it, but have to
12666                          * special case the very first byte in the string, so
12667                          * we don't read outside the string */
12668                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12669                     } /* End of loop backwards through the string */
12670
12671                     /* If there were only problematic characters in the string,
12672                      * <s> will point to before s0, in which case the length
12673                      * should be 0, otherwise include the length of the
12674                      * non-problematic character just found */
12675                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12676                 }
12677
12678                 /* Here, have found the final character, if any, that is
12679                  * non-problematic as far as ending the node without splitting
12680                  * it across a potential multi-char fold.  <len> contains the
12681                  * number of bytes in the node up-to and including that
12682                  * character, or is 0 if there is no such character, meaning
12683                  * the whole node contains only problematic characters.  In
12684                  * this case, give up and just take the node as-is.  We can't
12685                  * do any better */
12686                 if (len == 0) {
12687                     len = full_len;
12688
12689                     /* If the node ends in an 's' we make sure it stays EXACTF,
12690                      * as if it turns into an EXACTFU, it could later get
12691                      * joined with another 's' that would then wrongly match
12692                      * the sharp s */
12693                     if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12694                     {
12695                         maybe_exactfu = FALSE;
12696                     }
12697                 } else {
12698
12699                     /* Here, the node does contain some characters that aren't
12700                      * problematic.  If one such is the final character in the
12701                      * node, we are done */
12702                     if (len == full_len) {
12703                         goto loopdone;
12704                     }
12705                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12706
12707                         /* If the final character is problematic, but the
12708                          * penultimate is not, back-off that last character to
12709                          * later start a new node with it */
12710                         p = oldp;
12711                         goto loopdone;
12712                     }
12713
12714                     /* Here, the final non-problematic character is earlier
12715                      * in the input than the penultimate character.  What we do
12716                      * is reparse from the beginning, going up only as far as
12717                      * this final ok one, thus guaranteeing that the node ends
12718                      * in an acceptable character.  The reason we reparse is
12719                      * that we know how far in the character is, but we don't
12720                      * know how to correlate its position with the input parse.
12721                      * An alternate implementation would be to build that
12722                      * correlation as we go along during the original parse,
12723                      * but that would entail extra work for every node, whereas
12724                      * this code gets executed only when the string is too
12725                      * large for the node, and the final two characters are
12726                      * problematic, an infrequent occurrence.  Yet another
12727                      * possible strategy would be to save the tail of the
12728                      * string, and the next time regatom is called, initialize
12729                      * with that.  The problem with this is that unless you
12730                      * back off one more character, you won't be guaranteed
12731                      * regatom will get called again, unless regbranch,
12732                      * regpiece ... are also changed.  If you do back off that
12733                      * extra character, so that there is input guaranteed to
12734                      * force calling regatom, you can't handle the case where
12735                      * just the first character in the node is acceptable.  I
12736                      * (khw) decided to try this method which doesn't have that
12737                      * pitfall; if performance issues are found, we can do a
12738                      * combination of the current approach plus that one */
12739                     upper_parse = len;
12740                     len = 0;
12741                     s = s0;
12742                     goto reparse;
12743                 }
12744             }   /* End of verifying node ends with an appropriate char */
12745
12746         loopdone:   /* Jumped to when encounters something that shouldn't be in
12747                        the node */
12748
12749             /* I (khw) don't know if you can get here with zero length, but the
12750              * old code handled this situation by creating a zero-length EXACT
12751              * node.  Might as well be NOTHING instead */
12752             if (len == 0) {
12753                 OP(ret) = NOTHING;
12754             }
12755             else {
12756                 if (FOLD) {
12757                     /* If 'maybe_exact' is still set here, means there are no
12758                      * code points in the node that participate in folds;
12759                      * similarly for 'maybe_exactfu' and code points that match
12760                      * differently depending on UTF8ness of the target string
12761                      * (for /u), or depending on locale for /l */
12762                     if (maybe_exact) {
12763                         OP(ret) = (LOC)
12764                                   ? EXACTL
12765                                   : EXACT;
12766                     }
12767                     else if (maybe_exactfu) {
12768                         OP(ret) = (LOC)
12769                                   ? EXACTFLU8
12770                                   : EXACTFU;
12771                     }
12772                 }
12773                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12774                                            FALSE /* Don't look to see if could
12775                                                     be turned into an EXACT
12776                                                     node, as we have already
12777                                                     computed that */
12778                                           );
12779             }
12780
12781             RExC_parse = p - 1;
12782             Set_Node_Cur_Length(ret, parse_start);
12783             nextchar(pRExC_state);
12784             {
12785                 /* len is STRLEN which is unsigned, need to copy to signed */
12786                 IV iv = len;
12787                 if (iv < 0)
12788                     vFAIL("Internal disaster");
12789             }
12790
12791         } /* End of label 'defchar:' */
12792         break;
12793     } /* End of giant switch on input character */
12794
12795     return(ret);
12796 }
12797
12798 STATIC char *
12799 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12800 {
12801     /* Returns the next non-pattern-white space, non-comment character (the
12802      * latter only if 'recognize_comment is true) in the string p, which is
12803      * ended by RExC_end.  See also reg_skipcomment */
12804     const char *e = RExC_end;
12805
12806     PERL_ARGS_ASSERT_REGPATWS;
12807
12808     while (p < e) {
12809         STRLEN len;
12810         if ((len = is_PATWS_safe(p, e, UTF))) {
12811             p += len;
12812         }
12813         else if (recognize_comment && *p == '#') {
12814             p = reg_skipcomment(pRExC_state, p);
12815         }
12816         else
12817             break;
12818     }
12819     return p;
12820 }
12821
12822 STATIC void
12823 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12824 {
12825     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12826      * sets up the bitmap and any flags, removing those code points from the
12827      * inversion list, setting it to NULL should it become completely empty */
12828
12829     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12830     assert(PL_regkind[OP(node)] == ANYOF);
12831
12832     ANYOF_BITMAP_ZERO(node);
12833     if (*invlist_ptr) {
12834
12835         /* This gets set if we actually need to modify things */
12836         bool change_invlist = FALSE;
12837
12838         UV start, end;
12839
12840         /* Start looking through *invlist_ptr */
12841         invlist_iterinit(*invlist_ptr);
12842         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12843             UV high;
12844             int i;
12845
12846             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12847                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12848             }
12849             else if (end >= NUM_ANYOF_CODE_POINTS) {
12850                 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12851             }
12852
12853             /* Quit if are above what we should change */
12854             if (start >= NUM_ANYOF_CODE_POINTS) {
12855                 break;
12856             }
12857
12858             change_invlist = TRUE;
12859
12860             /* Set all the bits in the range, up to the max that we are doing */
12861             high = (end < NUM_ANYOF_CODE_POINTS - 1)
12862                    ? end
12863                    : NUM_ANYOF_CODE_POINTS - 1;
12864             for (i = start; i <= (int) high; i++) {
12865                 if (! ANYOF_BITMAP_TEST(node, i)) {
12866                     ANYOF_BITMAP_SET(node, i);
12867                 }
12868             }
12869         }
12870         invlist_iterfinish(*invlist_ptr);
12871
12872         /* Done with loop; remove any code points that are in the bitmap from
12873          * *invlist_ptr; similarly for code points above the bitmap if we have
12874          * a flag to match all of them anyways */
12875         if (change_invlist) {
12876             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12877         }
12878         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12879             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12880         }
12881
12882         /* If have completely emptied it, remove it completely */
12883         if (_invlist_len(*invlist_ptr) == 0) {
12884             SvREFCNT_dec_NN(*invlist_ptr);
12885             *invlist_ptr = NULL;
12886         }
12887     }
12888 }
12889
12890 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12891    Character classes ([:foo:]) can also be negated ([:^foo:]).
12892    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12893    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12894    but trigger failures because they are currently unimplemented. */
12895
12896 #define POSIXCC_DONE(c)   ((c) == ':')
12897 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12898 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12899
12900 PERL_STATIC_INLINE I32
12901 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12902 {
12903     I32 namedclass = OOB_NAMEDCLASS;
12904
12905     PERL_ARGS_ASSERT_REGPPOSIXCC;
12906
12907     if (value == '[' && RExC_parse + 1 < RExC_end &&
12908         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12909         POSIXCC(UCHARAT(RExC_parse)))
12910     {
12911         const char c = UCHARAT(RExC_parse);
12912         char* const s = RExC_parse++;
12913
12914         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12915             RExC_parse++;
12916         if (RExC_parse == RExC_end) {
12917             if (strict) {
12918
12919                 /* Try to give a better location for the error (than the end of
12920                  * the string) by looking for the matching ']' */
12921                 RExC_parse = s;
12922                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12923                     RExC_parse++;
12924                 }
12925                 vFAIL2("Unmatched '%c' in POSIX class", c);
12926             }
12927             /* Grandfather lone [:, [=, [. */
12928             RExC_parse = s;
12929         }
12930         else {
12931             const char* const t = RExC_parse++; /* skip over the c */
12932             assert(*t == c);
12933
12934             if (UCHARAT(RExC_parse) == ']') {
12935                 const char *posixcc = s + 1;
12936                 RExC_parse++; /* skip over the ending ] */
12937
12938                 if (*s == ':') {
12939                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12940                     const I32 skip = t - posixcc;
12941
12942                     /* Initially switch on the length of the name.  */
12943                     switch (skip) {
12944                     case 4:
12945                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12946                                                           this is the Perl \w
12947                                                         */
12948                             namedclass = ANYOF_WORDCHAR;
12949                         break;
12950                     case 5:
12951                         /* Names all of length 5.  */
12952                         /* alnum alpha ascii blank cntrl digit graph lower
12953                            print punct space upper  */
12954                         /* Offset 4 gives the best switch position.  */
12955                         switch (posixcc[4]) {
12956                         case 'a':
12957                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12958                                 namedclass = ANYOF_ALPHA;
12959                             break;
12960                         case 'e':
12961                             if (memEQ(posixcc, "spac", 4)) /* space */
12962                                 namedclass = ANYOF_PSXSPC;
12963                             break;
12964                         case 'h':
12965                             if (memEQ(posixcc, "grap", 4)) /* graph */
12966                                 namedclass = ANYOF_GRAPH;
12967                             break;
12968                         case 'i':
12969                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12970                                 namedclass = ANYOF_ASCII;
12971                             break;
12972                         case 'k':
12973                             if (memEQ(posixcc, "blan", 4)) /* blank */
12974                                 namedclass = ANYOF_BLANK;
12975                             break;
12976                         case 'l':
12977                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12978                                 namedclass = ANYOF_CNTRL;
12979                             break;
12980                         case 'm':
12981                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12982                                 namedclass = ANYOF_ALPHANUMERIC;
12983                             break;
12984                         case 'r':
12985                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12986                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12987                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12988                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12989                             break;
12990                         case 't':
12991                             if (memEQ(posixcc, "digi", 4)) /* digit */
12992                                 namedclass = ANYOF_DIGIT;
12993                             else if (memEQ(posixcc, "prin", 4)) /* print */
12994                                 namedclass = ANYOF_PRINT;
12995                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12996                                 namedclass = ANYOF_PUNCT;
12997                             break;
12998                         }
12999                         break;
13000                     case 6:
13001                         if (memEQ(posixcc, "xdigit", 6))
13002                             namedclass = ANYOF_XDIGIT;
13003                         break;
13004                     }
13005
13006                     if (namedclass == OOB_NAMEDCLASS)
13007                         vFAIL2utf8f(
13008                             "POSIX class [:%"UTF8f":] unknown",
13009                             UTF8fARG(UTF, t - s - 1, s + 1));
13010
13011                     /* The #defines are structured so each complement is +1 to
13012                      * the normal one */
13013                     if (complement) {
13014                         namedclass++;
13015                     }
13016                     assert (posixcc[skip] == ':');
13017                     assert (posixcc[skip+1] == ']');
13018                 } else if (!SIZE_ONLY) {
13019                     /* [[=foo=]] and [[.foo.]] are still future. */
13020
13021                     /* adjust RExC_parse so the warning shows after
13022                        the class closes */
13023                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13024                         RExC_parse++;
13025                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13026                 }
13027             } else {
13028                 /* Maternal grandfather:
13029                  * "[:" ending in ":" but not in ":]" */
13030                 if (strict) {
13031                     vFAIL("Unmatched '[' in POSIX class");
13032                 }
13033
13034                 /* Grandfather lone [:, [=, [. */
13035                 RExC_parse = s;
13036             }
13037         }
13038     }
13039
13040     return namedclass;
13041 }
13042
13043 STATIC bool
13044 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13045 {
13046     /* This applies some heuristics at the current parse position (which should
13047      * be at a '[') to see if what follows might be intended to be a [:posix:]
13048      * class.  It returns true if it really is a posix class, of course, but it
13049      * also can return true if it thinks that what was intended was a posix
13050      * class that didn't quite make it.
13051      *
13052      * It will return true for
13053      *      [:alphanumerics:
13054      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
13055      *                         ')' indicating the end of the (?[
13056      *      [:any garbage including %^&$ punctuation:]
13057      *
13058      * This is designed to be called only from S_handle_regex_sets; it could be
13059      * easily adapted to be called from the spot at the beginning of regclass()
13060      * that checks to see in a normal bracketed class if the surrounding []
13061      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
13062      * change long-standing behavior, so I (khw) didn't do that */
13063     char* p = RExC_parse + 1;
13064     char first_char = *p;
13065
13066     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13067
13068     assert(*(p - 1) == '[');
13069
13070     if (! POSIXCC(first_char)) {
13071         return FALSE;
13072     }
13073
13074     p++;
13075     while (p < RExC_end && isWORDCHAR(*p)) p++;
13076
13077     if (p >= RExC_end) {
13078         return FALSE;
13079     }
13080
13081     if (p - RExC_parse > 2    /* Got at least 1 word character */
13082         && (*p == first_char
13083             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13084     {
13085         return TRUE;
13086     }
13087
13088     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13089
13090     return (p
13091             && p - RExC_parse > 2 /* [:] evaluates to colon;
13092                                       [::] is a bad posix class. */
13093             && first_char == *(p - 1));
13094 }
13095
13096 STATIC regnode *
13097 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13098                     I32 *flagp, U32 depth,
13099                     char * const oregcomp_parse)
13100 {
13101     /* Handle the (?[...]) construct to do set operations */
13102
13103     U8 curchar;
13104     UV start, end;      /* End points of code point ranges */
13105     SV* result_string;
13106     char *save_end, *save_parse;
13107     SV* final;
13108     STRLEN len;
13109     regnode* node;
13110     AV* stack;
13111     const bool save_fold = FOLD;
13112
13113     GET_RE_DEBUG_FLAGS_DECL;
13114
13115     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13116
13117     if (LOC) {
13118         vFAIL("(?[...]) not valid in locale");
13119     }
13120     RExC_uni_semantics = 1;
13121
13122     /* This will return only an ANYOF regnode, or (unlikely) something smaller
13123      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
13124      * call regclass to handle '[]' so as to not have to reinvent its parsing
13125      * rules here (throwing away the size it computes each time).  And, we exit
13126      * upon an unescaped ']' that isn't one ending a regclass.  To do both
13127      * these things, we need to realize that something preceded by a backslash
13128      * is escaped, so we have to keep track of backslashes */
13129     if (PASS2) {
13130         Perl_ck_warner_d(aTHX_
13131             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13132             "The regex_sets feature is experimental" REPORT_LOCATION,
13133                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13134                 UTF8fARG(UTF,
13135                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13136                          RExC_precomp + (RExC_parse - RExC_precomp)));
13137     }
13138     else {
13139         UV depth = 0; /* how many nested (?[...]) constructs */
13140
13141         while (RExC_parse < RExC_end) {
13142             SV* current = NULL;
13143             RExC_parse = regpatws(pRExC_state, RExC_parse,
13144                                           TRUE); /* means recognize comments */
13145             switch (*RExC_parse) {
13146                 case '?':
13147                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
13148                     /* FALLTHROUGH */
13149                 default:
13150                     break;
13151                 case '\\':
13152                     /* Skip the next byte (which could cause us to end up in
13153                      * the middle of a UTF-8 character, but since none of those
13154                      * are confusable with anything we currently handle in this
13155                      * switch (invariants all), it's safe.  We'll just hit the
13156                      * default: case next time and keep on incrementing until
13157                      * we find one of the invariants we do handle. */
13158                     RExC_parse++;
13159                     break;
13160                 case '[':
13161                 {
13162                     /* If this looks like it is a [:posix:] class, leave the
13163                      * parse pointer at the '[' to fool regclass() into
13164                      * thinking it is part of a '[[:posix:]]'.  That function
13165                      * will use strict checking to force a syntax error if it
13166                      * doesn't work out to a legitimate class */
13167                     bool is_posix_class
13168                                     = could_it_be_a_POSIX_class(pRExC_state);
13169                     if (! is_posix_class) {
13170                         RExC_parse++;
13171                     }
13172
13173                     /* regclass() can only return RESTART_UTF8 if multi-char
13174                        folds are allowed.  */
13175                     if (!regclass(pRExC_state, flagp,depth+1,
13176                                   is_posix_class, /* parse the whole char
13177                                                      class only if not a
13178                                                      posix class */
13179                                   FALSE, /* don't allow multi-char folds */
13180                                   TRUE, /* silence non-portable warnings. */
13181                                   &current))
13182                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13183                               (UV) *flagp);
13184
13185                     /* function call leaves parse pointing to the ']', except
13186                      * if we faked it */
13187                     if (is_posix_class) {
13188                         RExC_parse--;
13189                     }
13190
13191                     SvREFCNT_dec(current);   /* In case it returned something */
13192                     break;
13193                 }
13194
13195                 case ']':
13196                     if (depth--) break;
13197                     RExC_parse++;
13198                     if (RExC_parse < RExC_end
13199                         && *RExC_parse == ')')
13200                     {
13201                         node = reganode(pRExC_state, ANYOF, 0);
13202                         RExC_size += ANYOF_SKIP;
13203                         nextchar(pRExC_state);
13204                         Set_Node_Length(node,
13205                                 RExC_parse - oregcomp_parse + 1); /* MJD */
13206                         return node;
13207                     }
13208                     goto no_close;
13209             }
13210             RExC_parse++;
13211         }
13212
13213         no_close:
13214         FAIL("Syntax error in (?[...])");
13215     }
13216
13217     /* Pass 2 only after this.  Everything in this construct is a
13218      * metacharacter.  Operands begin with either a '\' (for an escape
13219      * sequence), or a '[' for a bracketed character class.  Any other
13220      * character should be an operator, or parenthesis for grouping.  Both
13221      * types of operands are handled by calling regclass() to parse them.  It
13222      * is called with a parameter to indicate to return the computed inversion
13223      * list.  The parsing here is implemented via a stack.  Each entry on the
13224      * stack is a single character representing one of the operators, or the
13225      * '('; or else a pointer to an operand inversion list. */
13226
13227 #define IS_OPERAND(a)  (! SvIOK(a))
13228
13229     /* The stack starts empty.  It is a syntax error if the first thing parsed
13230      * is a binary operator; everything else is pushed on the stack.  When an
13231      * operand is parsed, the top of the stack is examined.  If it is a binary
13232      * operator, the item before it should be an operand, and both are replaced
13233      * by the result of doing that operation on the new operand and the one on
13234      * the stack.   Thus a sequence of binary operands is reduced to a single
13235      * one before the next one is parsed.
13236      *
13237      * A unary operator may immediately follow a binary in the input, for
13238      * example
13239      *      [a] + ! [b]
13240      * When an operand is parsed and the top of the stack is a unary operator,
13241      * the operation is performed, and then the stack is rechecked to see if
13242      * this new operand is part of a binary operation; if so, it is handled as
13243      * above.
13244      *
13245      * A '(' is simply pushed on the stack; it is valid only if the stack is
13246      * empty, or the top element of the stack is an operator or another '('
13247      * (for which the parenthesized expression will become an operand).  By the
13248      * time the corresponding ')' is parsed everything in between should have
13249      * been parsed and evaluated to a single operand (or else is a syntax
13250      * error), and is handled as a regular operand */
13251
13252     sv_2mortal((SV *)(stack = newAV()));
13253
13254     while (RExC_parse < RExC_end) {
13255         I32 top_index = av_tindex(stack);
13256         SV** top_ptr;
13257         SV* current = NULL;
13258
13259         /* Skip white space */
13260         RExC_parse = regpatws(pRExC_state, RExC_parse,
13261                                          TRUE /* means recognize comments */ );
13262         if (RExC_parse >= RExC_end) {
13263             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13264         }
13265         if ((curchar = UCHARAT(RExC_parse)) == ']') {
13266             break;
13267         }
13268
13269         switch (curchar) {
13270
13271             case '?':
13272                 if (av_tindex(stack) >= 0   /* This makes sure that we can
13273                                                safely subtract 1 from
13274                                                RExC_parse in the next clause.
13275                                                If we have something on the
13276                                                stack, we have parsed something
13277                                              */
13278                     && UCHARAT(RExC_parse - 1) == '('
13279                     && RExC_parse < RExC_end)
13280                 {
13281                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13282                      * This happens when we have some thing like
13283                      *
13284                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13285                      *   ...
13286                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
13287                      *
13288                      * Here we would be handling the interpolated
13289                      * '$thai_or_lao'.  We handle this by a recursive call to
13290                      * ourselves which returns the inversion list the
13291                      * interpolated expression evaluates to.  We use the flags
13292                      * from the interpolated pattern. */
13293                     U32 save_flags = RExC_flags;
13294                     const char * const save_parse = ++RExC_parse;
13295
13296                     parse_lparen_question_flags(pRExC_state);
13297
13298                     if (RExC_parse == save_parse  /* Makes sure there was at
13299                                                      least one flag (or this
13300                                                      embedding wasn't compiled)
13301                                                    */
13302                         || RExC_parse >= RExC_end - 4
13303                         || UCHARAT(RExC_parse) != ':'
13304                         || UCHARAT(++RExC_parse) != '('
13305                         || UCHARAT(++RExC_parse) != '?'
13306                         || UCHARAT(++RExC_parse) != '[')
13307                     {
13308
13309                         /* In combination with the above, this moves the
13310                          * pointer to the point just after the first erroneous
13311                          * character (or if there are no flags, to where they
13312                          * should have been) */
13313                         if (RExC_parse >= RExC_end - 4) {
13314                             RExC_parse = RExC_end;
13315                         }
13316                         else if (RExC_parse != save_parse) {
13317                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13318                         }
13319                         vFAIL("Expecting '(?flags:(?[...'");
13320                     }
13321                     RExC_parse++;
13322                     (void) handle_regex_sets(pRExC_state, &current, flagp,
13323                                                     depth+1, oregcomp_parse);
13324
13325                     /* Here, 'current' contains the embedded expression's
13326                      * inversion list, and RExC_parse points to the trailing
13327                      * ']'; the next character should be the ')' which will be
13328                      * paired with the '(' that has been put on the stack, so
13329                      * the whole embedded expression reduces to '(operand)' */
13330                     RExC_parse++;
13331
13332                     RExC_flags = save_flags;
13333                     goto handle_operand;
13334                 }
13335                 /* FALLTHROUGH */
13336
13337             default:
13338                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13339                 vFAIL("Unexpected character");
13340
13341             case '\\':
13342                 /* regclass() can only return RESTART_UTF8 if multi-char
13343                    folds are allowed.  */
13344                 if (!regclass(pRExC_state, flagp,depth+1,
13345                               TRUE, /* means parse just the next thing */
13346                               FALSE, /* don't allow multi-char folds */
13347                               FALSE, /* don't silence non-portable warnings.  */
13348                               &current))
13349                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13350                           (UV) *flagp);
13351                 /* regclass() will return with parsing just the \ sequence,
13352                  * leaving the parse pointer at the next thing to parse */
13353                 RExC_parse--;
13354                 goto handle_operand;
13355
13356             case '[':   /* Is a bracketed character class */
13357             {
13358                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13359
13360                 if (! is_posix_class) {
13361                     RExC_parse++;
13362                 }
13363
13364                 /* regclass() can only return RESTART_UTF8 if multi-char
13365                    folds are allowed.  */
13366                 if(!regclass(pRExC_state, flagp,depth+1,
13367                              is_posix_class, /* parse the whole char class
13368                                                 only if not a posix class */
13369                              FALSE, /* don't allow multi-char folds */
13370                              FALSE, /* don't silence non-portable warnings.  */
13371                              &current))
13372                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13373                           (UV) *flagp);
13374                 /* function call leaves parse pointing to the ']', except if we
13375                  * faked it */
13376                 if (is_posix_class) {
13377                     RExC_parse--;
13378                 }
13379
13380                 goto handle_operand;
13381             }
13382
13383             case '&':
13384             case '|':
13385             case '+':
13386             case '-':
13387             case '^':
13388                 if (top_index < 0
13389                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13390                     || ! IS_OPERAND(*top_ptr))
13391                 {
13392                     RExC_parse++;
13393                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13394                 }
13395                 av_push(stack, newSVuv(curchar));
13396                 break;
13397
13398             case '!':
13399                 av_push(stack, newSVuv(curchar));
13400                 break;
13401
13402             case '(':
13403                 if (top_index >= 0) {
13404                     top_ptr = av_fetch(stack, top_index, FALSE);
13405                     assert(top_ptr);
13406                     if (IS_OPERAND(*top_ptr)) {
13407                         RExC_parse++;
13408                         vFAIL("Unexpected '(' with no preceding operator");
13409                     }
13410                 }
13411                 av_push(stack, newSVuv(curchar));
13412                 break;
13413
13414             case ')':
13415             {
13416                 SV* lparen;
13417                 if (top_index < 1
13418                     || ! (current = av_pop(stack))
13419                     || ! IS_OPERAND(current)
13420                     || ! (lparen = av_pop(stack))
13421                     || IS_OPERAND(lparen)
13422                     || SvUV(lparen) != '(')
13423                 {
13424                     SvREFCNT_dec(current);
13425                     RExC_parse++;
13426                     vFAIL("Unexpected ')'");
13427                 }
13428                 top_index -= 2;
13429                 SvREFCNT_dec_NN(lparen);
13430
13431                 /* FALLTHROUGH */
13432             }
13433
13434               handle_operand:
13435
13436                 /* Here, we have an operand to process, in 'current' */
13437
13438                 if (top_index < 0) {    /* Just push if stack is empty */
13439                     av_push(stack, current);
13440                 }
13441                 else {
13442                     SV* top = av_pop(stack);
13443                     SV *prev = NULL;
13444                     char current_operator;
13445
13446                     if (IS_OPERAND(top)) {
13447                         SvREFCNT_dec_NN(top);
13448                         SvREFCNT_dec_NN(current);
13449                         vFAIL("Operand with no preceding operator");
13450                     }
13451                     current_operator = (char) SvUV(top);
13452                     switch (current_operator) {
13453                         case '(':   /* Push the '(' back on followed by the new
13454                                        operand */
13455                             av_push(stack, top);
13456                             av_push(stack, current);
13457                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13458                                                    just after the 'break', so
13459                                                    it doesn't get wrongly freed
13460                                                  */
13461                             break;
13462
13463                         case '!':
13464                             _invlist_invert(current);
13465
13466                             /* Unlike binary operators, the top of the stack,
13467                              * now that this unary one has been popped off, may
13468                              * legally be an operator, and we now have operand
13469                              * for it. */
13470                             top_index--;
13471                             SvREFCNT_dec_NN(top);
13472                             goto handle_operand;
13473
13474                         case '&':
13475                             prev = av_pop(stack);
13476                             _invlist_intersection(prev,
13477                                                    current,
13478                                                    &current);
13479                             av_push(stack, current);
13480                             break;
13481
13482                         case '|':
13483                         case '+':
13484                             prev = av_pop(stack);
13485                             _invlist_union(prev, current, &current);
13486                             av_push(stack, current);
13487                             break;
13488
13489                         case '-':
13490                             prev = av_pop(stack);;
13491                             _invlist_subtract(prev, current, &current);
13492                             av_push(stack, current);
13493                             break;
13494
13495                         case '^':   /* The union minus the intersection */
13496                         {
13497                             SV* i = NULL;
13498                             SV* u = NULL;
13499                             SV* element;
13500
13501                             prev = av_pop(stack);
13502                             _invlist_union(prev, current, &u);
13503                             _invlist_intersection(prev, current, &i);
13504                             /* _invlist_subtract will overwrite current
13505                                 without freeing what it already contains */
13506                             element = current;
13507                             _invlist_subtract(u, i, &current);
13508                             av_push(stack, current);
13509                             SvREFCNT_dec_NN(i);
13510                             SvREFCNT_dec_NN(u);
13511                             SvREFCNT_dec_NN(element);
13512                             break;
13513                         }
13514
13515                         default:
13516                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13517                 }
13518                 SvREFCNT_dec_NN(top);
13519                 SvREFCNT_dec(prev);
13520             }
13521         }
13522
13523         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13524     }
13525
13526     if (av_tindex(stack) < 0   /* Was empty */
13527         || ((final = av_pop(stack)) == NULL)
13528         || ! IS_OPERAND(final)
13529         || av_tindex(stack) >= 0)  /* More left on stack */
13530     {
13531         vFAIL("Incomplete expression within '(?[ ])'");
13532     }
13533
13534     /* Here, 'final' is the resultant inversion list from evaluating the
13535      * expression.  Return it if so requested */
13536     if (return_invlist) {
13537         *return_invlist = final;
13538         return END;
13539     }
13540
13541     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13542      * expecting a string of ranges and individual code points */
13543     invlist_iterinit(final);
13544     result_string = newSVpvs("");
13545     while (invlist_iternext(final, &start, &end)) {
13546         if (start == end) {
13547             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13548         }
13549         else {
13550             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13551                                                      start,          end);
13552         }
13553     }
13554
13555     save_parse = RExC_parse;
13556     RExC_parse = SvPV(result_string, len);
13557     save_end = RExC_end;
13558     RExC_end = RExC_parse + len;
13559
13560     /* We turn off folding around the call, as the class we have constructed
13561      * already has all folding taken into consideration, and we don't want
13562      * regclass() to add to that */
13563     RExC_flags &= ~RXf_PMf_FOLD;
13564     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13565      */
13566     node = regclass(pRExC_state, flagp,depth+1,
13567                     FALSE, /* means parse the whole char class */
13568                     FALSE, /* don't allow multi-char folds */
13569                     TRUE, /* silence non-portable warnings.  The above may very
13570                              well have generated non-portable code points, but
13571                              they're valid on this machine */
13572                     NULL);
13573     if (!node)
13574         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13575                     PTR2UV(flagp));
13576     if (save_fold) {
13577         RExC_flags |= RXf_PMf_FOLD;
13578     }
13579     RExC_parse = save_parse + 1;
13580     RExC_end = save_end;
13581     SvREFCNT_dec_NN(final);
13582     SvREFCNT_dec_NN(result_string);
13583
13584     nextchar(pRExC_state);
13585     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13586     return node;
13587 }
13588 #undef IS_OPERAND
13589
13590 STATIC void
13591 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13592 {
13593     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13594      * innocent-looking character class, like /[ks]/i won't have to go out to
13595      * disk to find the possible matches.
13596      *
13597      * This should be called only for a Latin1-range code points, cp, which is
13598      * known to be involved in a simple fold with other code points above
13599      * Latin1.  It would give false results if /aa has been specified.
13600      * Multi-char folds are outside the scope of this, and must be handled
13601      * specially.
13602      *
13603      * XXX It would be better to generate these via regen, in case a new
13604      * version of the Unicode standard adds new mappings, though that is not
13605      * really likely, and may be caught by the default: case of the switch
13606      * below. */
13607
13608     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13609
13610     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13611
13612     switch (cp) {
13613         case 'k':
13614         case 'K':
13615           *invlist =
13616              add_cp_to_invlist(*invlist, KELVIN_SIGN);
13617             break;
13618         case 's':
13619         case 'S':
13620           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13621             break;
13622         case MICRO_SIGN:
13623           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13624           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13625             break;
13626         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13627         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13628           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13629             break;
13630         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13631           *invlist = add_cp_to_invlist(*invlist,
13632                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13633             break;
13634         case LATIN_SMALL_LETTER_SHARP_S:
13635           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13636             break;
13637         default:
13638             /* Use deprecated warning to increase the chances of this being
13639              * output */
13640             if (PASS2) {
13641                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13642             }
13643             break;
13644     }
13645 }
13646
13647 STATIC AV *
13648 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13649 {
13650     /* This adds the string scalar <multi_string> to the array
13651      * <multi_char_matches>.  <multi_string> is known to have exactly
13652      * <cp_count> code points in it.  This is used when constructing a
13653      * bracketed character class and we find something that needs to match more
13654      * than a single character.
13655      *
13656      * <multi_char_matches> is actually an array of arrays.  Each top-level
13657      * element is an array that contains all the strings known so far that are
13658      * the same length.  And that length (in number of code points) is the same
13659      * as the index of the top-level array.  Hence, the [2] element is an
13660      * array, each element thereof is a string containing TWO code points;
13661      * while element [3] is for strings of THREE characters, and so on.  Since
13662      * this is for multi-char strings there can never be a [0] nor [1] element.
13663      *
13664      * When we rewrite the character class below, we will do so such that the
13665      * longest strings are written first, so that it prefers the longest
13666      * matching strings first.  This is done even if it turns out that any
13667      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
13668      * Christiansen has agreed that this is ok.  This makes the test for the
13669      * ligature 'ffi' come before the test for 'ff', for example */
13670
13671     AV* this_array;
13672     AV** this_array_ptr;
13673
13674     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13675
13676     if (! multi_char_matches) {
13677         multi_char_matches = newAV();
13678     }
13679
13680     if (av_exists(multi_char_matches, cp_count)) {
13681         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13682         this_array = *this_array_ptr;
13683     }
13684     else {
13685         this_array = newAV();
13686         av_store(multi_char_matches, cp_count,
13687                  (SV*) this_array);
13688     }
13689     av_push(this_array, multi_string);
13690
13691     return multi_char_matches;
13692 }
13693
13694 /* The names of properties whose definitions are not known at compile time are
13695  * stored in this SV, after a constant heading.  So if the length has been
13696  * changed since initialization, then there is a run-time definition. */
13697 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13698                                         (SvCUR(listsv) != initial_listsv_len)
13699
13700 STATIC regnode *
13701 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13702                  const bool stop_at_1,  /* Just parse the next thing, don't
13703                                            look for a full character class */
13704                  bool allow_multi_folds,
13705                  const bool silence_non_portable,   /* Don't output warnings
13706                                                        about too large
13707                                                        characters */
13708                  SV** ret_invlist)  /* Return an inversion list, not a node */
13709 {
13710     /* parse a bracketed class specification.  Most of these will produce an
13711      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13712      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13713      * under /i with multi-character folds: it will be rewritten following the
13714      * paradigm of this example, where the <multi-fold>s are characters which
13715      * fold to multiple character sequences:
13716      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13717      * gets effectively rewritten as:
13718      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13719      * reg() gets called (recursively) on the rewritten version, and this
13720      * function will return what it constructs.  (Actually the <multi-fold>s
13721      * aren't physically removed from the [abcdefghi], it's just that they are
13722      * ignored in the recursion by means of a flag:
13723      * <RExC_in_multi_char_class>.)
13724      *
13725      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13726      * characters, with the corresponding bit set if that character is in the
13727      * list.  For characters above this, a range list or swash is used.  There
13728      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13729      * determinable at compile time
13730      *
13731      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13732      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13733      */
13734
13735     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13736     IV range = 0;
13737     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13738     regnode *ret;
13739     STRLEN numlen;
13740     IV namedclass = OOB_NAMEDCLASS;
13741     char *rangebegin = NULL;
13742     bool need_class = 0;
13743     SV *listsv = NULL;
13744     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13745                                       than just initialized.  */
13746     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13747     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13748                                extended beyond the Latin1 range.  These have to
13749                                be kept separate from other code points for much
13750                                of this function because their handling  is
13751                                different under /i, and for most classes under
13752                                /d as well */
13753     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13754                                separate for a while from the non-complemented
13755                                versions because of complications with /d
13756                                matching */
13757     UV element_count = 0;   /* Number of distinct elements in the class.
13758                                Optimizations may be possible if this is tiny */
13759     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13760                                        character; used under /i */
13761     UV n;
13762     char * stop_ptr = RExC_end;    /* where to stop parsing */
13763     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13764                                                    space? */
13765     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13766
13767     /* Unicode properties are stored in a swash; this holds the current one
13768      * being parsed.  If this swash is the only above-latin1 component of the
13769      * character class, an optimization is to pass it directly on to the
13770      * execution engine.  Otherwise, it is set to NULL to indicate that there
13771      * are other things in the class that have to be dealt with at execution
13772      * time */
13773     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13774
13775     /* Set if a component of this character class is user-defined; just passed
13776      * on to the engine */
13777     bool has_user_defined_property = FALSE;
13778
13779     /* inversion list of code points this node matches only when the target
13780      * string is in UTF-8.  (Because is under /d) */
13781     SV* depends_list = NULL;
13782
13783     /* Inversion list of code points this node matches regardless of things
13784      * like locale, folding, utf8ness of the target string */
13785     SV* cp_list = NULL;
13786
13787     /* Like cp_list, but code points on this list need to be checked for things
13788      * that fold to/from them under /i */
13789     SV* cp_foldable_list = NULL;
13790
13791     /* Like cp_list, but code points on this list are valid only when the
13792      * runtime locale is UTF-8 */
13793     SV* only_utf8_locale_list = NULL;
13794
13795 #ifdef EBCDIC
13796     /* In a range, counts how many 0-2 of the ends of it came from literals,
13797      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13798     UV literal_endpoint = 0;
13799
13800     /* Is the range unicode? which means on a platform that isn't 1-1 native
13801      * to Unicode (i.e. non-ASCII), each code point in it should be considered
13802      * to be a Unicode value.  */
13803     bool unicode_range = FALSE;
13804 #endif
13805     bool invert = FALSE;    /* Is this class to be complemented */
13806
13807     bool warn_super = ALWAYS_WARN_SUPER;
13808
13809     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13810         case we need to change the emitted regop to an EXACT. */
13811     const char * orig_parse = RExC_parse;
13812     const SSize_t orig_size = RExC_size;
13813     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13814     GET_RE_DEBUG_FLAGS_DECL;
13815
13816     PERL_ARGS_ASSERT_REGCLASS;
13817 #ifndef DEBUGGING
13818     PERL_UNUSED_ARG(depth);
13819 #endif
13820
13821     DEBUG_PARSE("clas");
13822
13823     /* Assume we are going to generate an ANYOF node. */
13824     ret = reganode(pRExC_state,
13825                    (LOC)
13826                     ? ANYOFL
13827                     : ANYOF,
13828                    0);
13829
13830     if (SIZE_ONLY) {
13831         RExC_size += ANYOF_SKIP;
13832         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13833     }
13834     else {
13835         ANYOF_FLAGS(ret) = 0;
13836
13837         RExC_emit += ANYOF_SKIP;
13838         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13839         initial_listsv_len = SvCUR(listsv);
13840         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13841     }
13842
13843     if (skip_white) {
13844         RExC_parse = regpatws(pRExC_state, RExC_parse,
13845                               FALSE /* means don't recognize comments */ );
13846     }
13847
13848     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13849         RExC_parse++;
13850         invert = TRUE;
13851         allow_multi_folds = FALSE;
13852         MARK_NAUGHTY(1);
13853         if (skip_white) {
13854             RExC_parse = regpatws(pRExC_state, RExC_parse,
13855                                   FALSE /* means don't recognize comments */ );
13856         }
13857     }
13858
13859     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13860     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13861         const char *s = RExC_parse;
13862         const char  c = *s++;
13863
13864         while (isWORDCHAR(*s))
13865             s++;
13866         if (*s && c == *s && s[1] == ']') {
13867             SAVEFREESV(RExC_rx_sv);
13868             ckWARN3reg(s+2,
13869                        "POSIX syntax [%c %c] belongs inside character classes",
13870                        c, c);
13871             (void)ReREFCNT_inc(RExC_rx_sv);
13872         }
13873     }
13874
13875     /* If the caller wants us to just parse a single element, accomplish this
13876      * by faking the loop ending condition */
13877     if (stop_at_1 && RExC_end > RExC_parse) {
13878         stop_ptr = RExC_parse + 1;
13879     }
13880
13881     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13882     if (UCHARAT(RExC_parse) == ']')
13883         goto charclassloop;
13884
13885     while (1) {
13886         if  (RExC_parse >= stop_ptr) {
13887             break;
13888         }
13889
13890         if (skip_white) {
13891             RExC_parse = regpatws(pRExC_state, RExC_parse,
13892                                   FALSE /* means don't recognize comments */ );
13893         }
13894
13895         if  (UCHARAT(RExC_parse) == ']') {
13896             break;
13897         }
13898
13899     charclassloop:
13900
13901         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13902         save_value = value;
13903         save_prevvalue = prevvalue;
13904
13905         if (!range) {
13906             rangebegin = RExC_parse;
13907             element_count++;
13908 #ifdef EBCDIC
13909             literal_endpoint = 0;
13910 #endif
13911         }
13912         if (UTF) {
13913             value = utf8n_to_uvchr((U8*)RExC_parse,
13914                                    RExC_end - RExC_parse,
13915                                    &numlen, UTF8_ALLOW_DEFAULT);
13916             RExC_parse += numlen;
13917         }
13918         else
13919             value = UCHARAT(RExC_parse++);
13920
13921         if (value == '['
13922             && RExC_parse < RExC_end
13923             && POSIXCC(UCHARAT(RExC_parse)))
13924         {
13925             namedclass = regpposixcc(pRExC_state, value, strict);
13926         }
13927         else if (value != '\\') {
13928 #ifdef EBCDIC
13929             literal_endpoint++;
13930 #endif
13931         }
13932         else {
13933             /* Is a backslash; get the code point of the char after it */
13934             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13935                 value = utf8n_to_uvchr((U8*)RExC_parse,
13936                                    RExC_end - RExC_parse,
13937                                    &numlen, UTF8_ALLOW_DEFAULT);
13938                 RExC_parse += numlen;
13939             }
13940             else
13941                 value = UCHARAT(RExC_parse++);
13942
13943             /* Some compilers cannot handle switching on 64-bit integer
13944              * values, therefore value cannot be an UV.  Yes, this will
13945              * be a problem later if we want switch on Unicode.
13946              * A similar issue a little bit later when switching on
13947              * namedclass. --jhi */
13948
13949             /* If the \ is escaping white space when white space is being
13950              * skipped, it means that that white space is wanted literally, and
13951              * is already in 'value'.  Otherwise, need to translate the escape
13952              * into what it signifies. */
13953             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13954
13955             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13956             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13957             case 's':   namedclass = ANYOF_SPACE;       break;
13958             case 'S':   namedclass = ANYOF_NSPACE;      break;
13959             case 'd':   namedclass = ANYOF_DIGIT;       break;
13960             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13961             case 'v':   namedclass = ANYOF_VERTWS;      break;
13962             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13963             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13964             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13965             case 'N':  /* Handle \N{NAME} in class */
13966                 {
13967                     SV *as_text;
13968                     STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13969                                                     flagp, depth, &as_text);
13970                     if (*flagp & RESTART_UTF8)
13971                         FAIL("panic: grok_bslash_N set RESTART_UTF8");
13972                     if (cp_count != 1) {    /* The typical case drops through */
13973                         assert(cp_count != (STRLEN) -1);
13974                         if (cp_count == 0) {
13975                             if (strict) {
13976                                 RExC_parse++;   /* Position after the "}" */
13977                                 vFAIL("Zero length \\N{}");
13978                             }
13979                             else if (PASS2) {
13980                                 ckWARNreg(RExC_parse,
13981                                         "Ignoring zero length \\N{} in character class");
13982                             }
13983                         }
13984                         else { /* cp_count > 1 */
13985                             if (! RExC_in_multi_char_class) {
13986                                 if (invert || range || *RExC_parse == '-') {
13987                                     if (strict) {
13988                                         RExC_parse--;
13989                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13990                                     }
13991                                     else if (PASS2) {
13992                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13993                                     }
13994                                 }
13995                                 else {
13996                                     multi_char_matches
13997                                         = add_multi_match(multi_char_matches,
13998                                                           as_text,
13999                                                           cp_count);
14000                                 }
14001                                 break; /* <value> contains the first code
14002                                           point. Drop out of the switch to
14003                                           process it */
14004                             }
14005                         } /* End of cp_count != 1 */
14006
14007                         /* This element should not be processed further in this
14008                          * class */
14009                         element_count--;
14010                         value = save_value;
14011                         prevvalue = save_prevvalue;
14012                         continue;   /* Back to top of loop to get next char */
14013                     }
14014                     /* Here, is a single code point, and <value> contains it */
14015 #ifdef EBCDIC
14016                     /* We consider named characters to be literal characters,
14017                      * and they are Unicode */
14018                     literal_endpoint++;
14019                     unicode_range = TRUE;
14020 #endif
14021                 }
14022                 break;
14023             case 'p':
14024             case 'P':
14025                 {
14026                 char *e;
14027
14028                 /* We will handle any undefined properties ourselves */
14029                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14030                                        /* And we actually would prefer to get
14031                                         * the straight inversion list of the
14032                                         * swash, since we will be accessing it
14033                                         * anyway, to save a little time */
14034                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14035
14036                 if (RExC_parse >= RExC_end)
14037                     vFAIL2("Empty \\%c{}", (U8)value);
14038                 if (*RExC_parse == '{') {
14039                     const U8 c = (U8)value;
14040                     e = strchr(RExC_parse++, '}');
14041                     if (!e)
14042                         vFAIL2("Missing right brace on \\%c{}", c);
14043                     while (isSPACE(*RExC_parse))
14044                         RExC_parse++;
14045                     if (e == RExC_parse)
14046                         vFAIL2("Empty \\%c{}", c);
14047                     n = e - RExC_parse;
14048                     while (isSPACE(*(RExC_parse + n - 1)))
14049                         n--;
14050                 }
14051                 else {
14052                     e = RExC_parse;
14053                     n = 1;
14054                 }
14055                 if (!SIZE_ONLY) {
14056                     SV* invlist;
14057                     char* name;
14058
14059                     if (UCHARAT(RExC_parse) == '^') {
14060                          RExC_parse++;
14061                          n--;
14062                          /* toggle.  (The rhs xor gets the single bit that
14063                           * differs between P and p; the other xor inverts just
14064                           * that bit) */
14065                          value ^= 'P' ^ 'p';
14066
14067                          while (isSPACE(*RExC_parse)) {
14068                               RExC_parse++;
14069                               n--;
14070                          }
14071                     }
14072                     /* Try to get the definition of the property into
14073                      * <invlist>.  If /i is in effect, the effective property
14074                      * will have its name be <__NAME_i>.  The design is
14075                      * discussed in commit
14076                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14077                     name = savepv(Perl_form(aTHX_
14078                                           "%s%.*s%s\n",
14079                                           (FOLD) ? "__" : "",
14080                                           (int)n,
14081                                           RExC_parse,
14082                                           (FOLD) ? "_i" : ""
14083                                 ));
14084
14085                     /* Look up the property name, and get its swash and
14086                      * inversion list, if the property is found  */
14087                     if (swash) {
14088                         SvREFCNT_dec_NN(swash);
14089                     }
14090                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
14091                                              1, /* binary */
14092                                              0, /* not tr/// */
14093                                              NULL, /* No inversion list */
14094                                              &swash_init_flags
14095                                             );
14096                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14097                         HV* curpkg = (IN_PERL_COMPILETIME)
14098                                       ? PL_curstash
14099                                       : CopSTASH(PL_curcop);
14100                         if (swash) {
14101                             SvREFCNT_dec_NN(swash);
14102                             swash = NULL;
14103                         }
14104
14105                         /* Here didn't find it.  It could be a user-defined
14106                          * property that will be available at run-time.  If we
14107                          * accept only compile-time properties, is an error;
14108                          * otherwise add it to the list for run-time look up */
14109                         if (ret_invlist) {
14110                             RExC_parse = e + 1;
14111                             vFAIL2utf8f(
14112                                 "Property '%"UTF8f"' is unknown",
14113                                 UTF8fARG(UTF, n, name));
14114                         }
14115
14116                         /* If the property name doesn't already have a package
14117                          * name, add the current one to it so that it can be
14118                          * referred to outside it. [perl #121777] */
14119                         if (curpkg && ! instr(name, "::")) {
14120                             char* pkgname = HvNAME(curpkg);
14121                             if (strNE(pkgname, "main")) {
14122                                 char* full_name = Perl_form(aTHX_
14123                                                             "%s::%s",
14124                                                             pkgname,
14125                                                             name);
14126                                 n = strlen(full_name);
14127                                 Safefree(name);
14128                                 name = savepvn(full_name, n);
14129                             }
14130                         }
14131                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14132                                         (value == 'p' ? '+' : '!'),
14133                                         UTF8fARG(UTF, n, name));
14134                         has_user_defined_property = TRUE;
14135
14136                         /* We don't know yet, so have to assume that the
14137                          * property could match something in the Latin1 range,
14138                          * hence something that isn't utf8.  Note that this
14139                          * would cause things in <depends_list> to match
14140                          * inappropriately, except that any \p{}, including
14141                          * this one forces Unicode semantics, which means there
14142                          * is no <depends_list> */
14143                         ANYOF_FLAGS(ret)
14144                                       |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14145                     }
14146                     else {
14147
14148                         /* Here, did get the swash and its inversion list.  If
14149                          * the swash is from a user-defined property, then this
14150                          * whole character class should be regarded as such */
14151                         if (swash_init_flags
14152                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14153                         {
14154                             has_user_defined_property = TRUE;
14155                         }
14156                         else if
14157                             /* We warn on matching an above-Unicode code point
14158                              * if the match would return true, except don't
14159                              * warn for \p{All}, which has exactly one element
14160                              * = 0 */
14161                             (_invlist_contains_cp(invlist, 0x110000)
14162                                 && (! (_invlist_len(invlist) == 1
14163                                        && *invlist_array(invlist) == 0)))
14164                         {
14165                             warn_super = TRUE;
14166                         }
14167
14168
14169                         /* Invert if asking for the complement */
14170                         if (value == 'P') {
14171                             _invlist_union_complement_2nd(properties,
14172                                                           invlist,
14173                                                           &properties);
14174
14175                             /* The swash can't be used as-is, because we've
14176                              * inverted things; delay removing it to here after
14177                              * have copied its invlist above */
14178                             SvREFCNT_dec_NN(swash);
14179                             swash = NULL;
14180                         }
14181                         else {
14182                             _invlist_union(properties, invlist, &properties);
14183                         }
14184                     }
14185                     Safefree(name);
14186                 }
14187                 RExC_parse = e + 1;
14188                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
14189                                                 named */
14190
14191                 /* \p means they want Unicode semantics */
14192                 RExC_uni_semantics = 1;
14193                 }
14194                 break;
14195             case 'n':   value = '\n';                   break;
14196             case 'r':   value = '\r';                   break;
14197             case 't':   value = '\t';                   break;
14198             case 'f':   value = '\f';                   break;
14199             case 'b':   value = '\b';                   break;
14200             case 'e':   value = ESC_NATIVE;             break;
14201             case 'a':   value = '\a';                   break;
14202             case 'o':
14203                 RExC_parse--;   /* function expects to be pointed at the 'o' */
14204                 {
14205                     const char* error_msg;
14206                     bool valid = grok_bslash_o(&RExC_parse,
14207                                                &value,
14208                                                &error_msg,
14209                                                PASS2,   /* warnings only in
14210                                                            pass 2 */
14211                                                strict,
14212                                                silence_non_portable,
14213                                                UTF);
14214                     if (! valid) {
14215                         vFAIL(error_msg);
14216                     }
14217                 }
14218                 if (IN_ENCODING && value < 0x100) {
14219                     goto recode_encoding;
14220                 }
14221                 break;
14222             case 'x':
14223                 RExC_parse--;   /* function expects to be pointed at the 'x' */
14224                 {
14225                     const char* error_msg;
14226                     bool valid = grok_bslash_x(&RExC_parse,
14227                                                &value,
14228                                                &error_msg,
14229                                                PASS2, /* Output warnings */
14230                                                strict,
14231                                                silence_non_portable,
14232                                                UTF);
14233                     if (! valid) {
14234                         vFAIL(error_msg);
14235                     }
14236                 }
14237                 if (IN_ENCODING && value < 0x100)
14238                     goto recode_encoding;
14239                 break;
14240             case 'c':
14241                 value = grok_bslash_c(*RExC_parse++, PASS2);
14242                 break;
14243             case '0': case '1': case '2': case '3': case '4':
14244             case '5': case '6': case '7':
14245                 {
14246                     /* Take 1-3 octal digits */
14247                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14248                     numlen = (strict) ? 4 : 3;
14249                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14250                     RExC_parse += numlen;
14251                     if (numlen != 3) {
14252                         if (strict) {
14253                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14254                             vFAIL("Need exactly 3 octal digits");
14255                         }
14256                         else if (! SIZE_ONLY /* like \08, \178 */
14257                                  && numlen < 3
14258                                  && RExC_parse < RExC_end
14259                                  && isDIGIT(*RExC_parse)
14260                                  && ckWARN(WARN_REGEXP))
14261                         {
14262                             SAVEFREESV(RExC_rx_sv);
14263                             reg_warn_non_literal_string(
14264                                  RExC_parse + 1,
14265                                  form_short_octal_warning(RExC_parse, numlen));
14266                             (void)ReREFCNT_inc(RExC_rx_sv);
14267                         }
14268                     }
14269                     if (IN_ENCODING && value < 0x100)
14270                         goto recode_encoding;
14271                     break;
14272                 }
14273             recode_encoding:
14274                 if (! RExC_override_recoding) {
14275                     SV* enc = _get_encoding();
14276                     value = reg_recode((const char)(U8)value, &enc);
14277                     if (!enc) {
14278                         if (strict) {
14279                             vFAIL("Invalid escape in the specified encoding");
14280                         }
14281                         else if (PASS2) {
14282                             ckWARNreg(RExC_parse,
14283                                   "Invalid escape in the specified encoding");
14284                         }
14285                     }
14286                     break;
14287                 }
14288             default:
14289                 /* Allow \_ to not give an error */
14290                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14291                     if (strict) {
14292                         vFAIL2("Unrecognized escape \\%c in character class",
14293                                (int)value);
14294                     }
14295                     else {
14296                         SAVEFREESV(RExC_rx_sv);
14297                         ckWARN2reg(RExC_parse,
14298                             "Unrecognized escape \\%c in character class passed through",
14299                             (int)value);
14300                         (void)ReREFCNT_inc(RExC_rx_sv);
14301                     }
14302                 }
14303                 break;
14304             }   /* End of switch on char following backslash */
14305         } /* end of handling backslash escape sequences */
14306
14307         /* Here, we have the current token in 'value' */
14308
14309         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14310             U8 classnum;
14311
14312             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
14313              * literal, as is the character that began the false range, i.e.
14314              * the 'a' in the examples */
14315             if (range) {
14316                 if (!SIZE_ONLY) {
14317                     const int w = (RExC_parse >= rangebegin)
14318                                   ? RExC_parse - rangebegin
14319                                   : 0;
14320                     if (strict) {
14321                         vFAIL2utf8f(
14322                             "False [] range \"%"UTF8f"\"",
14323                             UTF8fARG(UTF, w, rangebegin));
14324                     }
14325                     else {
14326                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14327                         ckWARN2reg(RExC_parse,
14328                             "False [] range \"%"UTF8f"\"",
14329                             UTF8fARG(UTF, w, rangebegin));
14330                         (void)ReREFCNT_inc(RExC_rx_sv);
14331                         cp_list = add_cp_to_invlist(cp_list, '-');
14332                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14333                                                              prevvalue);
14334                     }
14335                 }
14336
14337                 range = 0; /* this was not a true range */
14338                 element_count += 2; /* So counts for three values */
14339             }
14340
14341             classnum = namedclass_to_classnum(namedclass);
14342
14343             if (LOC && namedclass < ANYOF_POSIXL_MAX
14344 #ifndef HAS_ISASCII
14345                 && classnum != _CC_ASCII
14346 #endif
14347             ) {
14348                 /* What the Posix classes (like \w, [:space:]) match in locale
14349                  * isn't knowable under locale until actual match time.  Room
14350                  * must be reserved (one time per outer bracketed class) to
14351                  * store such classes.  The space will contain a bit for each
14352                  * named class that is to be matched against.  This isn't
14353                  * needed for \p{} and pseudo-classes, as they are not affected
14354                  * by locale, and hence are dealt with separately */
14355                 if (! need_class) {
14356                     need_class = 1;
14357                     if (SIZE_ONLY) {
14358                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14359                     }
14360                     else {
14361                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14362                     }
14363                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14364                     ANYOF_POSIXL_ZERO(ret);
14365                 }
14366
14367                 /* Coverity thinks it is possible for this to be negative; both
14368                  * jhi and khw think it's not, but be safer */
14369                 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14370                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14371
14372                 /* See if it already matches the complement of this POSIX
14373                  * class */
14374                 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14375                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14376                                                             ? -1
14377                                                             : 1)))
14378                 {
14379                     posixl_matches_all = TRUE;
14380                     break;  /* No need to continue.  Since it matches both
14381                                e.g., \w and \W, it matches everything, and the
14382                                bracketed class can be optimized into qr/./s */
14383                 }
14384
14385                 /* Add this class to those that should be checked at runtime */
14386                 ANYOF_POSIXL_SET(ret, namedclass);
14387
14388                 /* The above-Latin1 characters are not subject to locale rules.
14389                  * Just add them, in the second pass, to the
14390                  * unconditionally-matched list */
14391                 if (! SIZE_ONLY) {
14392                     SV* scratch_list = NULL;
14393
14394                     /* Get the list of the above-Latin1 code points this
14395                      * matches */
14396                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14397                                           PL_XPosix_ptrs[classnum],
14398
14399                                           /* Odd numbers are complements, like
14400                                            * NDIGIT, NASCII, ... */
14401                                           namedclass % 2 != 0,
14402                                           &scratch_list);
14403                     /* Checking if 'cp_list' is NULL first saves an extra
14404                      * clone.  Its reference count will be decremented at the
14405                      * next union, etc, or if this is the only instance, at the
14406                      * end of the routine */
14407                     if (! cp_list) {
14408                         cp_list = scratch_list;
14409                     }
14410                     else {
14411                         _invlist_union(cp_list, scratch_list, &cp_list);
14412                         SvREFCNT_dec_NN(scratch_list);
14413                     }
14414                     continue;   /* Go get next character */
14415                 }
14416             }
14417             else if (! SIZE_ONLY) {
14418
14419                 /* Here, not in pass1 (in that pass we skip calculating the
14420                  * contents of this class), and is /l, or is a POSIX class for
14421                  * which /l doesn't matter (or is a Unicode property, which is
14422                  * skipped here). */
14423                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
14424                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14425
14426                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
14427                          * nor /l make a difference in what these match,
14428                          * therefore we just add what they match to cp_list. */
14429                         if (classnum != _CC_VERTSPACE) {
14430                             assert(   namedclass == ANYOF_HORIZWS
14431                                    || namedclass == ANYOF_NHORIZWS);
14432
14433                             /* It turns out that \h is just a synonym for
14434                              * XPosixBlank */
14435                             classnum = _CC_BLANK;
14436                         }
14437
14438                         _invlist_union_maybe_complement_2nd(
14439                                 cp_list,
14440                                 PL_XPosix_ptrs[classnum],
14441                                 namedclass % 2 != 0,    /* Complement if odd
14442                                                           (NHORIZWS, NVERTWS)
14443                                                         */
14444                                 &cp_list);
14445                     }
14446                 }
14447                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
14448                            complement and use nposixes */
14449                     SV** posixes_ptr = namedclass % 2 == 0
14450                                        ? &posixes
14451                                        : &nposixes;
14452                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
14453                     _invlist_union_maybe_complement_2nd(
14454                                                      *posixes_ptr,
14455                                                      *source_ptr,
14456                                                      namedclass % 2 != 0,
14457                                                      posixes_ptr);
14458                 }
14459             }
14460         } /* end of namedclass \blah */
14461
14462         if (skip_white) {
14463             RExC_parse = regpatws(pRExC_state, RExC_parse,
14464                                 FALSE /* means don't recognize comments */ );
14465         }
14466
14467         /* If 'range' is set, 'value' is the ending of a range--check its
14468          * validity.  (If value isn't a single code point in the case of a
14469          * range, we should have figured that out above in the code that
14470          * catches false ranges).  Later, we will handle each individual code
14471          * point in the range.  If 'range' isn't set, this could be the
14472          * beginning of a range, so check for that by looking ahead to see if
14473          * the next real character to be processed is the range indicator--the
14474          * minus sign */
14475
14476         if (range) {
14477 #ifdef EBCDIC
14478             /* For unicode ranges, we have to test that the Unicode as opposed
14479              * to the native values are not decreasing.  (Above 255, and there
14480              * is no difference between native and Unicode) */
14481             if (unicode_range && prevvalue < 255 && value < 255) {
14482                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
14483                     goto backwards_range;
14484                 }
14485             }
14486             else
14487 #endif
14488             if (prevvalue > value) /* b-a */ {
14489                 int w;
14490 #ifdef EBCDIC
14491               backwards_range:
14492 #endif
14493                 w = RExC_parse - rangebegin;
14494                 vFAIL2utf8f(
14495                     "Invalid [] range \"%"UTF8f"\"",
14496                     UTF8fARG(UTF, w, rangebegin));
14497                 NOT_REACHED; /* NOT REACHED */
14498             }
14499         }
14500         else {
14501             prevvalue = value; /* save the beginning of the potential range */
14502             if (! stop_at_1     /* Can't be a range if parsing just one thing */
14503                 && *RExC_parse == '-')
14504             {
14505                 char* next_char_ptr = RExC_parse + 1;
14506                 if (skip_white) {   /* Get the next real char after the '-' */
14507                     next_char_ptr = regpatws(pRExC_state,
14508                                              RExC_parse + 1,
14509                                              FALSE); /* means don't recognize
14510                                                         comments */
14511                 }
14512
14513                 /* If the '-' is at the end of the class (just before the ']',
14514                  * it is a literal minus; otherwise it is a range */
14515                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14516                     RExC_parse = next_char_ptr;
14517
14518                     /* a bad range like \w-, [:word:]- ? */
14519                     if (namedclass > OOB_NAMEDCLASS) {
14520                         if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14521                             const int w = RExC_parse >= rangebegin
14522                                           ?  RExC_parse - rangebegin
14523                                           : 0;
14524                             if (strict) {
14525                                 vFAIL4("False [] range \"%*.*s\"",
14526                                     w, w, rangebegin);
14527                             }
14528                             else if (PASS2) {
14529                                 vWARN4(RExC_parse,
14530                                     "False [] range \"%*.*s\"",
14531                                     w, w, rangebegin);
14532                             }
14533                         }
14534                         if (!SIZE_ONLY) {
14535                             cp_list = add_cp_to_invlist(cp_list, '-');
14536                         }
14537                         element_count++;
14538                     } else
14539                         range = 1;      /* yeah, it's a range! */
14540                     continue;   /* but do it the next time */
14541                 }
14542             }
14543         }
14544
14545         if (namedclass > OOB_NAMEDCLASS) {
14546             continue;
14547         }
14548
14549         /* Here, we have a single value this time through the loop, and
14550          * <prevvalue> is the beginning of the range, if any; or <value> if
14551          * not. */
14552
14553         /* non-Latin1 code point implies unicode semantics.  Must be set in
14554          * pass1 so is there for the whole of pass 2 */
14555         if (value > 255) {
14556             RExC_uni_semantics = 1;
14557         }
14558
14559         /* Ready to process either the single value, or the completed range.
14560          * For single-valued non-inverted ranges, we consider the possibility
14561          * of multi-char folds.  (We made a conscious decision to not do this
14562          * for the other cases because it can often lead to non-intuitive
14563          * results.  For example, you have the peculiar case that:
14564          *  "s s" =~ /^[^\xDF]+$/i => Y
14565          *  "ss"  =~ /^[^\xDF]+$/i => N
14566          *
14567          * See [perl #89750] */
14568         if (FOLD && allow_multi_folds && value == prevvalue) {
14569             if (value == LATIN_SMALL_LETTER_SHARP_S
14570                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14571                                                         value)))
14572             {
14573                 /* Here <value> is indeed a multi-char fold.  Get what it is */
14574
14575                 U8 foldbuf[UTF8_MAXBYTES_CASE];
14576                 STRLEN foldlen;
14577
14578                 UV folded = _to_uni_fold_flags(
14579                                 value,
14580                                 foldbuf,
14581                                 &foldlen,
14582                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14583                                                    ? FOLD_FLAGS_NOMIX_ASCII
14584                                                    : 0)
14585                                 );
14586
14587                 /* Here, <folded> should be the first character of the
14588                  * multi-char fold of <value>, with <foldbuf> containing the
14589                  * whole thing.  But, if this fold is not allowed (because of
14590                  * the flags), <fold> will be the same as <value>, and should
14591                  * be processed like any other character, so skip the special
14592                  * handling */
14593                 if (folded != value) {
14594
14595                     /* Skip if we are recursed, currently parsing the class
14596                      * again.  Otherwise add this character to the list of
14597                      * multi-char folds. */
14598                     if (! RExC_in_multi_char_class) {
14599                         STRLEN cp_count = utf8_length(foldbuf,
14600                                                       foldbuf + foldlen);
14601                         SV* multi_fold = sv_2mortal(newSVpvs(""));
14602
14603                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14604
14605                         multi_char_matches
14606                                         = add_multi_match(multi_char_matches,
14607                                                           multi_fold,
14608                                                           cp_count);
14609
14610                     }
14611
14612                     /* This element should not be processed further in this
14613                      * class */
14614                     element_count--;
14615                     value = save_value;
14616                     prevvalue = save_prevvalue;
14617                     continue;
14618                 }
14619             }
14620         }
14621
14622         /* Deal with this element of the class */
14623         if (! SIZE_ONLY) {
14624 #ifndef EBCDIC
14625             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14626                                                      prevvalue, value);
14627 #else
14628             /* On non-ASCII platforms, for ranges that span all of 0..255, and
14629              * ones that don't require special handling, we can just add the
14630              * range like we do for ASCII platforms */
14631             if ((UNLIKELY(prevvalue == 0) && value >= 255)
14632                 || ! (prevvalue < 256
14633                       && (unicode_range
14634                           || (literal_endpoint == 2
14635                               && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14636                                   || (isUPPER_A(prevvalue)
14637                                       && isUPPER_A(value)))))))
14638             {
14639                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14640                                                          prevvalue, value);
14641             }
14642             else {
14643                 /* Here, requires special handling.  This can be because it is
14644                  * a range whose code points are considered to be Unicode, and
14645                  * so must be individually translated into native, or because
14646                  * its a subrange of 'A-Z' or 'a-z' which each aren't
14647                  * contiguous in EBCDIC, but we have defined them to include
14648                  * only the "expected" upper or lower case ASCII alphabetics.
14649                  * Subranges above 255 are the same in native and Unicode, so
14650                  * can be added as a range */
14651                 U8 start = NATIVE_TO_LATIN1(prevvalue);
14652                 unsigned j;
14653                 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
14654                 for (j = start; j <= end; j++) {
14655                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
14656                 }
14657                 if (value > 255) {
14658                     cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14659                                                              256, value);
14660                 }
14661             }
14662 #endif
14663         }
14664
14665         range = 0; /* this range (if it was one) is done now */
14666     } /* End of loop through all the text within the brackets */
14667
14668     /* If anything in the class expands to more than one character, we have to
14669      * deal with them by building up a substitute parse string, and recursively
14670      * calling reg() on it, instead of proceeding */
14671     if (multi_char_matches) {
14672         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14673         I32 cp_count;
14674         STRLEN len;
14675         char *save_end = RExC_end;
14676         char *save_parse = RExC_parse;
14677         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14678                                        a "|" */
14679         I32 reg_flags;
14680
14681         assert(! invert);
14682 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14683            because too confusing */
14684         if (invert) {
14685             sv_catpv(substitute_parse, "(?:");
14686         }
14687 #endif
14688
14689         /* Look at the longest folds first */
14690         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14691
14692             if (av_exists(multi_char_matches, cp_count)) {
14693                 AV** this_array_ptr;
14694                 SV* this_sequence;
14695
14696                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14697                                                  cp_count, FALSE);
14698                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14699                                                                 &PL_sv_undef)
14700                 {
14701                     if (! first_time) {
14702                         sv_catpv(substitute_parse, "|");
14703                     }
14704                     first_time = FALSE;
14705
14706                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14707                 }
14708             }
14709         }
14710
14711         /* If the character class contains anything else besides these
14712          * multi-character folds, have to include it in recursive parsing */
14713         if (element_count) {
14714             sv_catpv(substitute_parse, "|[");
14715             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14716             sv_catpv(substitute_parse, "]");
14717         }
14718
14719         sv_catpv(substitute_parse, ")");
14720 #if 0
14721         if (invert) {
14722             /* This is a way to get the parse to skip forward a whole named
14723              * sequence instead of matching the 2nd character when it fails the
14724              * first */
14725             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14726         }
14727 #endif
14728
14729         RExC_parse = SvPV(substitute_parse, len);
14730         RExC_end = RExC_parse + len;
14731         RExC_in_multi_char_class = 1;
14732         RExC_override_recoding = 1;
14733         RExC_emit = (regnode *)orig_emit;
14734
14735         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14736
14737         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14738
14739         RExC_parse = save_parse;
14740         RExC_end = save_end;
14741         RExC_in_multi_char_class = 0;
14742         RExC_override_recoding = 0;
14743         SvREFCNT_dec_NN(multi_char_matches);
14744         return ret;
14745     }
14746
14747     /* Here, we've gone through the entire class and dealt with multi-char
14748      * folds.  We are now in a position that we can do some checks to see if we
14749      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14750      * Currently we only do two checks:
14751      * 1) is in the unlikely event that the user has specified both, eg. \w and
14752      *    \W under /l, then the class matches everything.  (This optimization
14753      *    is done only to make the optimizer code run later work.)
14754      * 2) if the character class contains only a single element (including a
14755      *    single range), we see if there is an equivalent node for it.
14756      * Other checks are possible */
14757     if (! ret_invlist   /* Can't optimize if returning the constructed
14758                            inversion list */
14759         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14760     {
14761         U8 op = END;
14762         U8 arg = 0;
14763
14764         if (UNLIKELY(posixl_matches_all)) {
14765             op = SANY;
14766         }
14767         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14768                                                    \w or [:digit:] or \p{foo}
14769                                                  */
14770
14771             /* All named classes are mapped into POSIXish nodes, with its FLAG
14772              * argument giving which class it is */
14773             switch ((I32)namedclass) {
14774                 case ANYOF_UNIPROP:
14775                     break;
14776
14777                 /* These don't depend on the charset modifiers.  They always
14778                  * match under /u rules */
14779                 case ANYOF_NHORIZWS:
14780                 case ANYOF_HORIZWS:
14781                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14782                     /* FALLTHROUGH */
14783
14784                 case ANYOF_NVERTWS:
14785                 case ANYOF_VERTWS:
14786                     op = POSIXU;
14787                     goto join_posix;
14788
14789                 /* The actual POSIXish node for all the rest depends on the
14790                  * charset modifier.  The ones in the first set depend only on
14791                  * ASCII or, if available on this platform, locale */
14792                 case ANYOF_ASCII:
14793                 case ANYOF_NASCII:
14794 #ifdef HAS_ISASCII
14795                     op = (LOC) ? POSIXL : POSIXA;
14796 #else
14797                     op = POSIXA;
14798 #endif
14799                     goto join_posix;
14800
14801                 case ANYOF_NCASED:
14802                 case ANYOF_LOWER:
14803                 case ANYOF_NLOWER:
14804                 case ANYOF_UPPER:
14805                 case ANYOF_NUPPER:
14806                     /* under /a could be alpha */
14807                     if (FOLD) {
14808                         if (ASCII_RESTRICTED) {
14809                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14810                         }
14811                         else if (! LOC) {
14812                             break;
14813                         }
14814                     }
14815                     /* FALLTHROUGH */
14816
14817                 /* The rest have more possibilities depending on the charset.
14818                  * We take advantage of the enum ordering of the charset
14819                  * modifiers to get the exact node type, */
14820                 default:
14821                     op = POSIXD + get_regex_charset(RExC_flags);
14822                     if (op > POSIXA) { /* /aa is same as /a */
14823                         op = POSIXA;
14824                     }
14825
14826                 join_posix:
14827                     /* The odd numbered ones are the complements of the
14828                      * next-lower even number one */
14829                     if (namedclass % 2 == 1) {
14830                         invert = ! invert;
14831                         namedclass--;
14832                     }
14833                     arg = namedclass_to_classnum(namedclass);
14834                     break;
14835             }
14836         }
14837         else if (value == prevvalue) {
14838
14839             /* Here, the class consists of just a single code point */
14840
14841             if (invert) {
14842                 if (! LOC && value == '\n') {
14843                     op = REG_ANY; /* Optimize [^\n] */
14844                     *flagp |= HASWIDTH|SIMPLE;
14845                     MARK_NAUGHTY(1);
14846                 }
14847             }
14848             else if (value < 256 || UTF) {
14849
14850                 /* Optimize a single value into an EXACTish node, but not if it
14851                  * would require converting the pattern to UTF-8. */
14852                 op = compute_EXACTish(pRExC_state);
14853             }
14854         } /* Otherwise is a range */
14855         else if (! LOC) {   /* locale could vary these */
14856             if (prevvalue == '0') {
14857                 if (value == '9') {
14858                     arg = _CC_DIGIT;
14859                     op = POSIXA;
14860                 }
14861             }
14862             else if (prevvalue == 'A') {
14863                 if (value == 'Z'
14864 #ifdef EBCDIC
14865                     && literal_endpoint == 2
14866 #endif
14867                 ) {
14868                     arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14869                     op = POSIXA;
14870                 }
14871             }
14872             else if (prevvalue == 'a') {
14873                 if (value == 'z'
14874 #ifdef EBCDIC
14875                     && literal_endpoint == 2
14876 #endif
14877                 ) {
14878                     arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14879                     op = POSIXA;
14880                 }
14881             }
14882         }
14883
14884         /* Here, we have changed <op> away from its initial value iff we found
14885          * an optimization */
14886         if (op != END) {
14887
14888             /* Throw away this ANYOF regnode, and emit the calculated one,
14889              * which should correspond to the beginning, not current, state of
14890              * the parse */
14891             const char * cur_parse = RExC_parse;
14892             RExC_parse = (char *)orig_parse;
14893             if ( SIZE_ONLY) {
14894                 if (! LOC) {
14895
14896                     /* To get locale nodes to not use the full ANYOF size would
14897                      * require moving the code above that writes the portions
14898                      * of it that aren't in other nodes to after this point.
14899                      * e.g.  ANYOF_POSIXL_SET */
14900                     RExC_size = orig_size;
14901                 }
14902             }
14903             else {
14904                 RExC_emit = (regnode *)orig_emit;
14905                 if (PL_regkind[op] == POSIXD) {
14906                     if (op == POSIXL) {
14907                         RExC_contains_locale = 1;
14908                     }
14909                     if (invert) {
14910                         op += NPOSIXD - POSIXD;
14911                     }
14912                 }
14913             }
14914
14915             ret = reg_node(pRExC_state, op);
14916
14917             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14918                 if (! SIZE_ONLY) {
14919                     FLAGS(ret) = arg;
14920                 }
14921                 *flagp |= HASWIDTH|SIMPLE;
14922             }
14923             else if (PL_regkind[op] == EXACT) {
14924                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14925                                            TRUE /* downgradable to EXACT */
14926                                            );
14927             }
14928
14929             RExC_parse = (char *) cur_parse;
14930
14931             SvREFCNT_dec(posixes);
14932             SvREFCNT_dec(nposixes);
14933             SvREFCNT_dec(cp_list);
14934             SvREFCNT_dec(cp_foldable_list);
14935             return ret;
14936         }
14937     }
14938
14939     if (SIZE_ONLY)
14940         return ret;
14941     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14942
14943     /* If folding, we calculate all characters that could fold to or from the
14944      * ones already on the list */
14945     if (cp_foldable_list) {
14946         if (FOLD) {
14947             UV start, end;      /* End points of code point ranges */
14948
14949             SV* fold_intersection = NULL;
14950             SV** use_list;
14951
14952             /* Our calculated list will be for Unicode rules.  For locale
14953              * matching, we have to keep a separate list that is consulted at
14954              * runtime only when the locale indicates Unicode rules.  For
14955              * non-locale, we just use to the general list */
14956             if (LOC) {
14957                 use_list = &only_utf8_locale_list;
14958             }
14959             else {
14960                 use_list = &cp_list;
14961             }
14962
14963             /* Only the characters in this class that participate in folds need
14964              * be checked.  Get the intersection of this class and all the
14965              * possible characters that are foldable.  This can quickly narrow
14966              * down a large class */
14967             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14968                                   &fold_intersection);
14969
14970             /* The folds for all the Latin1 characters are hard-coded into this
14971              * program, but we have to go out to disk to get the others. */
14972             if (invlist_highest(cp_foldable_list) >= 256) {
14973
14974                 /* This is a hash that for a particular fold gives all
14975                  * characters that are involved in it */
14976                 if (! PL_utf8_foldclosures) {
14977                     _load_PL_utf8_foldclosures();
14978                 }
14979             }
14980
14981             /* Now look at the foldable characters in this class individually */
14982             invlist_iterinit(fold_intersection);
14983             while (invlist_iternext(fold_intersection, &start, &end)) {
14984                 UV j;
14985
14986                 /* Look at every character in the range */
14987                 for (j = start; j <= end; j++) {
14988                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14989                     STRLEN foldlen;
14990                     SV** listp;
14991
14992                     if (j < 256) {
14993
14994                         if (IS_IN_SOME_FOLD_L1(j)) {
14995
14996                             /* ASCII is always matched; non-ASCII is matched
14997                              * only under Unicode rules (which could happen
14998                              * under /l if the locale is a UTF-8 one */
14999                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
15000                                 *use_list = add_cp_to_invlist(*use_list,
15001                                                             PL_fold_latin1[j]);
15002                             }
15003                             else {
15004                                 depends_list =
15005                                  add_cp_to_invlist(depends_list,
15006                                                    PL_fold_latin1[j]);
15007                             }
15008                         }
15009
15010                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
15011                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
15012                         {
15013                             add_above_Latin1_folds(pRExC_state,
15014                                                    (U8) j,
15015                                                    use_list);
15016                         }
15017                         continue;
15018                     }
15019
15020                     /* Here is an above Latin1 character.  We don't have the
15021                      * rules hard-coded for it.  First, get its fold.  This is
15022                      * the simple fold, as the multi-character folds have been
15023                      * handled earlier and separated out */
15024                     _to_uni_fold_flags(j, foldbuf, &foldlen,
15025                                                         (ASCII_FOLD_RESTRICTED)
15026                                                         ? FOLD_FLAGS_NOMIX_ASCII
15027                                                         : 0);
15028
15029                     /* Single character fold of above Latin1.  Add everything in
15030                     * its fold closure to the list that this node should match.
15031                     * The fold closures data structure is a hash with the keys
15032                     * being the UTF-8 of every character that is folded to, like
15033                     * 'k', and the values each an array of all code points that
15034                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
15035                     * Multi-character folds are not included */
15036                     if ((listp = hv_fetch(PL_utf8_foldclosures,
15037                                         (char *) foldbuf, foldlen, FALSE)))
15038                     {
15039                         AV* list = (AV*) *listp;
15040                         IV k;
15041                         for (k = 0; k <= av_tindex(list); k++) {
15042                             SV** c_p = av_fetch(list, k, FALSE);
15043                             UV c;
15044                             assert(c_p);
15045
15046                             c = SvUV(*c_p);
15047
15048                             /* /aa doesn't allow folds between ASCII and non- */
15049                             if ((ASCII_FOLD_RESTRICTED
15050                                 && (isASCII(c) != isASCII(j))))
15051                             {
15052                                 continue;
15053                             }
15054
15055                             /* Folds under /l which cross the 255/256 boundary
15056                              * are added to a separate list.  (These are valid
15057                              * only when the locale is UTF-8.) */
15058                             if (c < 256 && LOC) {
15059                                 *use_list = add_cp_to_invlist(*use_list, c);
15060                                 continue;
15061                             }
15062
15063                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15064                             {
15065                                 cp_list = add_cp_to_invlist(cp_list, c);
15066                             }
15067                             else {
15068                                 /* Similarly folds involving non-ascii Latin1
15069                                 * characters under /d are added to their list */
15070                                 depends_list = add_cp_to_invlist(depends_list,
15071                                                                  c);
15072                             }
15073                         }
15074                     }
15075                 }
15076             }
15077             SvREFCNT_dec_NN(fold_intersection);
15078         }
15079
15080         /* Now that we have finished adding all the folds, there is no reason
15081          * to keep the foldable list separate */
15082         _invlist_union(cp_list, cp_foldable_list, &cp_list);
15083         SvREFCNT_dec_NN(cp_foldable_list);
15084     }
15085
15086     /* And combine the result (if any) with any inversion list from posix
15087      * classes.  The lists are kept separate up to now because we don't want to
15088      * fold the classes (folding of those is automatically handled by the swash
15089      * fetching code) */
15090     if (posixes || nposixes) {
15091         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15092             /* Under /a and /aa, nothing above ASCII matches these */
15093             _invlist_intersection(posixes,
15094                                   PL_XPosix_ptrs[_CC_ASCII],
15095                                   &posixes);
15096         }
15097         if (nposixes) {
15098             if (DEPENDS_SEMANTICS) {
15099                 /* Under /d, everything in the upper half of the Latin1 range
15100                  * matches these complements */
15101                 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15102             }
15103             else if (AT_LEAST_ASCII_RESTRICTED) {
15104                 /* Under /a and /aa, everything above ASCII matches these
15105                  * complements */
15106                 _invlist_union_complement_2nd(nposixes,
15107                                               PL_XPosix_ptrs[_CC_ASCII],
15108                                               &nposixes);
15109             }
15110             if (posixes) {
15111                 _invlist_union(posixes, nposixes, &posixes);
15112                 SvREFCNT_dec_NN(nposixes);
15113             }
15114             else {
15115                 posixes = nposixes;
15116             }
15117         }
15118         if (! DEPENDS_SEMANTICS) {
15119             if (cp_list) {
15120                 _invlist_union(cp_list, posixes, &cp_list);
15121                 SvREFCNT_dec_NN(posixes);
15122             }
15123             else {
15124                 cp_list = posixes;
15125             }
15126         }
15127         else {
15128             /* Under /d, we put into a separate list the Latin1 things that
15129              * match only when the target string is utf8 */
15130             SV* nonascii_but_latin1_properties = NULL;
15131             _invlist_intersection(posixes, PL_UpperLatin1,
15132                                   &nonascii_but_latin1_properties);
15133             _invlist_subtract(posixes, nonascii_but_latin1_properties,
15134                               &posixes);
15135             if (cp_list) {
15136                 _invlist_union(cp_list, posixes, &cp_list);
15137                 SvREFCNT_dec_NN(posixes);
15138             }
15139             else {
15140                 cp_list = posixes;
15141             }
15142
15143             if (depends_list) {
15144                 _invlist_union(depends_list, nonascii_but_latin1_properties,
15145                                &depends_list);
15146                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15147             }
15148             else {
15149                 depends_list = nonascii_but_latin1_properties;
15150             }
15151         }
15152     }
15153
15154     /* And combine the result (if any) with any inversion list from properties.
15155      * The lists are kept separate up to now so that we can distinguish the two
15156      * in regards to matching above-Unicode.  A run-time warning is generated
15157      * if a Unicode property is matched against a non-Unicode code point. But,
15158      * we allow user-defined properties to match anything, without any warning,
15159      * and we also suppress the warning if there is a portion of the character
15160      * class that isn't a Unicode property, and which matches above Unicode, \W
15161      * or [\x{110000}] for example.
15162      * (Note that in this case, unlike the Posix one above, there is no
15163      * <depends_list>, because having a Unicode property forces Unicode
15164      * semantics */
15165     if (properties) {
15166         if (cp_list) {
15167
15168             /* If it matters to the final outcome, see if a non-property
15169              * component of the class matches above Unicode.  If so, the
15170              * warning gets suppressed.  This is true even if just a single
15171              * such code point is specified, as though not strictly correct if
15172              * another such code point is matched against, the fact that they
15173              * are using above-Unicode code points indicates they should know
15174              * the issues involved */
15175             if (warn_super) {
15176                 warn_super = ! (invert
15177                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15178             }
15179
15180             _invlist_union(properties, cp_list, &cp_list);
15181             SvREFCNT_dec_NN(properties);
15182         }
15183         else {
15184             cp_list = properties;
15185         }
15186
15187         if (warn_super) {
15188             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15189         }
15190     }
15191
15192     /* Here, we have calculated what code points should be in the character
15193      * class.
15194      *
15195      * Now we can see about various optimizations.  Fold calculation (which we
15196      * did above) needs to take place before inversion.  Otherwise /[^k]/i
15197      * would invert to include K, which under /i would match k, which it
15198      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
15199      * folded until runtime */
15200
15201     /* If we didn't do folding, it's because some information isn't available
15202      * until runtime; set the run-time fold flag for these.  (We don't have to
15203      * worry about properties folding, as that is taken care of by the swash
15204      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
15205      * locales, or the class matches at least one 0-255 range code point */
15206     if (LOC && FOLD) {
15207         if (only_utf8_locale_list) {
15208             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15209         }
15210         else if (cp_list) { /* Look to see if there a 0-255 code point is in
15211                                the list */
15212             UV start, end;
15213             invlist_iterinit(cp_list);
15214             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15215                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15216             }
15217             invlist_iterfinish(cp_list);
15218         }
15219     }
15220
15221     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15222      * at compile time.  Besides not inverting folded locale now, we can't
15223      * invert if there are things such as \w, which aren't known until runtime
15224      * */
15225     if (cp_list
15226         && invert
15227         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15228         && ! depends_list
15229         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15230     {
15231         _invlist_invert(cp_list);
15232
15233         /* Any swash can't be used as-is, because we've inverted things */
15234         if (swash) {
15235             SvREFCNT_dec_NN(swash);
15236             swash = NULL;
15237         }
15238
15239         /* Clear the invert flag since have just done it here */
15240         invert = FALSE;
15241     }
15242
15243     if (ret_invlist) {
15244         *ret_invlist = cp_list;
15245         SvREFCNT_dec(swash);
15246
15247         /* Discard the generated node */
15248         if (SIZE_ONLY) {
15249             RExC_size = orig_size;
15250         }
15251         else {
15252             RExC_emit = orig_emit;
15253         }
15254         return orig_emit;
15255     }
15256
15257     /* Some character classes are equivalent to other nodes.  Such nodes take
15258      * up less room and generally fewer operations to execute than ANYOF nodes.
15259      * Above, we checked for and optimized into some such equivalents for
15260      * certain common classes that are easy to test.  Getting to this point in
15261      * the code means that the class didn't get optimized there.  Since this
15262      * code is only executed in Pass 2, it is too late to save space--it has
15263      * been allocated in Pass 1, and currently isn't given back.  But turning
15264      * things into an EXACTish node can allow the optimizer to join it to any
15265      * adjacent such nodes.  And if the class is equivalent to things like /./,
15266      * expensive run-time swashes can be avoided.  Now that we have more
15267      * complete information, we can find things necessarily missed by the
15268      * earlier code.  I (khw) am not sure how much to look for here.  It would
15269      * be easy, but perhaps too slow, to check any candidates against all the
15270      * node types they could possibly match using _invlistEQ(). */
15271
15272     if (cp_list
15273         && ! invert
15274         && ! depends_list
15275         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15276         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15277
15278            /* We don't optimize if we are supposed to make sure all non-Unicode
15279             * code points raise a warning, as only ANYOF nodes have this check.
15280             * */
15281         && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15282     {
15283         UV start, end;
15284         U8 op = END;  /* The optimzation node-type */
15285         const char * cur_parse= RExC_parse;
15286
15287         invlist_iterinit(cp_list);
15288         if (! invlist_iternext(cp_list, &start, &end)) {
15289
15290             /* Here, the list is empty.  This happens, for example, when a
15291              * Unicode property is the only thing in the character class, and
15292              * it doesn't match anything.  (perluniprops.pod notes such
15293              * properties) */
15294             op = OPFAIL;
15295             *flagp |= HASWIDTH|SIMPLE;
15296         }
15297         else if (start == end) {    /* The range is a single code point */
15298             if (! invlist_iternext(cp_list, &start, &end)
15299
15300                     /* Don't do this optimization if it would require changing
15301                      * the pattern to UTF-8 */
15302                 && (start < 256 || UTF))
15303             {
15304                 /* Here, the list contains a single code point.  Can optimize
15305                  * into an EXACTish node */
15306
15307                 value = start;
15308
15309                 if (! FOLD) {
15310                     op = (LOC)
15311                          ? EXACTL
15312                          : EXACT;
15313                 }
15314                 else if (LOC) {
15315
15316                     /* A locale node under folding with one code point can be
15317                      * an EXACTFL, as its fold won't be calculated until
15318                      * runtime */
15319                     op = EXACTFL;
15320                 }
15321                 else {
15322
15323                     /* Here, we are generally folding, but there is only one
15324                      * code point to match.  If we have to, we use an EXACT
15325                      * node, but it would be better for joining with adjacent
15326                      * nodes in the optimization pass if we used the same
15327                      * EXACTFish node that any such are likely to be.  We can
15328                      * do this iff the code point doesn't participate in any
15329                      * folds.  For example, an EXACTF of a colon is the same as
15330                      * an EXACT one, since nothing folds to or from a colon. */
15331                     if (value < 256) {
15332                         if (IS_IN_SOME_FOLD_L1(value)) {
15333                             op = EXACT;
15334                         }
15335                     }
15336                     else {
15337                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15338                             op = EXACT;
15339                         }
15340                     }
15341
15342                     /* If we haven't found the node type, above, it means we
15343                      * can use the prevailing one */
15344                     if (op == END) {
15345                         op = compute_EXACTish(pRExC_state);
15346                     }
15347                 }
15348             }
15349         }
15350         else if (start == 0) {
15351             if (end == UV_MAX) {
15352                 op = SANY;
15353                 *flagp |= HASWIDTH|SIMPLE;
15354                 MARK_NAUGHTY(1);
15355             }
15356             else if (end == '\n' - 1
15357                     && invlist_iternext(cp_list, &start, &end)
15358                     && start == '\n' + 1 && end == UV_MAX)
15359             {
15360                 op = REG_ANY;
15361                 *flagp |= HASWIDTH|SIMPLE;
15362                 MARK_NAUGHTY(1);
15363             }
15364         }
15365         invlist_iterfinish(cp_list);
15366
15367         if (op != END) {
15368             RExC_parse = (char *)orig_parse;
15369             RExC_emit = (regnode *)orig_emit;
15370
15371             ret = reg_node(pRExC_state, op);
15372
15373             RExC_parse = (char *)cur_parse;
15374
15375             if (PL_regkind[op] == EXACT) {
15376                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15377                                            TRUE /* downgradable to EXACT */
15378                                           );
15379             }
15380
15381             SvREFCNT_dec_NN(cp_list);
15382             return ret;
15383         }
15384     }
15385
15386     /* Here, <cp_list> contains all the code points we can determine at
15387      * compile time that match under all conditions.  Go through it, and
15388      * for things that belong in the bitmap, put them there, and delete from
15389      * <cp_list>.  While we are at it, see if everything above 255 is in the
15390      * list, and if so, set a flag to speed up execution */
15391
15392     populate_ANYOF_from_invlist(ret, &cp_list);
15393
15394     if (invert) {
15395         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15396     }
15397
15398     /* Here, the bitmap has been populated with all the Latin1 code points that
15399      * always match.  Can now add to the overall list those that match only
15400      * when the target string is UTF-8 (<depends_list>). */
15401     if (depends_list) {
15402         if (cp_list) {
15403             _invlist_union(cp_list, depends_list, &cp_list);
15404             SvREFCNT_dec_NN(depends_list);
15405         }
15406         else {
15407             cp_list = depends_list;
15408         }
15409         ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15410     }
15411
15412     /* If there is a swash and more than one element, we can't use the swash in
15413      * the optimization below. */
15414     if (swash && element_count > 1) {
15415         SvREFCNT_dec_NN(swash);
15416         swash = NULL;
15417     }
15418
15419     /* Note that the optimization of using 'swash' if it is the only thing in
15420      * the class doesn't have us change swash at all, so it can include things
15421      * that are also in the bitmap; otherwise we have purposely deleted that
15422      * duplicate information */
15423     set_ANYOF_arg(pRExC_state, ret, cp_list,
15424                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15425                    ? listsv : NULL,
15426                   only_utf8_locale_list,
15427                   swash, has_user_defined_property);
15428
15429     *flagp |= HASWIDTH|SIMPLE;
15430
15431     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15432         RExC_contains_locale = 1;
15433     }
15434
15435     return ret;
15436 }
15437
15438 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15439
15440 STATIC void
15441 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15442                 regnode* const node,
15443                 SV* const cp_list,
15444                 SV* const runtime_defns,
15445                 SV* const only_utf8_locale_list,
15446                 SV* const swash,
15447                 const bool has_user_defined_property)
15448 {
15449     /* Sets the arg field of an ANYOF-type node 'node', using information about
15450      * the node passed-in.  If there is nothing outside the node's bitmap, the
15451      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
15452      * the count returned by add_data(), having allocated and stored an array,
15453      * av, that that count references, as follows:
15454      *  av[0] stores the character class description in its textual form.
15455      *        This is used later (regexec.c:Perl_regclass_swash()) to
15456      *        initialize the appropriate swash, and is also useful for dumping
15457      *        the regnode.  This is set to &PL_sv_undef if the textual
15458      *        description is not needed at run-time (as happens if the other
15459      *        elements completely define the class)
15460      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15461      *        computed from av[0].  But if no further computation need be done,
15462      *        the swash is stored here now (and av[0] is &PL_sv_undef).
15463      *  av[2] stores the inversion list of code points that match only if the
15464      *        current locale is UTF-8
15465      *  av[3] stores the cp_list inversion list for use in addition or instead
15466      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15467      *        (Otherwise everything needed is already in av[0] and av[1])
15468      *  av[4] is set if any component of the class is from a user-defined
15469      *        property; used only if av[3] exists */
15470
15471     UV n;
15472
15473     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15474
15475     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15476         assert(! (ANYOF_FLAGS(node)
15477                   & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15478                      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15479         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15480     }
15481     else {
15482         AV * const av = newAV();
15483         SV *rv;
15484
15485         assert(ANYOF_FLAGS(node)
15486                & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15487                   |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15488
15489         av_store(av, 0, (runtime_defns)
15490                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15491         if (swash) {
15492             assert(cp_list);
15493             av_store(av, 1, swash);
15494             SvREFCNT_dec_NN(cp_list);
15495         }
15496         else {
15497             av_store(av, 1, &PL_sv_undef);
15498             if (cp_list) {
15499                 av_store(av, 3, cp_list);
15500                 av_store(av, 4, newSVuv(has_user_defined_property));
15501             }
15502         }
15503
15504         if (only_utf8_locale_list) {
15505             av_store(av, 2, only_utf8_locale_list);
15506         }
15507         else {
15508             av_store(av, 2, &PL_sv_undef);
15509         }
15510
15511         rv = newRV_noinc(MUTABLE_SV(av));
15512         n = add_data(pRExC_state, STR_WITH_LEN("s"));
15513         RExC_rxi->data->data[n] = (void*)rv;
15514         ARG_SET(node, n);
15515     }
15516 }
15517
15518 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15519 SV *
15520 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15521                                         const regnode* node,
15522                                         bool doinit,
15523                                         SV** listsvp,
15524                                         SV** only_utf8_locale_ptr,
15525                                         SV*  exclude_list)
15526
15527 {
15528     /* For internal core use only.
15529      * Returns the swash for the input 'node' in the regex 'prog'.
15530      * If <doinit> is 'true', will attempt to create the swash if not already
15531      *    done.
15532      * If <listsvp> is non-null, will return the printable contents of the
15533      *    swash.  This can be used to get debugging information even before the
15534      *    swash exists, by calling this function with 'doinit' set to false, in
15535      *    which case the components that will be used to eventually create the
15536      *    swash are returned  (in a printable form).
15537      * If <exclude_list> is not NULL, it is an inversion list of things to
15538      *    exclude from what's returned in <listsvp>.
15539      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
15540      * that, in spite of this function's name, the swash it returns may include
15541      * the bitmap data as well */
15542
15543     SV *sw  = NULL;
15544     SV *si  = NULL;         /* Input swash initialization string */
15545     SV*  invlist = NULL;
15546
15547     RXi_GET_DECL(prog,progi);
15548     const struct reg_data * const data = prog ? progi->data : NULL;
15549
15550     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15551
15552     assert(ANYOF_FLAGS(node)
15553         & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15554            |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15555
15556     if (data && data->count) {
15557         const U32 n = ARG(node);
15558
15559         if (data->what[n] == 's') {
15560             SV * const rv = MUTABLE_SV(data->data[n]);
15561             AV * const av = MUTABLE_AV(SvRV(rv));
15562             SV **const ary = AvARRAY(av);
15563             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15564
15565             si = *ary;  /* ary[0] = the string to initialize the swash with */
15566
15567             /* Elements 3 and 4 are either both present or both absent. [3] is
15568              * any inversion list generated at compile time; [4] indicates if
15569              * that inversion list has any user-defined properties in it. */
15570             if (av_tindex(av) >= 2) {
15571                 if (only_utf8_locale_ptr
15572                     && ary[2]
15573                     && ary[2] != &PL_sv_undef)
15574                 {
15575                     *only_utf8_locale_ptr = ary[2];
15576                 }
15577                 else {
15578                     assert(only_utf8_locale_ptr);
15579                     *only_utf8_locale_ptr = NULL;
15580                 }
15581
15582                 if (av_tindex(av) >= 3) {
15583                     invlist = ary[3];
15584                     if (SvUV(ary[4])) {
15585                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15586                     }
15587                 }
15588                 else {
15589                     invlist = NULL;
15590                 }
15591             }
15592
15593             /* Element [1] is reserved for the set-up swash.  If already there,
15594              * return it; if not, create it and store it there */
15595             if (ary[1] && SvROK(ary[1])) {
15596                 sw = ary[1];
15597             }
15598             else if (doinit && ((si && si != &PL_sv_undef)
15599                                  || (invlist && invlist != &PL_sv_undef))) {
15600                 assert(si);
15601                 sw = _core_swash_init("utf8", /* the utf8 package */
15602                                       "", /* nameless */
15603                                       si,
15604                                       1, /* binary */
15605                                       0, /* not from tr/// */
15606                                       invlist,
15607                                       &swash_init_flags);
15608                 (void)av_store(av, 1, sw);
15609             }
15610         }
15611     }
15612
15613     /* If requested, return a printable version of what this swash matches */
15614     if (listsvp) {
15615         SV* matches_string = newSVpvs("");
15616
15617         /* The swash should be used, if possible, to get the data, as it
15618          * contains the resolved data.  But this function can be called at
15619          * compile-time, before everything gets resolved, in which case we
15620          * return the currently best available information, which is the string
15621          * that will eventually be used to do that resolving, 'si' */
15622         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15623             && (si && si != &PL_sv_undef))
15624         {
15625             sv_catsv(matches_string, si);
15626         }
15627
15628         /* Add the inversion list to whatever we have.  This may have come from
15629          * the swash, or from an input parameter */
15630         if (invlist) {
15631             if (exclude_list) {
15632                 SV* clone = invlist_clone(invlist);
15633                 _invlist_subtract(clone, exclude_list, &clone);
15634                 sv_catsv(matches_string, _invlist_contents(clone));
15635                 SvREFCNT_dec_NN(clone);
15636             }
15637             else {
15638                 sv_catsv(matches_string, _invlist_contents(invlist));
15639             }
15640         }
15641         *listsvp = matches_string;
15642     }
15643
15644     return sw;
15645 }
15646 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15647
15648 /* reg_skipcomment()
15649
15650    Absorbs an /x style # comment from the input stream,
15651    returning a pointer to the first character beyond the comment, or if the
15652    comment terminates the pattern without anything following it, this returns
15653    one past the final character of the pattern (in other words, RExC_end) and
15654    sets the REG_RUN_ON_COMMENT_SEEN flag.
15655
15656    Note it's the callers responsibility to ensure that we are
15657    actually in /x mode
15658
15659 */
15660
15661 PERL_STATIC_INLINE char*
15662 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15663 {
15664     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15665
15666     assert(*p == '#');
15667
15668     while (p < RExC_end) {
15669         if (*(++p) == '\n') {
15670             return p+1;
15671         }
15672     }
15673
15674     /* we ran off the end of the pattern without ending the comment, so we have
15675      * to add an \n when wrapping */
15676     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15677     return p;
15678 }
15679
15680 /* nextchar()
15681
15682    Advances the parse position, and optionally absorbs
15683    "whitespace" from the inputstream.
15684
15685    Without /x "whitespace" means (?#...) style comments only,
15686    with /x this means (?#...) and # comments and whitespace proper.
15687
15688    Returns the RExC_parse point from BEFORE the scan occurs.
15689
15690    This is the /x friendly way of saying RExC_parse++.
15691 */
15692
15693 STATIC char*
15694 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15695 {
15696     char* const retval = RExC_parse++;
15697
15698     PERL_ARGS_ASSERT_NEXTCHAR;
15699
15700     for (;;) {
15701         if (RExC_end - RExC_parse >= 3
15702             && *RExC_parse == '('
15703             && RExC_parse[1] == '?'
15704             && RExC_parse[2] == '#')
15705         {
15706             while (*RExC_parse != ')') {
15707                 if (RExC_parse == RExC_end)
15708                     FAIL("Sequence (?#... not terminated");
15709                 RExC_parse++;
15710             }
15711             RExC_parse++;
15712             continue;
15713         }
15714         if (RExC_flags & RXf_PMf_EXTENDED) {
15715             char * p = regpatws(pRExC_state, RExC_parse,
15716                                           TRUE); /* means recognize comments */
15717             if (p != RExC_parse) {
15718                 RExC_parse = p;
15719                 continue;
15720             }
15721         }
15722         return retval;
15723     }
15724 }
15725
15726 STATIC regnode *
15727 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15728 {
15729     /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15730      * space.  In pass1, it aligns and increments RExC_size; in pass2,
15731      * RExC_emit */
15732
15733     regnode * const ret = RExC_emit;
15734     GET_RE_DEBUG_FLAGS_DECL;
15735
15736     PERL_ARGS_ASSERT_REGNODE_GUTS;
15737
15738     assert(extra_size >= regarglen[op]);
15739
15740     if (SIZE_ONLY) {
15741         SIZE_ALIGN(RExC_size);
15742         RExC_size += 1 + extra_size;
15743         return(ret);
15744     }
15745     if (RExC_emit >= RExC_emit_bound)
15746         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15747                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
15748
15749     NODE_ALIGN_FILL(ret);
15750 #ifndef RE_TRACK_PATTERN_OFFSETS
15751     PERL_UNUSED_ARG(name);
15752 #else
15753     if (RExC_offsets) {         /* MJD */
15754         MJD_OFFSET_DEBUG(
15755               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15756               name, __LINE__,
15757               PL_reg_name[op],
15758               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15759                 ? "Overwriting end of array!\n" : "OK",
15760               (UV)(RExC_emit - RExC_emit_start),
15761               (UV)(RExC_parse - RExC_start),
15762               (UV)RExC_offsets[0]));
15763         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15764     }
15765 #endif
15766     return(ret);
15767 }
15768
15769 /*
15770 - reg_node - emit a node
15771 */
15772 STATIC regnode *                        /* Location. */
15773 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15774 {
15775     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15776
15777     PERL_ARGS_ASSERT_REG_NODE;
15778
15779     assert(regarglen[op] == 0);
15780
15781     if (PASS2) {
15782         regnode *ptr = ret;
15783         FILL_ADVANCE_NODE(ptr, op);
15784         RExC_emit = ptr;
15785     }
15786     return(ret);
15787 }
15788
15789 /*
15790 - reganode - emit a node with an argument
15791 */
15792 STATIC regnode *                        /* Location. */
15793 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15794 {
15795     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15796
15797     PERL_ARGS_ASSERT_REGANODE;
15798
15799     assert(regarglen[op] == 1);
15800
15801     if (PASS2) {
15802         regnode *ptr = ret;
15803         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15804         RExC_emit = ptr;
15805     }
15806     return(ret);
15807 }
15808
15809 STATIC regnode *
15810 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15811 {
15812     /* emit a node with U32 and I32 arguments */
15813
15814     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15815
15816     PERL_ARGS_ASSERT_REG2LANODE;
15817
15818     assert(regarglen[op] == 2);
15819
15820     if (PASS2) {
15821         regnode *ptr = ret;
15822         FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15823         RExC_emit = ptr;
15824     }
15825     return(ret);
15826 }
15827
15828 /*
15829 - reginsert - insert an operator in front of already-emitted operand
15830 *
15831 * Means relocating the operand.
15832 */
15833 STATIC void
15834 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15835 {
15836     regnode *src;
15837     regnode *dst;
15838     regnode *place;
15839     const int offset = regarglen[(U8)op];
15840     const int size = NODE_STEP_REGNODE + offset;
15841     GET_RE_DEBUG_FLAGS_DECL;
15842
15843     PERL_ARGS_ASSERT_REGINSERT;
15844     PERL_UNUSED_CONTEXT;
15845     PERL_UNUSED_ARG(depth);
15846 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15847     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15848     if (SIZE_ONLY) {
15849         RExC_size += size;
15850         return;
15851     }
15852
15853     src = RExC_emit;
15854     RExC_emit += size;
15855     dst = RExC_emit;
15856     if (RExC_open_parens) {
15857         int paren;
15858         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15859         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15860             if ( RExC_open_parens[paren] >= opnd ) {
15861                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15862                 RExC_open_parens[paren] += size;
15863             } else {
15864                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15865             }
15866             if ( RExC_close_parens[paren] >= opnd ) {
15867                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15868                 RExC_close_parens[paren] += size;
15869             } else {
15870                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15871             }
15872         }
15873     }
15874
15875     while (src > opnd) {
15876         StructCopy(--src, --dst, regnode);
15877 #ifdef RE_TRACK_PATTERN_OFFSETS
15878         if (RExC_offsets) {     /* MJD 20010112 */
15879             MJD_OFFSET_DEBUG(
15880                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15881                   "reg_insert",
15882                   __LINE__,
15883                   PL_reg_name[op],
15884                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15885                     ? "Overwriting end of array!\n" : "OK",
15886                   (UV)(src - RExC_emit_start),
15887                   (UV)(dst - RExC_emit_start),
15888                   (UV)RExC_offsets[0]));
15889             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15890             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15891         }
15892 #endif
15893     }
15894
15895
15896     place = opnd;               /* Op node, where operand used to be. */
15897 #ifdef RE_TRACK_PATTERN_OFFSETS
15898     if (RExC_offsets) {         /* MJD */
15899         MJD_OFFSET_DEBUG(
15900               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15901               "reginsert",
15902               __LINE__,
15903               PL_reg_name[op],
15904               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15905               ? "Overwriting end of array!\n" : "OK",
15906               (UV)(place - RExC_emit_start),
15907               (UV)(RExC_parse - RExC_start),
15908               (UV)RExC_offsets[0]));
15909         Set_Node_Offset(place, RExC_parse);
15910         Set_Node_Length(place, 1);
15911     }
15912 #endif
15913     src = NEXTOPER(place);
15914     FILL_ADVANCE_NODE(place, op);
15915     Zero(src, offset, regnode);
15916 }
15917
15918 /*
15919 - regtail - set the next-pointer at the end of a node chain of p to val.
15920 - SEE ALSO: regtail_study
15921 */
15922 /* TODO: All three parms should be const */
15923 STATIC void
15924 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15925                 const regnode *val,U32 depth)
15926 {
15927     regnode *scan;
15928     GET_RE_DEBUG_FLAGS_DECL;
15929
15930     PERL_ARGS_ASSERT_REGTAIL;
15931 #ifndef DEBUGGING
15932     PERL_UNUSED_ARG(depth);
15933 #endif
15934
15935     if (SIZE_ONLY)
15936         return;
15937
15938     /* Find last node. */
15939     scan = p;
15940     for (;;) {
15941         regnode * const temp = regnext(scan);
15942         DEBUG_PARSE_r({
15943             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15944             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15945             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15946                 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
15947                     (temp == NULL ? "->" : ""),
15948                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15949             );
15950         });
15951         if (temp == NULL)
15952             break;
15953         scan = temp;
15954     }
15955
15956     if (reg_off_by_arg[OP(scan)]) {
15957         ARG_SET(scan, val - scan);
15958     }
15959     else {
15960         NEXT_OFF(scan) = val - scan;
15961     }
15962 }
15963
15964 #ifdef DEBUGGING
15965 /*
15966 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15967 - Look for optimizable sequences at the same time.
15968 - currently only looks for EXACT chains.
15969
15970 This is experimental code. The idea is to use this routine to perform
15971 in place optimizations on branches and groups as they are constructed,
15972 with the long term intention of removing optimization from study_chunk so
15973 that it is purely analytical.
15974
15975 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15976 to control which is which.
15977
15978 */
15979 /* TODO: All four parms should be const */
15980
15981 STATIC U8
15982 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15983                       const regnode *val,U32 depth)
15984 {
15985     regnode *scan;
15986     U8 exact = PSEUDO;
15987 #ifdef EXPERIMENTAL_INPLACESCAN
15988     I32 min = 0;
15989 #endif
15990     GET_RE_DEBUG_FLAGS_DECL;
15991
15992     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15993
15994
15995     if (SIZE_ONLY)
15996         return exact;
15997
15998     /* Find last node. */
15999
16000     scan = p;
16001     for (;;) {
16002         regnode * const temp = regnext(scan);
16003 #ifdef EXPERIMENTAL_INPLACESCAN
16004         if (PL_regkind[OP(scan)] == EXACT) {
16005             bool unfolded_multi_char;   /* Unexamined in this routine */
16006             if (join_exact(pRExC_state, scan, &min,
16007                            &unfolded_multi_char, 1, val, depth+1))
16008                 return EXACT;
16009         }
16010 #endif
16011         if ( exact ) {
16012             switch (OP(scan)) {
16013                 case EXACT:
16014                 case EXACTL:
16015                 case EXACTF:
16016                 case EXACTFA_NO_TRIE:
16017                 case EXACTFA:
16018                 case EXACTFU:
16019                 case EXACTFLU8:
16020                 case EXACTFU_SS:
16021                 case EXACTFL:
16022                         if( exact == PSEUDO )
16023                             exact= OP(scan);
16024                         else if ( exact != OP(scan) )
16025                             exact= 0;
16026                 case NOTHING:
16027                     break;
16028                 default:
16029                     exact= 0;
16030             }
16031         }
16032         DEBUG_PARSE_r({
16033             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
16034             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16035             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
16036                 SvPV_nolen_const(RExC_mysv),
16037                 REG_NODE_NUM(scan),
16038                 PL_reg_name[exact]);
16039         });
16040         if (temp == NULL)
16041             break;
16042         scan = temp;
16043     }
16044     DEBUG_PARSE_r({
16045         DEBUG_PARSE_MSG("");
16046         regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
16047         PerlIO_printf(Perl_debug_log,
16048                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
16049                       SvPV_nolen_const(RExC_mysv),
16050                       (IV)REG_NODE_NUM(val),
16051                       (IV)(val - scan)
16052         );
16053     });
16054     if (reg_off_by_arg[OP(scan)]) {
16055         ARG_SET(scan, val - scan);
16056     }
16057     else {
16058         NEXT_OFF(scan) = val - scan;
16059     }
16060
16061     return exact;
16062 }
16063 #endif
16064
16065 /*
16066  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16067  */
16068 #ifdef DEBUGGING
16069
16070 static void
16071 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16072 {
16073     int bit;
16074     int set=0;
16075
16076     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16077
16078     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16079         if (flags & (1<<bit)) {
16080             if (!set++ && lead)
16081                 PerlIO_printf(Perl_debug_log, "%s",lead);
16082             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16083         }
16084     }
16085     if (lead)  {
16086         if (set)
16087             PerlIO_printf(Perl_debug_log, "\n");
16088         else
16089             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16090     }
16091 }
16092
16093 static void
16094 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16095 {
16096     int bit;
16097     int set=0;
16098     regex_charset cs;
16099
16100     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16101
16102     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16103         if (flags & (1<<bit)) {
16104             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
16105                 continue;
16106             }
16107             if (!set++ && lead)
16108                 PerlIO_printf(Perl_debug_log, "%s",lead);
16109             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16110         }
16111     }
16112     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16113             if (!set++ && lead) {
16114                 PerlIO_printf(Perl_debug_log, "%s",lead);
16115             }
16116             switch (cs) {
16117                 case REGEX_UNICODE_CHARSET:
16118                     PerlIO_printf(Perl_debug_log, "UNICODE");
16119                     break;
16120                 case REGEX_LOCALE_CHARSET:
16121                     PerlIO_printf(Perl_debug_log, "LOCALE");
16122                     break;
16123                 case REGEX_ASCII_RESTRICTED_CHARSET:
16124                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16125                     break;
16126                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16127                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16128                     break;
16129                 default:
16130                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16131                     break;
16132             }
16133     }
16134     if (lead)  {
16135         if (set)
16136             PerlIO_printf(Perl_debug_log, "\n");
16137         else
16138             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16139     }
16140 }
16141 #endif
16142
16143 void
16144 Perl_regdump(pTHX_ const regexp *r)
16145 {
16146 #ifdef DEBUGGING
16147     SV * const sv = sv_newmortal();
16148     SV *dsv= sv_newmortal();
16149     RXi_GET_DECL(r,ri);
16150     GET_RE_DEBUG_FLAGS_DECL;
16151
16152     PERL_ARGS_ASSERT_REGDUMP;
16153
16154     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16155
16156     /* Header fields of interest. */
16157     if (r->anchored_substr) {
16158         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16159             RE_SV_DUMPLEN(r->anchored_substr), 30);
16160         PerlIO_printf(Perl_debug_log,
16161                       "anchored %s%s at %"IVdf" ",
16162                       s, RE_SV_TAIL(r->anchored_substr),
16163                       (IV)r->anchored_offset);
16164     } else if (r->anchored_utf8) {
16165         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16166             RE_SV_DUMPLEN(r->anchored_utf8), 30);
16167         PerlIO_printf(Perl_debug_log,
16168                       "anchored utf8 %s%s at %"IVdf" ",
16169                       s, RE_SV_TAIL(r->anchored_utf8),
16170                       (IV)r->anchored_offset);
16171     }
16172     if (r->float_substr) {
16173         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16174             RE_SV_DUMPLEN(r->float_substr), 30);
16175         PerlIO_printf(Perl_debug_log,
16176                       "floating %s%s at %"IVdf"..%"UVuf" ",
16177                       s, RE_SV_TAIL(r->float_substr),
16178                       (IV)r->float_min_offset, (UV)r->float_max_offset);
16179     } else if (r->float_utf8) {
16180         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16181             RE_SV_DUMPLEN(r->float_utf8), 30);
16182         PerlIO_printf(Perl_debug_log,
16183                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16184                       s, RE_SV_TAIL(r->float_utf8),
16185                       (IV)r->float_min_offset, (UV)r->float_max_offset);
16186     }
16187     if (r->check_substr || r->check_utf8)
16188         PerlIO_printf(Perl_debug_log,
16189                       (const char *)
16190                       (r->check_substr == r->float_substr
16191                        && r->check_utf8 == r->float_utf8
16192                        ? "(checking floating" : "(checking anchored"));
16193     if (r->intflags & PREGf_NOSCAN)
16194         PerlIO_printf(Perl_debug_log, " noscan");
16195     if (r->extflags & RXf_CHECK_ALL)
16196         PerlIO_printf(Perl_debug_log, " isall");
16197     if (r->check_substr || r->check_utf8)
16198         PerlIO_printf(Perl_debug_log, ") ");
16199
16200     if (ri->regstclass) {
16201         regprop(r, sv, ri->regstclass, NULL, NULL);
16202         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16203     }
16204     if (r->intflags & PREGf_ANCH) {
16205         PerlIO_printf(Perl_debug_log, "anchored");
16206         if (r->intflags & PREGf_ANCH_MBOL)
16207             PerlIO_printf(Perl_debug_log, "(MBOL)");
16208         if (r->intflags & PREGf_ANCH_SBOL)
16209             PerlIO_printf(Perl_debug_log, "(SBOL)");
16210         if (r->intflags & PREGf_ANCH_GPOS)
16211             PerlIO_printf(Perl_debug_log, "(GPOS)");
16212         PerlIO_putc(Perl_debug_log, ' ');
16213     }
16214     if (r->intflags & PREGf_GPOS_SEEN)
16215         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16216     if (r->intflags & PREGf_SKIP)
16217         PerlIO_printf(Perl_debug_log, "plus ");
16218     if (r->intflags & PREGf_IMPLICIT)
16219         PerlIO_printf(Perl_debug_log, "implicit ");
16220     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16221     if (r->extflags & RXf_EVAL_SEEN)
16222         PerlIO_printf(Perl_debug_log, "with eval ");
16223     PerlIO_printf(Perl_debug_log, "\n");
16224     DEBUG_FLAGS_r({
16225         regdump_extflags("r->extflags: ",r->extflags);
16226         regdump_intflags("r->intflags: ",r->intflags);
16227     });
16228 #else
16229     PERL_ARGS_ASSERT_REGDUMP;
16230     PERL_UNUSED_CONTEXT;
16231     PERL_UNUSED_ARG(r);
16232 #endif  /* DEBUGGING */
16233 }
16234
16235 /*
16236 - regprop - printable representation of opcode, with run time support
16237 */
16238
16239 void
16240 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16241 {
16242 #ifdef DEBUGGING
16243     int k;
16244
16245     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16246     static const char * const anyofs[] = {
16247 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16248     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
16249     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
16250     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
16251     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
16252     || _CC_VERTSPACE != 16
16253   #error Need to adjust order of anyofs[]
16254 #endif
16255         "\\w",
16256         "\\W",
16257         "\\d",
16258         "\\D",
16259         "[:alpha:]",
16260         "[:^alpha:]",
16261         "[:lower:]",
16262         "[:^lower:]",
16263         "[:upper:]",
16264         "[:^upper:]",
16265         "[:punct:]",
16266         "[:^punct:]",
16267         "[:print:]",
16268         "[:^print:]",
16269         "[:alnum:]",
16270         "[:^alnum:]",
16271         "[:graph:]",
16272         "[:^graph:]",
16273         "[:cased:]",
16274         "[:^cased:]",
16275         "\\s",
16276         "\\S",
16277         "[:blank:]",
16278         "[:^blank:]",
16279         "[:xdigit:]",
16280         "[:^xdigit:]",
16281         "[:space:]",
16282         "[:^space:]",
16283         "[:cntrl:]",
16284         "[:^cntrl:]",
16285         "[:ascii:]",
16286         "[:^ascii:]",
16287         "\\v",
16288         "\\V"
16289     };
16290     RXi_GET_DECL(prog,progi);
16291     GET_RE_DEBUG_FLAGS_DECL;
16292
16293     PERL_ARGS_ASSERT_REGPROP;
16294
16295     sv_setpvn(sv, "", 0);
16296
16297     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
16298         /* It would be nice to FAIL() here, but this may be called from
16299            regexec.c, and it would be hard to supply pRExC_state. */
16300         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16301                                               (int)OP(o), (int)REGNODE_MAX);
16302     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16303
16304     k = PL_regkind[OP(o)];
16305
16306     if (k == EXACT) {
16307         sv_catpvs(sv, " ");
16308         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16309          * is a crude hack but it may be the best for now since
16310          * we have no flag "this EXACTish node was UTF-8"
16311          * --jhi */
16312         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16313                   PERL_PV_ESCAPE_UNI_DETECT |
16314                   PERL_PV_ESCAPE_NONASCII   |
16315                   PERL_PV_PRETTY_ELLIPSES   |
16316                   PERL_PV_PRETTY_LTGT       |
16317                   PERL_PV_PRETTY_NOCLEAR
16318                   );
16319     } else if (k == TRIE) {
16320         /* print the details of the trie in dumpuntil instead, as
16321          * progi->data isn't available here */
16322         const char op = OP(o);
16323         const U32 n = ARG(o);
16324         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16325                (reg_ac_data *)progi->data->data[n] :
16326                NULL;
16327         const reg_trie_data * const trie
16328             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16329
16330         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16331         DEBUG_TRIE_COMPILE_r(
16332           Perl_sv_catpvf(aTHX_ sv,
16333             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16334             (UV)trie->startstate,
16335             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16336             (UV)trie->wordcount,
16337             (UV)trie->minlen,
16338             (UV)trie->maxlen,
16339             (UV)TRIE_CHARCOUNT(trie),
16340             (UV)trie->uniquecharcount
16341           );
16342         );
16343         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16344             sv_catpvs(sv, "[");
16345             (void) put_charclass_bitmap_innards(sv,
16346                                                 (IS_ANYOF_TRIE(op))
16347                                                  ? ANYOF_BITMAP(o)
16348                                                  : TRIE_BITMAP(trie),
16349                                                 NULL);
16350             sv_catpvs(sv, "]");
16351         }
16352
16353     } else if (k == CURLY) {
16354         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16355             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16356         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16357     }
16358     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
16359         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16360     else if (k == REF || k == OPEN || k == CLOSE
16361              || k == GROUPP || OP(o)==ACCEPT)
16362     {
16363         AV *name_list= NULL;
16364         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
16365         if ( RXp_PAREN_NAMES(prog) ) {
16366             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16367         } else if ( pRExC_state ) {
16368             name_list= RExC_paren_name_list;
16369         }
16370         if (name_list) {
16371             if ( k != REF || (OP(o) < NREF)) {
16372                 SV **name= av_fetch(name_list, ARG(o), 0 );
16373                 if (name)
16374                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16375             }
16376             else {
16377                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16378                 I32 *nums=(I32*)SvPVX(sv_dat);
16379                 SV **name= av_fetch(name_list, nums[0], 0 );
16380                 I32 n;
16381                 if (name) {
16382                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
16383                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16384                                     (n ? "," : ""), (IV)nums[n]);
16385                     }
16386                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16387                 }
16388             }
16389         }
16390         if ( k == REF && reginfo) {
16391             U32 n = ARG(o);  /* which paren pair */
16392             I32 ln = prog->offs[n].start;
16393             if (prog->lastparen < n || ln == -1)
16394                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16395             else if (ln == prog->offs[n].end)
16396                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16397             else {
16398                 const char *s = reginfo->strbeg + ln;
16399                 Perl_sv_catpvf(aTHX_ sv, ": ");
16400                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16401                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16402             }
16403         }
16404     } else if (k == GOSUB) {
16405         AV *name_list= NULL;
16406         if ( RXp_PAREN_NAMES(prog) ) {
16407             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16408         } else if ( pRExC_state ) {
16409             name_list= RExC_paren_name_list;
16410         }
16411
16412         /* Paren and offset */
16413         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16414         if (name_list) {
16415             SV **name= av_fetch(name_list, ARG(o), 0 );
16416             if (name)
16417                 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16418         }
16419     }
16420     else if (k == VERB) {
16421         if (!o->flags)
16422             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16423                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16424     } else if (k == LOGICAL)
16425         /* 2: embedded, otherwise 1 */
16426         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16427     else if (k == ANYOF) {
16428         const U8 flags = ANYOF_FLAGS(o);
16429         int do_sep = 0;
16430         SV* bitmap_invlist;  /* Will hold what the bit map contains */
16431
16432
16433         if (OP(o) == ANYOFL)
16434             sv_catpvs(sv, "{loc}");
16435         if (flags & ANYOF_LOC_FOLD)
16436             sv_catpvs(sv, "{i}");
16437         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16438         if (flags & ANYOF_INVERT)
16439             sv_catpvs(sv, "^");
16440
16441         /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16442          * */
16443         do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16444                                                             &bitmap_invlist);
16445
16446         /* output any special charclass tests (used entirely under use
16447          * locale) * */
16448         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16449             int i;
16450             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16451                 if (ANYOF_POSIXL_TEST(o,i)) {
16452                     sv_catpv(sv, anyofs[i]);
16453                     do_sep = 1;
16454                 }
16455             }
16456         }
16457
16458         if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16459                       |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16460                       |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16461                       |ANYOF_LOC_FOLD)))
16462         {
16463             if (do_sep) {
16464                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16465                 if (flags & ANYOF_INVERT)
16466                     /*make sure the invert info is in each */
16467                     sv_catpvs(sv, "^");
16468             }
16469
16470             if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16471                 sv_catpvs(sv, "{non-utf8-latin1-all}");
16472             }
16473
16474             /* output information about the unicode matching */
16475             if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16476                 sv_catpvs(sv, "{above_bitmap_all}");
16477             else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16478                 SV *lv; /* Set if there is something outside the bit map. */
16479                 bool byte_output = FALSE;   /* If something in the bitmap has
16480                                                been output */
16481                 SV *only_utf8_locale;
16482
16483                 /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
16484                  * is used to guarantee that nothing in the bitmap gets
16485                  * returned */
16486                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16487                                                     &lv, &only_utf8_locale,
16488                                                     bitmap_invlist);
16489                 if (lv && lv != &PL_sv_undef) {
16490                     char *s = savesvpv(lv);
16491                     char * const origs = s;
16492
16493                     while (*s && *s != '\n')
16494                         s++;
16495
16496                     if (*s == '\n') {
16497                         const char * const t = ++s;
16498
16499                         if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16500                             sv_catpvs(sv, "{outside bitmap}");
16501                         }
16502                         else {
16503                             sv_catpvs(sv, "{utf8}");
16504                         }
16505
16506                         if (byte_output) {
16507                             sv_catpvs(sv, " ");
16508                         }
16509
16510                         while (*s) {
16511                             if (*s == '\n') {
16512
16513                                 /* Truncate very long output */
16514                                 if (s - origs > 256) {
16515                                     Perl_sv_catpvf(aTHX_ sv,
16516                                                 "%.*s...",
16517                                                 (int) (s - origs - 1),
16518                                                 t);
16519                                     goto out_dump;
16520                                 }
16521                                 *s = ' ';
16522                             }
16523                             else if (*s == '\t') {
16524                                 *s = '-';
16525                             }
16526                             s++;
16527                         }
16528                         if (s[-1] == ' ')
16529                             s[-1] = 0;
16530
16531                         sv_catpv(sv, t);
16532                     }
16533
16534                 out_dump:
16535
16536                     Safefree(origs);
16537                     SvREFCNT_dec_NN(lv);
16538                 }
16539
16540                 if ((flags & ANYOF_LOC_FOLD)
16541                      && only_utf8_locale
16542                      && only_utf8_locale != &PL_sv_undef)
16543                 {
16544                     UV start, end;
16545                     int max_entries = 256;
16546
16547                     sv_catpvs(sv, "{utf8 locale}");
16548                     invlist_iterinit(only_utf8_locale);
16549                     while (invlist_iternext(only_utf8_locale,
16550                                             &start, &end)) {
16551                         put_range(sv, start, end, FALSE);
16552                         max_entries --;
16553                         if (max_entries < 0) {
16554                             sv_catpvs(sv, "...");
16555                             break;
16556                         }
16557                     }
16558                     invlist_iterfinish(only_utf8_locale);
16559                 }
16560             }
16561         }
16562         SvREFCNT_dec(bitmap_invlist);
16563
16564
16565         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16566     }
16567     else if (k == POSIXD || k == NPOSIXD) {
16568         U8 index = FLAGS(o) * 2;
16569         if (index < C_ARRAY_LENGTH(anyofs)) {
16570             if (*anyofs[index] != '[')  {
16571                 sv_catpv(sv, "[");
16572             }
16573             sv_catpv(sv, anyofs[index]);
16574             if (*anyofs[index] != '[')  {
16575                 sv_catpv(sv, "]");
16576             }
16577         }
16578         else {
16579             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16580         }
16581     }
16582     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16583         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16584     else if (OP(o) == SBOL)
16585         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16586 #else
16587     PERL_UNUSED_CONTEXT;
16588     PERL_UNUSED_ARG(sv);
16589     PERL_UNUSED_ARG(o);
16590     PERL_UNUSED_ARG(prog);
16591     PERL_UNUSED_ARG(reginfo);
16592     PERL_UNUSED_ARG(pRExC_state);
16593 #endif  /* DEBUGGING */
16594 }
16595
16596
16597
16598 SV *
16599 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16600 {                               /* Assume that RE_INTUIT is set */
16601     struct regexp *const prog = ReANY(r);
16602     GET_RE_DEBUG_FLAGS_DECL;
16603
16604     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16605     PERL_UNUSED_CONTEXT;
16606
16607     DEBUG_COMPILE_r(
16608         {
16609             const char * const s = SvPV_nolen_const(prog->check_substr
16610                       ? prog->check_substr : prog->check_utf8);
16611
16612             if (!PL_colorset) reginitcolors();
16613             PerlIO_printf(Perl_debug_log,
16614                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16615                       PL_colors[4],
16616                       prog->check_substr ? "" : "utf8 ",
16617                       PL_colors[5],PL_colors[0],
16618                       s,
16619                       PL_colors[1],
16620                       (strlen(s) > 60 ? "..." : ""));
16621         } );
16622
16623     return prog->check_substr ? prog->check_substr : prog->check_utf8;
16624 }
16625
16626 /*
16627    pregfree()
16628
16629    handles refcounting and freeing the perl core regexp structure. When
16630    it is necessary to actually free the structure the first thing it
16631    does is call the 'free' method of the regexp_engine associated to
16632    the regexp, allowing the handling of the void *pprivate; member
16633    first. (This routine is not overridable by extensions, which is why
16634    the extensions free is called first.)
16635
16636    See regdupe and regdupe_internal if you change anything here.
16637 */
16638 #ifndef PERL_IN_XSUB_RE
16639 void
16640 Perl_pregfree(pTHX_ REGEXP *r)
16641 {
16642     SvREFCNT_dec(r);
16643 }
16644
16645 void
16646 Perl_pregfree2(pTHX_ REGEXP *rx)
16647 {
16648     struct regexp *const r = ReANY(rx);
16649     GET_RE_DEBUG_FLAGS_DECL;
16650
16651     PERL_ARGS_ASSERT_PREGFREE2;
16652
16653     if (r->mother_re) {
16654         ReREFCNT_dec(r->mother_re);
16655     } else {
16656         CALLREGFREE_PVT(rx); /* free the private data */
16657         SvREFCNT_dec(RXp_PAREN_NAMES(r));
16658         Safefree(r->xpv_len_u.xpvlenu_pv);
16659     }
16660     if (r->substrs) {
16661         SvREFCNT_dec(r->anchored_substr);
16662         SvREFCNT_dec(r->anchored_utf8);
16663         SvREFCNT_dec(r->float_substr);
16664         SvREFCNT_dec(r->float_utf8);
16665         Safefree(r->substrs);
16666     }
16667     RX_MATCH_COPY_FREE(rx);
16668 #ifdef PERL_ANY_COW
16669     SvREFCNT_dec(r->saved_copy);
16670 #endif
16671     Safefree(r->offs);
16672     SvREFCNT_dec(r->qr_anoncv);
16673     rx->sv_u.svu_rx = 0;
16674 }
16675
16676 /*  reg_temp_copy()
16677
16678     This is a hacky workaround to the structural issue of match results
16679     being stored in the regexp structure which is in turn stored in
16680     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16681     could be PL_curpm in multiple contexts, and could require multiple
16682     result sets being associated with the pattern simultaneously, such
16683     as when doing a recursive match with (??{$qr})
16684
16685     The solution is to make a lightweight copy of the regexp structure
16686     when a qr// is returned from the code executed by (??{$qr}) this
16687     lightweight copy doesn't actually own any of its data except for
16688     the starp/end and the actual regexp structure itself.
16689
16690 */
16691
16692
16693 REGEXP *
16694 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16695 {
16696     struct regexp *ret;
16697     struct regexp *const r = ReANY(rx);
16698     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16699
16700     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16701
16702     if (!ret_x)
16703         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16704     else {
16705         SvOK_off((SV *)ret_x);
16706         if (islv) {
16707             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16708                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16709                made both spots point to the same regexp body.) */
16710             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16711             assert(!SvPVX(ret_x));
16712             ret_x->sv_u.svu_rx = temp->sv_any;
16713             temp->sv_any = NULL;
16714             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16715             SvREFCNT_dec_NN(temp);
16716             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16717                ing below will not set it. */
16718             SvCUR_set(ret_x, SvCUR(rx));
16719         }
16720     }
16721     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16722        sv_force_normal(sv) is called.  */
16723     SvFAKE_on(ret_x);
16724     ret = ReANY(ret_x);
16725
16726     SvFLAGS(ret_x) |= SvUTF8(rx);
16727     /* We share the same string buffer as the original regexp, on which we
16728        hold a reference count, incremented when mother_re is set below.
16729        The string pointer is copied here, being part of the regexp struct.
16730      */
16731     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16732            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16733     if (r->offs) {
16734         const I32 npar = r->nparens+1;
16735         Newx(ret->offs, npar, regexp_paren_pair);
16736         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16737     }
16738     if (r->substrs) {
16739         Newx(ret->substrs, 1, struct reg_substr_data);
16740         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16741
16742         SvREFCNT_inc_void(ret->anchored_substr);
16743         SvREFCNT_inc_void(ret->anchored_utf8);
16744         SvREFCNT_inc_void(ret->float_substr);
16745         SvREFCNT_inc_void(ret->float_utf8);
16746
16747         /* check_substr and check_utf8, if non-NULL, point to either their
16748            anchored or float namesakes, and don't hold a second reference.  */
16749     }
16750     RX_MATCH_COPIED_off(ret_x);
16751 #ifdef PERL_ANY_COW
16752     ret->saved_copy = NULL;
16753 #endif
16754     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16755     SvREFCNT_inc_void(ret->qr_anoncv);
16756
16757     return ret_x;
16758 }
16759 #endif
16760
16761 /* regfree_internal()
16762
16763    Free the private data in a regexp. This is overloadable by
16764    extensions. Perl takes care of the regexp structure in pregfree(),
16765    this covers the *pprivate pointer which technically perl doesn't
16766    know about, however of course we have to handle the
16767    regexp_internal structure when no extension is in use.
16768
16769    Note this is called before freeing anything in the regexp
16770    structure.
16771  */
16772
16773 void
16774 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16775 {
16776     struct regexp *const r = ReANY(rx);
16777     RXi_GET_DECL(r,ri);
16778     GET_RE_DEBUG_FLAGS_DECL;
16779
16780     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16781
16782     DEBUG_COMPILE_r({
16783         if (!PL_colorset)
16784             reginitcolors();
16785         {
16786             SV *dsv= sv_newmortal();
16787             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16788                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16789             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16790                 PL_colors[4],PL_colors[5],s);
16791         }
16792     });
16793 #ifdef RE_TRACK_PATTERN_OFFSETS
16794     if (ri->u.offsets)
16795         Safefree(ri->u.offsets);             /* 20010421 MJD */
16796 #endif
16797     if (ri->code_blocks) {
16798         int n;
16799         for (n = 0; n < ri->num_code_blocks; n++)
16800             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16801         Safefree(ri->code_blocks);
16802     }
16803
16804     if (ri->data) {
16805         int n = ri->data->count;
16806
16807         while (--n >= 0) {
16808           /* If you add a ->what type here, update the comment in regcomp.h */
16809             switch (ri->data->what[n]) {
16810             case 'a':
16811             case 'r':
16812             case 's':
16813             case 'S':
16814             case 'u':
16815                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16816                 break;
16817             case 'f':
16818                 Safefree(ri->data->data[n]);
16819                 break;
16820             case 'l':
16821             case 'L':
16822                 break;
16823             case 'T':
16824                 { /* Aho Corasick add-on structure for a trie node.
16825                      Used in stclass optimization only */
16826                     U32 refcount;
16827                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16828 #ifdef USE_ITHREADS
16829                     dVAR;
16830 #endif
16831                     OP_REFCNT_LOCK;
16832                     refcount = --aho->refcount;
16833                     OP_REFCNT_UNLOCK;
16834                     if ( !refcount ) {
16835                         PerlMemShared_free(aho->states);
16836                         PerlMemShared_free(aho->fail);
16837                          /* do this last!!!! */
16838                         PerlMemShared_free(ri->data->data[n]);
16839                         /* we should only ever get called once, so
16840                          * assert as much, and also guard the free
16841                          * which /might/ happen twice. At the least
16842                          * it will make code anlyzers happy and it
16843                          * doesn't cost much. - Yves */
16844                         assert(ri->regstclass);
16845                         if (ri->regstclass) {
16846                             PerlMemShared_free(ri->regstclass);
16847                             ri->regstclass = 0;
16848                         }
16849                     }
16850                 }
16851                 break;
16852             case 't':
16853                 {
16854                     /* trie structure. */
16855                     U32 refcount;
16856                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16857 #ifdef USE_ITHREADS
16858                     dVAR;
16859 #endif
16860                     OP_REFCNT_LOCK;
16861                     refcount = --trie->refcount;
16862                     OP_REFCNT_UNLOCK;
16863                     if ( !refcount ) {
16864                         PerlMemShared_free(trie->charmap);
16865                         PerlMemShared_free(trie->states);
16866                         PerlMemShared_free(trie->trans);
16867                         if (trie->bitmap)
16868                             PerlMemShared_free(trie->bitmap);
16869                         if (trie->jump)
16870                             PerlMemShared_free(trie->jump);
16871                         PerlMemShared_free(trie->wordinfo);
16872                         /* do this last!!!! */
16873                         PerlMemShared_free(ri->data->data[n]);
16874                     }
16875                 }
16876                 break;
16877             default:
16878                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16879                                                     ri->data->what[n]);
16880             }
16881         }
16882         Safefree(ri->data->what);
16883         Safefree(ri->data);
16884     }
16885
16886     Safefree(ri);
16887 }
16888
16889 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16890 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16891 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16892
16893 /*
16894    re_dup - duplicate a regexp.
16895
16896    This routine is expected to clone a given regexp structure. It is only
16897    compiled under USE_ITHREADS.
16898
16899    After all of the core data stored in struct regexp is duplicated
16900    the regexp_engine.dupe method is used to copy any private data
16901    stored in the *pprivate pointer. This allows extensions to handle
16902    any duplication it needs to do.
16903
16904    See pregfree() and regfree_internal() if you change anything here.
16905 */
16906 #if defined(USE_ITHREADS)
16907 #ifndef PERL_IN_XSUB_RE
16908 void
16909 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16910 {
16911     dVAR;
16912     I32 npar;
16913     const struct regexp *r = ReANY(sstr);
16914     struct regexp *ret = ReANY(dstr);
16915
16916     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16917
16918     npar = r->nparens+1;
16919     Newx(ret->offs, npar, regexp_paren_pair);
16920     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16921
16922     if (ret->substrs) {
16923         /* Do it this way to avoid reading from *r after the StructCopy().
16924            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16925            cache, it doesn't matter.  */
16926         const bool anchored = r->check_substr
16927             ? r->check_substr == r->anchored_substr
16928             : r->check_utf8 == r->anchored_utf8;
16929         Newx(ret->substrs, 1, struct reg_substr_data);
16930         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16931
16932         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16933         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16934         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16935         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16936
16937         /* check_substr and check_utf8, if non-NULL, point to either their
16938            anchored or float namesakes, and don't hold a second reference.  */
16939
16940         if (ret->check_substr) {
16941             if (anchored) {
16942                 assert(r->check_utf8 == r->anchored_utf8);
16943                 ret->check_substr = ret->anchored_substr;
16944                 ret->check_utf8 = ret->anchored_utf8;
16945             } else {
16946                 assert(r->check_substr == r->float_substr);
16947                 assert(r->check_utf8 == r->float_utf8);
16948                 ret->check_substr = ret->float_substr;
16949                 ret->check_utf8 = ret->float_utf8;
16950             }
16951         } else if (ret->check_utf8) {
16952             if (anchored) {
16953                 ret->check_utf8 = ret->anchored_utf8;
16954             } else {
16955                 ret->check_utf8 = ret->float_utf8;
16956             }
16957         }
16958     }
16959
16960     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16961     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16962
16963     if (ret->pprivate)
16964         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16965
16966     if (RX_MATCH_COPIED(dstr))
16967         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16968     else
16969         ret->subbeg = NULL;
16970 #ifdef PERL_ANY_COW
16971     ret->saved_copy = NULL;
16972 #endif
16973
16974     /* Whether mother_re be set or no, we need to copy the string.  We
16975        cannot refrain from copying it when the storage points directly to
16976        our mother regexp, because that's
16977                1: a buffer in a different thread
16978                2: something we no longer hold a reference on
16979                so we need to copy it locally.  */
16980     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16981     ret->mother_re   = NULL;
16982 }
16983 #endif /* PERL_IN_XSUB_RE */
16984
16985 /*
16986    regdupe_internal()
16987
16988    This is the internal complement to regdupe() which is used to copy
16989    the structure pointed to by the *pprivate pointer in the regexp.
16990    This is the core version of the extension overridable cloning hook.
16991    The regexp structure being duplicated will be copied by perl prior
16992    to this and will be provided as the regexp *r argument, however
16993    with the /old/ structures pprivate pointer value. Thus this routine
16994    may override any copying normally done by perl.
16995
16996    It returns a pointer to the new regexp_internal structure.
16997 */
16998
16999 void *
17000 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
17001 {
17002     dVAR;
17003     struct regexp *const r = ReANY(rx);
17004     regexp_internal *reti;
17005     int len;
17006     RXi_GET_DECL(r,ri);
17007
17008     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
17009
17010     len = ProgLen(ri);
17011
17012     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
17013           char, regexp_internal);
17014     Copy(ri->program, reti->program, len+1, regnode);
17015
17016     reti->num_code_blocks = ri->num_code_blocks;
17017     if (ri->code_blocks) {
17018         int n;
17019         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
17020                 struct reg_code_block);
17021         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
17022                 struct reg_code_block);
17023         for (n = 0; n < ri->num_code_blocks; n++)
17024              reti->code_blocks[n].src_regex = (REGEXP*)
17025                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
17026     }
17027     else
17028         reti->code_blocks = NULL;
17029
17030     reti->regstclass = NULL;
17031
17032     if (ri->data) {
17033         struct reg_data *d;
17034         const int count = ri->data->count;
17035         int i;
17036
17037         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
17038                 char, struct reg_data);
17039         Newx(d->what, count, U8);
17040
17041         d->count = count;
17042         for (i = 0; i < count; i++) {
17043             d->what[i] = ri->data->what[i];
17044             switch (d->what[i]) {
17045                 /* see also regcomp.h and regfree_internal() */
17046             case 'a': /* actually an AV, but the dup function is identical.  */
17047             case 'r':
17048             case 's':
17049             case 'S':
17050             case 'u': /* actually an HV, but the dup function is identical.  */
17051                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17052                 break;
17053             case 'f':
17054                 /* This is cheating. */
17055                 Newx(d->data[i], 1, regnode_ssc);
17056                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17057                 reti->regstclass = (regnode*)d->data[i];
17058                 break;
17059             case 'T':
17060                 /* Trie stclasses are readonly and can thus be shared
17061                  * without duplication. We free the stclass in pregfree
17062                  * when the corresponding reg_ac_data struct is freed.
17063                  */
17064                 reti->regstclass= ri->regstclass;
17065                 /* FALLTHROUGH */
17066             case 't':
17067                 OP_REFCNT_LOCK;
17068                 ((reg_trie_data*)ri->data->data[i])->refcount++;
17069                 OP_REFCNT_UNLOCK;
17070                 /* FALLTHROUGH */
17071             case 'l':
17072             case 'L':
17073                 d->data[i] = ri->data->data[i];
17074                 break;
17075             default:
17076                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17077                                                            ri->data->what[i]);
17078             }
17079         }
17080
17081         reti->data = d;
17082     }
17083     else
17084         reti->data = NULL;
17085
17086     reti->name_list_idx = ri->name_list_idx;
17087
17088 #ifdef RE_TRACK_PATTERN_OFFSETS
17089     if (ri->u.offsets) {
17090         Newx(reti->u.offsets, 2*len+1, U32);
17091         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17092     }
17093 #else
17094     SetProgLen(reti,len);
17095 #endif
17096
17097     return (void*)reti;
17098 }
17099
17100 #endif    /* USE_ITHREADS */
17101
17102 #ifndef PERL_IN_XSUB_RE
17103
17104 /*
17105  - regnext - dig the "next" pointer out of a node
17106  */
17107 regnode *
17108 Perl_regnext(pTHX_ regnode *p)
17109 {
17110     I32 offset;
17111
17112     if (!p)
17113         return(NULL);
17114
17115     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
17116         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17117                                                 (int)OP(p), (int)REGNODE_MAX);
17118     }
17119
17120     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17121     if (offset == 0)
17122         return(NULL);
17123
17124     return(p+offset);
17125 }
17126 #endif
17127
17128 STATIC void
17129 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17130 {
17131     va_list args;
17132     STRLEN l1 = strlen(pat1);
17133     STRLEN l2 = strlen(pat2);
17134     char buf[512];
17135     SV *msv;
17136     const char *message;
17137
17138     PERL_ARGS_ASSERT_RE_CROAK2;
17139
17140     if (l1 > 510)
17141         l1 = 510;
17142     if (l1 + l2 > 510)
17143         l2 = 510 - l1;
17144     Copy(pat1, buf, l1 , char);
17145     Copy(pat2, buf + l1, l2 , char);
17146     buf[l1 + l2] = '\n';
17147     buf[l1 + l2 + 1] = '\0';
17148     va_start(args, pat2);
17149     msv = vmess(buf, &args);
17150     va_end(args);
17151     message = SvPV_const(msv,l1);
17152     if (l1 > 512)
17153         l1 = 512;
17154     Copy(message, buf, l1 , char);
17155     /* l1-1 to avoid \n */
17156     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17157 }
17158
17159 #ifdef DEBUGGING
17160 /* Certain characters are output as a sequence with the first being a
17161  * backslash. */
17162 #define isBACKSLASHED_PUNCT(c)                                              \
17163                     ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
17164
17165 STATIC void
17166 S_put_code_point(pTHX_ SV *sv, UV c)
17167 {
17168     PERL_ARGS_ASSERT_PUT_CODE_POINT;
17169
17170     if (c > 255) {
17171         Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17172     }
17173     else if (isPRINT(c)) {
17174         const char string = (char) c;
17175         if (isBACKSLASHED_PUNCT(c))
17176             sv_catpvs(sv, "\\");
17177         sv_catpvn(sv, &string, 1);
17178     }
17179     else {
17180         const char * const mnemonic = cntrl_to_mnemonic((char) c);
17181         if (mnemonic) {
17182             Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17183         }
17184         else {
17185             Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17186         }
17187     }
17188 }
17189
17190 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17191
17192 STATIC void
17193 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17194 {
17195     /* Appends to 'sv' a displayable version of the range of code points from
17196      * 'start' to 'end'.  It assumes that only ASCII printables are displayable
17197      * as-is (though some of these will be escaped by put_code_point()). */
17198
17199     const unsigned int min_range_count = 3;
17200
17201     assert(start <= end);
17202
17203     PERL_ARGS_ASSERT_PUT_RANGE;
17204
17205     while (start <= end) {
17206         UV this_end;
17207         const char * format;
17208
17209         if (end - start < min_range_count) {
17210
17211             /* Individual chars in short ranges */
17212             for (; start <= end; start++) {
17213                 put_code_point(sv, start);
17214             }
17215             break;
17216         }
17217
17218         /* If permitted by the input options, and there is a possibility that
17219          * this range contains a printable literal, look to see if there is
17220          * one.  */
17221         if (allow_literals && start <= MAX_PRINT_A) {
17222
17223             /* If the range begin isn't an ASCII printable, effectively split
17224              * the range into two parts:
17225              *  1) the portion before the first such printable,
17226              *  2) the rest
17227              * and output them separately. */
17228             if (! isPRINT_A(start)) {
17229                 UV temp_end = start + 1;
17230
17231                 /* There is no point looking beyond the final possible
17232                  * printable, in MAX_PRINT_A */
17233                 UV max = MIN(end, MAX_PRINT_A);
17234
17235                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17236                     temp_end++;
17237                 }
17238
17239                 /* Here, temp_end points to one beyond the first printable if
17240                  * found, or to one beyond 'max' if not.  If none found, make
17241                  * sure that we use the entire range */
17242                 if (temp_end > MAX_PRINT_A) {
17243                     temp_end = end + 1;
17244                 }
17245
17246                 /* Output the first part of the split range, the part that
17247                  * doesn't have printables, with no looking for literals
17248                  * (otherwise we would infinitely recurse) */
17249                 put_range(sv, start, temp_end - 1, FALSE);
17250
17251                 /* The 2nd part of the range (if any) starts here. */
17252                 start = temp_end;
17253
17254                 /* We continue instead of dropping down because even if the 2nd
17255                  * part is non-empty, it could be so short that we want to
17256                  * output it specially, as tested for at the top of this loop.
17257                  * */
17258                 continue;
17259             }
17260
17261             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
17262              * output a sub-range of just the digits or letters, then process
17263              * the remaining portion as usual. */
17264             if (isALPHANUMERIC_A(start)) {
17265                 UV mask = (isDIGIT_A(start))
17266                            ? _CC_DIGIT
17267                              : isUPPER_A(start)
17268                                ? _CC_UPPER
17269                                : _CC_LOWER;
17270                 UV temp_end = start + 1;
17271
17272                 /* Find the end of the sub-range that includes just the
17273                  * characters in the same class as the first character in it */
17274                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17275                     temp_end++;
17276                 }
17277                 temp_end--;
17278
17279                 /* For short ranges, don't duplicate the code above to output
17280                  * them; just call recursively */
17281                 if (temp_end - start < min_range_count) {
17282                     put_range(sv, start, temp_end, FALSE);
17283                 }
17284                 else {  /* Output as a range */
17285                     put_code_point(sv, start);
17286                     sv_catpvs(sv, "-");
17287                     put_code_point(sv, temp_end);
17288                 }
17289                 start = temp_end + 1;
17290                 continue;
17291             }
17292
17293             /* We output any other printables as individual characters */
17294             if (isPUNCT_A(start) || isSPACE_A(start)) {
17295                 while (start <= end && (isPUNCT_A(start)
17296                                         || isSPACE_A(start)))
17297                 {
17298                     put_code_point(sv, start);
17299                     start++;
17300                 }
17301                 continue;
17302             }
17303         } /* End of looking for literals */
17304
17305         /* Here is not to output as a literal.  Some control characters have
17306          * mnemonic names.  Split off any of those at the beginning and end of
17307          * the range to print mnemonically.  It isn't possible for many of
17308          * these to be in a row, so this won't overwhelm with output */
17309         while (isMNEMONIC_CNTRL(start) && start <= end) {
17310             put_code_point(sv, start);
17311             start++;
17312         }
17313         if (start < end && isMNEMONIC_CNTRL(end)) {
17314
17315             /* Here, the final character in the range has a mnemonic name.
17316              * Work backwards from the end to find the final non-mnemonic */
17317             UV temp_end = end - 1;
17318             while (isMNEMONIC_CNTRL(temp_end)) {
17319                 temp_end--;
17320             }
17321
17322             /* And separately output the range that doesn't have mnemonics */
17323             put_range(sv, start, temp_end, FALSE);
17324
17325             /* Then output the mnemonic trailing controls */
17326             start = temp_end + 1;
17327             while (start <= end) {
17328                 put_code_point(sv, start);
17329                 start++;
17330             }
17331             break;
17332         }
17333
17334         /* As a final resort, output the range or subrange as hex. */
17335
17336         this_end = (end < NUM_ANYOF_CODE_POINTS)
17337                     ? end
17338                     : NUM_ANYOF_CODE_POINTS - 1;
17339         format = (this_end < 256)
17340                  ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17341                  : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17342         GCC_DIAG_IGNORE(-Wformat-nonliteral);
17343         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17344         GCC_DIAG_RESTORE;
17345         break;
17346     }
17347 }
17348
17349 STATIC bool
17350 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17351 {
17352     /* Appends to 'sv' a displayable version of the innards of the bracketed
17353      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
17354      * output anything, and bitmap_invlist, if not NULL, will point to an
17355      * inversion list of what is in the bit map */
17356
17357     int i;
17358     UV start, end;
17359     unsigned int punct_count = 0;
17360     SV* invlist = NULL;
17361     SV** invlist_ptr;   /* Temporary, in case bitmap_invlist is NULL */
17362     bool allow_literals = TRUE;
17363
17364     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17365
17366     invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17367
17368     /* Worst case is exactly every-other code point is in the list */
17369     *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17370
17371     /* Convert the bit map to an inversion list, keeping track of how many
17372      * ASCII puncts are set, including an extra amount for the backslashed
17373      * ones.  */
17374     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17375         if (BITMAP_TEST(bitmap, i)) {
17376             *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17377             if (isPUNCT_A(i)) {
17378                 punct_count++;
17379                 if isBACKSLASHED_PUNCT(i) {
17380                     punct_count++;
17381                 }
17382             }
17383         }
17384     }
17385
17386     /* Nothing to output */
17387     if (_invlist_len(*invlist_ptr) == 0) {
17388         SvREFCNT_dec(invlist);
17389         return FALSE;
17390     }
17391
17392     /* Generally, it is more readable if printable characters are output as
17393      * literals, but if a range (nearly) spans all of them, it's best to output
17394      * it as a single range.  This code will use a single range if all but 2
17395      * printables are in it */
17396     invlist_iterinit(*invlist_ptr);
17397     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17398
17399         /* If range starts beyond final printable, it doesn't have any in it */
17400         if (start > MAX_PRINT_A) {
17401             break;
17402         }
17403
17404         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
17405          * all but two, the range must start and end no later than 2 from
17406          * either end */
17407         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17408             if (end > MAX_PRINT_A) {
17409                 end = MAX_PRINT_A;
17410             }
17411             if (start < ' ') {
17412                 start = ' ';
17413             }
17414             if (end - start >= MAX_PRINT_A - ' ' - 2) {
17415                 allow_literals = FALSE;
17416             }
17417             break;
17418         }
17419     }
17420     invlist_iterfinish(*invlist_ptr);
17421
17422     /* The legibility of the output depends mostly on how many punctuation
17423      * characters are output.  There are 32 possible ASCII ones, and some have
17424      * an additional backslash, bringing it to currently 36, so if any more
17425      * than 18 are to be output, we can instead output it as its complement,
17426      * yielding fewer puncts, and making it more legible.  But give some weight
17427      * to the fact that outputting it as a complement is less legible than a
17428      * straight output, so don't complement unless we are somewhat over the 18
17429      * mark */
17430     if (allow_literals && punct_count > 22) {
17431         sv_catpvs(sv, "^");
17432
17433         /* Add everything remaining to the list, so when we invert it just
17434          * below, it will be excluded */
17435         _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17436         _invlist_invert(*invlist_ptr);
17437     }
17438
17439     /* Here we have figured things out.  Output each range */
17440     invlist_iterinit(*invlist_ptr);
17441     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17442         if (start >= NUM_ANYOF_CODE_POINTS) {
17443             break;
17444         }
17445         put_range(sv, start, end, allow_literals);
17446     }
17447     invlist_iterfinish(*invlist_ptr);
17448
17449     return TRUE;
17450 }
17451
17452 #define CLEAR_OPTSTART \
17453     if (optstart) STMT_START {                                               \
17454         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
17455                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17456         optstart=NULL;                                                       \
17457     } STMT_END
17458
17459 #define DUMPUNTIL(b,e)                                                       \
17460                     CLEAR_OPTSTART;                                          \
17461                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17462
17463 STATIC const regnode *
17464 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17465             const regnode *last, const regnode *plast,
17466             SV* sv, I32 indent, U32 depth)
17467 {
17468     U8 op = PSEUDO;     /* Arbitrary non-END op. */
17469     const regnode *next;
17470     const regnode *optstart= NULL;
17471
17472     RXi_GET_DECL(r,ri);
17473     GET_RE_DEBUG_FLAGS_DECL;
17474
17475     PERL_ARGS_ASSERT_DUMPUNTIL;
17476
17477 #ifdef DEBUG_DUMPUNTIL
17478     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17479         last ? last-start : 0,plast ? plast-start : 0);
17480 #endif
17481
17482     if (plast && plast < last)
17483         last= plast;
17484
17485     while (PL_regkind[op] != END && (!last || node < last)) {
17486         assert(node);
17487         /* While that wasn't END last time... */
17488         NODE_ALIGN(node);
17489         op = OP(node);
17490         if (op == CLOSE || op == WHILEM)
17491             indent--;
17492         next = regnext((regnode *)node);
17493
17494         /* Where, what. */
17495         if (OP(node) == OPTIMIZED) {
17496             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17497                 optstart = node;
17498             else
17499                 goto after_print;
17500         } else
17501             CLEAR_OPTSTART;
17502
17503         regprop(r, sv, node, NULL, NULL);
17504         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17505                       (int)(2*indent + 1), "", SvPVX_const(sv));
17506
17507         if (OP(node) != OPTIMIZED) {
17508             if (next == NULL)           /* Next ptr. */
17509                 PerlIO_printf(Perl_debug_log, " (0)");
17510             else if (PL_regkind[(U8)op] == BRANCH
17511                      && PL_regkind[OP(next)] != BRANCH )
17512                 PerlIO_printf(Perl_debug_log, " (FAIL)");
17513             else
17514                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17515             (void)PerlIO_putc(Perl_debug_log, '\n');
17516         }
17517
17518       after_print:
17519         if (PL_regkind[(U8)op] == BRANCHJ) {
17520             assert(next);
17521             {
17522                 const regnode *nnode = (OP(next) == LONGJMP
17523                                        ? regnext((regnode *)next)
17524                                        : next);
17525                 if (last && nnode > last)
17526                     nnode = last;
17527                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17528             }
17529         }
17530         else if (PL_regkind[(U8)op] == BRANCH) {
17531             assert(next);
17532             DUMPUNTIL(NEXTOPER(node), next);
17533         }
17534         else if ( PL_regkind[(U8)op]  == TRIE ) {
17535             const regnode *this_trie = node;
17536             const char op = OP(node);
17537             const U32 n = ARG(node);
17538             const reg_ac_data * const ac = op>=AHOCORASICK ?
17539                (reg_ac_data *)ri->data->data[n] :
17540                NULL;
17541             const reg_trie_data * const trie =
17542                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17543 #ifdef DEBUGGING
17544             AV *const trie_words
17545                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17546 #endif
17547             const regnode *nextbranch= NULL;
17548             I32 word_idx;
17549             sv_setpvs(sv, "");
17550             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17551                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17552
17553                 PerlIO_printf(Perl_debug_log, "%*s%s ",
17554                    (int)(2*(indent+3)), "",
17555                     elem_ptr
17556                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17557                                 SvCUR(*elem_ptr), 60,
17558                                 PL_colors[0], PL_colors[1],
17559                                 (SvUTF8(*elem_ptr)
17560                                  ? PERL_PV_ESCAPE_UNI
17561                                  : 0)
17562                                 | PERL_PV_PRETTY_ELLIPSES
17563                                 | PERL_PV_PRETTY_LTGT
17564                             )
17565                     : "???"
17566                 );
17567                 if (trie->jump) {
17568                     U16 dist= trie->jump[word_idx+1];
17569                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17570                                (UV)((dist ? this_trie + dist : next) - start));
17571                     if (dist) {
17572                         if (!nextbranch)
17573                             nextbranch= this_trie + trie->jump[0];
17574                         DUMPUNTIL(this_trie + dist, nextbranch);
17575                     }
17576                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17577                         nextbranch= regnext((regnode *)nextbranch);
17578                 } else {
17579                     PerlIO_printf(Perl_debug_log, "\n");
17580                 }
17581             }
17582             if (last && next > last)
17583                 node= last;
17584             else
17585                 node= next;
17586         }
17587         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
17588             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17589                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17590         }
17591         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17592             assert(next);
17593             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17594         }
17595         else if ( op == PLUS || op == STAR) {
17596             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17597         }
17598         else if (PL_regkind[(U8)op] == ANYOF) {
17599             /* arglen 1 + class block */
17600             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17601                           ? ANYOF_POSIXL_SKIP
17602                           : ANYOF_SKIP);
17603             node = NEXTOPER(node);
17604         }
17605         else if (PL_regkind[(U8)op] == EXACT) {
17606             /* Literal string, where present. */
17607             node += NODE_SZ_STR(node) - 1;
17608             node = NEXTOPER(node);
17609         }
17610         else {
17611             node = NEXTOPER(node);
17612             node += regarglen[(U8)op];
17613         }
17614         if (op == CURLYX || op == OPEN)
17615             indent++;
17616     }
17617     CLEAR_OPTSTART;
17618 #ifdef DEBUG_DUMPUNTIL
17619     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17620 #endif
17621     return node;
17622 }
17623
17624 #endif  /* DEBUGGING */
17625
17626 /*
17627  * Local variables:
17628  * c-indentation-style: bsd
17629  * c-basic-offset: 4
17630  * indent-tabs-mode: nil
17631  * End:
17632  *
17633  * ex: set ts=8 sts=4 sw=4 et:
17634  */