regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_inline.h"
  90 #include "invlist_inline.h"
  91 #include "unicode_constants.h"
  92
  93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  94  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  96  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99
 100 #ifndef STATIC
 101 #define STATIC  static
 102 #endif
 103
 104 /* this is a chain of data about sub patterns we are processing that
 105    need to be handled separately/specially in study_chunk. Its so
 106    we can simulate recursion without losing state.  */
 107 struct scan_frame;
 108 typedef struct scan_frame {
 109     regnode *last_regnode;      /* last node to process in this frame */
 110     regnode *next_regnode;      /* next node to process when last is reached */
 111     U32 prev_recursed_depth;
 112     I32 stopparen;              /* what stopparen do we use */
 113     U32 is_top_frame;           /* what flags do we use? */
 114
 115     struct scan_frame *this_prev_frame; /* this previous frame */
 116     struct scan_frame *prev_frame;      /* previous frame */
 117     struct scan_frame *next_frame;      /* next frame */
 118 } scan_frame;
 119
 120 /* Certain characters are output as a sequence with the first being a
 121  * backslash. */
 122 #define isBACKSLASHED_PUNCT(c)                                              \
 123                     ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
 124
 125
 126 struct RExC_state_t {
 127     U32         flags;                  /* RXf_* are we folding, multilining? */
 128     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 129     char        *precomp;               /* uncompiled string. */
 130     char        *precomp_end;           /* pointer to end of uncompiled string. */
 131     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 132     regexp      *rx;                    /* perl core regexp structure */
 133     regexp_internal     *rxi;           /* internal data for regexp object
 134                                            pprivate field */
 135     char        *start;                 /* Start of input for compile */
 136     char        *end;                   /* End of input for compile */
 137     char        *parse;                 /* Input-scan pointer. */
 138     char        *adjusted_start;        /* 'start', adjusted.  See code use */
 139     STRLEN      precomp_adj;            /* an offset beyond precomp.  See code use */
 140     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 141     regnode     *emit_start;            /* Start of emitted-code area */
 142     regnode     *emit_bound;            /* First regnode outside of the
 143                                            allocated space */
 144     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 145                                            implies compiling, so don't emit */
 146     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 147                                            large enough for the largest
 148                                            non-EXACTish node, so can use it as
 149                                            scratch in pass1 */
 150     I32         naughty;                /* How bad is this pattern? */
 151     I32         sawback;                /* Did we see \1, ...? */
 152     U32         seen;
 153     SSize_t     size;                   /* Code size. */
 154     I32                npar;            /* Capture buffer count, (OPEN) plus
 155                                            one. ("par" 0 is the whole
 156                                            pattern)*/
 157     I32         nestroot;               /* root parens we are in - used by
 158                                            accept */
 159     I32         extralen;
 160     I32         seen_zerolen;
 161     regnode     **open_parens;          /* pointers to open parens */
 162     regnode     **close_parens;         /* pointers to close parens */
 163     regnode     *end_op;                /* END node in program */
 164     I32         utf8;           /* whether the pattern is utf8 or not */
 165     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 166                                 /* XXX use this for future optimisation of case
 167                                  * where pattern must be upgraded to utf8. */
 168     I32         uni_semantics;  /* If a d charset modifier should use unicode
 169                                    rules, even if the pattern is not in
 170                                    utf8 */
 171     HV          *paren_names;           /* Paren names */
 172
 173     regnode     **recurse;              /* Recurse regops */
 174     I32                recurse_count;                /* Number of recurse regops we have generated */
 175     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 176                                            through */
 177     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 178     I32         in_lookbehind;
 179     I32         contains_locale;
 180     I32         override_recoding;
 181 #ifdef EBCDIC
 182     I32         recode_x_to_native;
 183 #endif
 184     I32         in_multi_char_class;
 185     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
 186                                             within pattern */
 187     int         code_index;             /* next code_blocks[] slot */
 188     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 189     scan_frame *frame_head;
 190     scan_frame *frame_last;
 191     U32         frame_count;
 192     AV         *warn_text;
 193 #ifdef ADD_TO_REGEXEC
 194     char        *starttry;              /* -Dr: where regtry was called. */
 195 #define RExC_starttry   (pRExC_state->starttry)
 196 #endif
 197     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 198 #ifdef DEBUGGING
 199     const char  *lastparse;
 200     I32         lastnum;
 201     AV          *paren_name_list;       /* idx -> name */
 202     U32         study_chunk_recursed_count;
 203     SV          *mysv1;
 204     SV          *mysv2;
 205 #define RExC_lastparse  (pRExC_state->lastparse)
 206 #define RExC_lastnum    (pRExC_state->lastnum)
 207 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 208 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 209 #define RExC_mysv       (pRExC_state->mysv1)
 210 #define RExC_mysv1      (pRExC_state->mysv1)
 211 #define RExC_mysv2      (pRExC_state->mysv2)
 212
 213 #endif
 214     bool        seen_unfolded_sharp_s;
 215     bool        strict;
 216     bool        study_started;
 217 };
 218
 219 #define RExC_flags      (pRExC_state->flags)
 220 #define RExC_pm_flags   (pRExC_state->pm_flags)
 221 #define RExC_precomp    (pRExC_state->precomp)
 222 #define RExC_precomp_adj (pRExC_state->precomp_adj)
 223 #define RExC_adjusted_start  (pRExC_state->adjusted_start)
 224 #define RExC_precomp_end (pRExC_state->precomp_end)
 225 #define RExC_rx_sv      (pRExC_state->rx_sv)
 226 #define RExC_rx         (pRExC_state->rx)
 227 #define RExC_rxi        (pRExC_state->rxi)
 228 #define RExC_start      (pRExC_state->start)
 229 #define RExC_end        (pRExC_state->end)
 230 #define RExC_parse      (pRExC_state->parse)
 231 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 232
 233 /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
 234  * EXACTF node, hence was parsed under /di rules.  If later in the parse,
 235  * something forces the pattern into using /ui rules, the sharp s should be
 236  * folded into the sequence 'ss', which takes up more space than previously
 237  * calculated.  This means that the sizing pass needs to be restarted.  (The
 238  * node also becomes an EXACTFU_SS.)  For all other characters, an EXACTF node
 239  * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
 240  * so there is no need to resize [perl #125990]. */
 241 #define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
 242
 243 #ifdef RE_TRACK_PATTERN_OFFSETS
 244 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 245                                                          others */
 246 #endif
 247 #define RExC_emit       (pRExC_state->emit)
 248 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 249 #define RExC_emit_start (pRExC_state->emit_start)
 250 #define RExC_emit_bound (pRExC_state->emit_bound)
 251 #define RExC_sawback    (pRExC_state->sawback)
 252 #define RExC_seen       (pRExC_state->seen)
 253 #define RExC_size       (pRExC_state->size)
 254 #define RExC_maxlen        (pRExC_state->maxlen)
 255 #define RExC_npar       (pRExC_state->npar)
 256 #define RExC_nestroot   (pRExC_state->nestroot)
 257 #define RExC_extralen   (pRExC_state->extralen)
 258 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 259 #define RExC_utf8       (pRExC_state->utf8)
 260 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 261 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 262 #define RExC_open_parens        (pRExC_state->open_parens)
 263 #define RExC_close_parens       (pRExC_state->close_parens)
 264 #define RExC_end_op     (pRExC_state->end_op)
 265 #define RExC_paren_names        (pRExC_state->paren_names)
 266 #define RExC_recurse    (pRExC_state->recurse)
 267 #define RExC_recurse_count      (pRExC_state->recurse_count)
 268 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 269 #define RExC_study_chunk_recursed_bytes  \
 270                                    (pRExC_state->study_chunk_recursed_bytes)
 271 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 272 #define RExC_contains_locale    (pRExC_state->contains_locale)
 273 #ifdef EBCDIC
 274 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 275 #endif
 276 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 277 #define RExC_frame_head (pRExC_state->frame_head)
 278 #define RExC_frame_last (pRExC_state->frame_last)
 279 #define RExC_frame_count (pRExC_state->frame_count)
 280 #define RExC_strict (pRExC_state->strict)
 281 #define RExC_study_started      (pRExC_state->study_started)
 282 #define RExC_warn_text (pRExC_state->warn_text)
 283
 284 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 285  * a flag to disable back-off on the fixed/floating substrings - if it's
 286  * a high complexity pattern we assume the benefit of avoiding a full match
 287  * is worth the cost of checking for the substrings even if they rarely help.
 288  */
 289 #define RExC_naughty    (pRExC_state->naughty)
 290 #define TOO_NAUGHTY (10)
 291 #define MARK_NAUGHTY(add) \
 292     if (RExC_naughty < TOO_NAUGHTY) \
 293         RExC_naughty += (add)
 294 #define MARK_NAUGHTY_EXP(exp, add) \
 295     if (RExC_naughty < TOO_NAUGHTY) \
 296         RExC_naughty += RExC_naughty / (exp) + (add)
 297
 298 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 299 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 300         ((*s) == '{' && regcurly(s)))
 301
 302 /*
 303  * Flags to be passed up and down.
 304  */
 305 #define WORST           0       /* Worst case. */
 306 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 307
 308 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 309  * character.  (There needs to be a case: in the switch statement in regexec.c
 310  * for any node marked SIMPLE.)  Note that this is not the same thing as
 311  * REGNODE_SIMPLE */
 312 #define SIMPLE          0x02
 313 #define SPSTART         0x04    /* Starts with * or + */
 314 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 315 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 316 #define RESTART_PASS1   0x20    /* Need to restart sizing pass */
 317 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PASS1, need to
 318                                    calcuate sizes as UTF-8 */
 319
 320 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 321
 322 /* whether trie related optimizations are enabled */
 323 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 324 #define TRIE_STUDY_OPT
 325 #define FULL_TRIE_STUDY
 326 #define TRIE_STCLASS
 327 #endif
 328
 329
 330
 331 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 332 #define PBITVAL(paren) (1 << ((paren) & 7))
 333 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 334 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 335 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 336
 337 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 338                                      if (!UTF) {                           \
 339                                          assert(PASS1);                    \
 340                                          *flagp = RESTART_PASS1|NEED_UTF8; \
 341                                          return NULL;                      \
 342                                      }                                     \
 343                              } STMT_END
 344
 345 /* Change from /d into /u rules, and restart the parse if we've already seen
 346  * something whose size would increase as a result, by setting *flagp and
 347  * returning 'restart_retval'.  RExC_uni_semantics is a flag that indicates
 348  * we've change to /u during the parse.  */
 349 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 350     STMT_START {                                                            \
 351             if (DEPENDS_SEMANTICS) {                                        \
 352                 assert(PASS1);                                              \
 353                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 354                 RExC_uni_semantics = 1;                                     \
 355                 if (RExC_seen_unfolded_sharp_s) {                           \
 356                     *flagp |= RESTART_PASS1;                                \
 357                     return restart_retval;                                  \
 358                 }                                                           \
 359             }                                                               \
 360     } STMT_END
 361
 362 /* This converts the named class defined in regcomp.h to its equivalent class
 363  * number defined in handy.h. */
 364 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 365 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 366
 367 #define _invlist_union_complement_2nd(a, b, output) \
 368                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 369 #define _invlist_intersection_complement_2nd(a, b, output) \
 370                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 371
 372 /* About scan_data_t.
 373
 374   During optimisation we recurse through the regexp program performing
 375   various inplace (keyhole style) optimisations. In addition study_chunk
 376   and scan_commit populate this data structure with information about
 377   what strings MUST appear in the pattern. We look for the longest
 378   string that must appear at a fixed location, and we look for the
 379   longest string that may appear at a floating location. So for instance
 380   in the pattern:
 381
 382     /FOO[xX]A.*B[xX]BAR/
 383
 384   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 385   strings (because they follow a .* construct). study_chunk will identify
 386   both FOO and BAR as being the longest fixed and floating strings respectively.
 387
 388   The strings can be composites, for instance
 389
 390      /(f)(o)(o)/
 391
 392   will result in a composite fixed substring 'foo'.
 393
 394   For each string some basic information is maintained:
 395
 396   - offset or min_offset
 397     This is the position the string must appear at, or not before.
 398     It also implicitly (when combined with minlenp) tells us how many
 399     characters must match before the string we are searching for.
 400     Likewise when combined with minlenp and the length of the string it
 401     tells us how many characters must appear after the string we have
 402     found.
 403
 404   - max_offset
 405     Only used for floating strings. This is the rightmost point that
 406     the string can appear at. If set to SSize_t_MAX it indicates that the
 407     string can occur infinitely far to the right.
 408
 409   - minlenp
 410     A pointer to the minimum number of characters of the pattern that the
 411     string was found inside. This is important as in the case of positive
 412     lookahead or positive lookbehind we can have multiple patterns
 413     involved. Consider
 414
 415     /(?=FOO).*F/
 416
 417     The minimum length of the pattern overall is 3, the minimum length
 418     of the lookahead part is 3, but the minimum length of the part that
 419     will actually match is 1. So 'FOO's minimum length is 3, but the
 420     minimum length for the F is 1. This is important as the minimum length
 421     is used to determine offsets in front of and behind the string being
 422     looked for.  Since strings can be composites this is the length of the
 423     pattern at the time it was committed with a scan_commit. Note that
 424     the length is calculated by study_chunk, so that the minimum lengths
 425     are not known until the full pattern has been compiled, thus the
 426     pointer to the value.
 427
 428   - lookbehind
 429
 430     In the case of lookbehind the string being searched for can be
 431     offset past the start point of the final matching string.
 432     If this value was just blithely removed from the min_offset it would
 433     invalidate some of the calculations for how many chars must match
 434     before or after (as they are derived from min_offset and minlen and
 435     the length of the string being searched for).
 436     When the final pattern is compiled and the data is moved from the
 437     scan_data_t structure into the regexp structure the information
 438     about lookbehind is factored in, with the information that would
 439     have been lost precalculated in the end_shift field for the
 440     associated string.
 441
 442   The fields pos_min and pos_delta are used to store the minimum offset
 443   and the delta to the maximum offset at the current point in the pattern.
 444
 445 */
 446
 447 typedef struct scan_data_t {
 448     /*I32 len_min;      unused */
 449     /*I32 len_delta;    unused */
 450     SSize_t pos_min;
 451     SSize_t pos_delta;
 452     SV *last_found;
 453     SSize_t last_end;       /* min value, <0 unless valid. */
 454     SSize_t last_start_min;
 455     SSize_t last_start_max;
 456     SV **longest;           /* Either &l_fixed, or &l_float. */
 457     SV *longest_fixed;      /* longest fixed string found in pattern */
 458     SSize_t offset_fixed;   /* offset where it starts */
 459     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 460     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 461     SV *longest_float;      /* longest floating string found in pattern */
 462     SSize_t offset_float_min; /* earliest point in string it can appear */
 463     SSize_t offset_float_max; /* latest point in string it can appear */
 464     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 465     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 466     I32 flags;
 467     I32 whilem_c;
 468     SSize_t *last_closep;
 469     regnode_ssc *start_class;
 470 } scan_data_t;
 471
 472 /*
 473  * Forward declarations for pregcomp()'s friends.
 474  */
 475
 476 static const scan_data_t zero_scan_data =
 477   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 478
 479 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 480 #define SF_BEFORE_SEOL          0x0001
 481 #define SF_BEFORE_MEOL          0x0002
 482 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 483 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 484
 485 #define SF_FIX_SHIFT_EOL        (+2)
 486 #define SF_FL_SHIFT_EOL         (+4)
 487
 488 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 489 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 490
 491 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 492 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 493 #define SF_IS_INF               0x0040
 494 #define SF_HAS_PAR              0x0080
 495 #define SF_IN_PAR               0x0100
 496 #define SF_HAS_EVAL             0x0200
 497
 498
 499 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 500  * longest substring in the pattern. When it is not set the optimiser keeps
 501  * track of position, but does not keep track of the actual strings seen,
 502  *
 503  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 504  * /foo/i will not.
 505  *
 506  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 507  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 508  * turned off because of the alternation (BRANCH). */
 509 #define SCF_DO_SUBSTR           0x0400
 510
 511 #define SCF_DO_STCLASS_AND      0x0800
 512 #define SCF_DO_STCLASS_OR       0x1000
 513 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 514 #define SCF_WHILEM_VISITED_POS  0x2000
 515
 516 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 517 #define SCF_SEEN_ACCEPT         0x8000
 518 #define SCF_TRIE_DOING_RESTUDY 0x10000
 519 #define SCF_IN_DEFINE          0x20000
 520
 521
 522
 523
 524 #define UTF cBOOL(RExC_utf8)
 525
 526 /* The enums for all these are ordered so things work out correctly */
 527 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 528 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 529                                                      == REGEX_DEPENDS_CHARSET)
 530 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 531 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 532                                                      >= REGEX_UNICODE_CHARSET)
 533 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 534                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 535 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 536                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 537 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 538                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 539
 540 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 541
 542 /* For programs that want to be strictly Unicode compatible by dying if any
 543  * attempt is made to match a non-Unicode code point against a Unicode
 544  * property.  */
 545 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 546
 547 #define OOB_NAMEDCLASS          -1
 548
 549 /* There is no code point that is out-of-bounds, so this is problematic.  But
 550  * its only current use is to initialize a variable that is always set before
 551  * looked at. */
 552 #define OOB_UNICODE             0xDEADBEEF
 553
 554 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 555
 556
 557 /* length of regex to show in messages that don't mark a position within */
 558 #define RegexLengthToShowInErrorMessages 127
 559
 560 /*
 561  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 562  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 563  * op/pragma/warn/regcomp.
 564  */
 565 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 566 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 567
 568 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 569                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
 570
 571 /* The code in this file in places uses one level of recursion with parsing
 572  * rebased to an alternate string constructed by us in memory.  This can take
 573  * the form of something that is completely different from the input, or
 574  * something that uses the input as part of the alternate.  In the first case,
 575  * there should be no possibility of an error, as we are in complete control of
 576  * the alternate string.  But in the second case we don't control the input
 577  * portion, so there may be errors in that.  Here's an example:
 578  *      /[abc\x{DF}def]/ui
 579  * is handled specially because \x{df} folds to a sequence of more than one
 580  * character, 'ss'.  What is done is to create and parse an alternate string,
 581  * which looks like this:
 582  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 583  * where it uses the input unchanged in the middle of something it constructs,
 584  * which is a branch for the DF outside the character class, and clustering
 585  * parens around the whole thing. (It knows enough to skip the DF inside the
 586  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 587  * need to be reported.  The general situation looks like this:
 588  *
 589  *              sI                       tI               xI       eI
 590  * Input:       ----------------------------------------------------
 591  * Constructed:         ---------------------------------------------------
 592  *                      sC               tC               xC       eC     EC
 593  *
 594  * The input string sI..eI is the input pattern.  The string sC..EC is the
 595  * constructed substitute parse string.  The portions sC..tC and eC..EC are
 596  * constructed by us.  The portion tC..eC is an exact duplicate of the input
 597  * pattern tI..eI.  In the diagram, these are vertically aligned.  Suppose that
 598  * while parsing, we find an error at xC.  We want to display a message showing
 599  * the real input string.  Thus we need to find the point xI in it which
 600  * corresponds to xC.  xC >= tC, since the portion of the string sC..tC has
 601  * been constructed by us, and so shouldn't have errors.  We get:
 602  *
 603  *      xI = sI + (tI - sI) + (xC - tC)
 604  *
 605  * and, the offset into sI is:
 606  *
 607  *      (xI - sI) = (tI - sI) + (xC - tC)
 608  *
 609  * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
 610  * and we save tC as RExC_adjusted_start.
 611  *
 612  * During normal processing of the input pattern, everything points to that,
 613  * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
 614  */
 615
 616 #define tI_sI           RExC_precomp_adj
 617 #define tC              RExC_adjusted_start
 618 #define sC              RExC_precomp
 619 #define xI_offset(xC)   ((IV) (tI_sI + (xC - tC)))
 620 #define xI(xC)          (sC + xI_offset(xC))
 621 #define eC              RExC_precomp_end
 622
 623 #define REPORT_LOCATION_ARGS(xC)                                            \
 624     UTF8fARG(UTF,                                                           \
 625              (xI(xC) > eC) /* Don't run off end */                          \
 626               ? eC - sC   /* Length before the <--HERE */                   \
 627               : xI_offset(xC),                                              \
 628              sC),         /* The input pattern printed up to the <--HERE */ \
 629     UTF8fARG(UTF,                                                           \
 630              (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */    \
 631              (xI(xC) > eC) ? eC : xI(xC))     /* pattern after <--HERE */
 632
 633 /* Used to point after bad bytes for an error message, but avoid skipping
 634  * past a nul byte. */
 635 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
 636
 637 /*
 638  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 639  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 640  * "...".
 641  */
 642 #define _FAIL(code) STMT_START {                                        \
 643     const char *ellipses = "";                                          \
 644     IV len = RExC_precomp_end - RExC_precomp;                                   \
 645                                                                         \
 646     if (!SIZE_ONLY)                                                     \
 647         SAVEFREESV(RExC_rx_sv);                                         \
 648     if (len > RegexLengthToShowInErrorMessages) {                       \
 649         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 650         len = RegexLengthToShowInErrorMessages - 10;                    \
 651         ellipses = "...";                                               \
 652     }                                                                   \
 653     code;                                                               \
 654 } STMT_END
 655
 656 #define FAIL(msg) _FAIL(                            \
 657     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
 658             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 659
 660 #define FAIL2(msg,arg) _FAIL(                       \
 661     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 662             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 663
 664 /*
 665  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 666  */
 667 #define Simple_vFAIL(m) STMT_START {                                    \
 668     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 669             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 670 } STMT_END
 671
 672 /*
 673  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 674  */
 675 #define vFAIL(m) STMT_START {                           \
 676     if (!SIZE_ONLY)                                     \
 677         SAVEFREESV(RExC_rx_sv);                         \
 678     Simple_vFAIL(m);                                    \
 679 } STMT_END
 680
 681 /*
 682  * Like Simple_vFAIL(), but accepts two arguments.
 683  */
 684 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 685     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,              \
 686                       REPORT_LOCATION_ARGS(RExC_parse));        \
 687 } STMT_END
 688
 689 /*
 690  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 691  */
 692 #define vFAIL2(m,a1) STMT_START {                       \
 693     if (!SIZE_ONLY)                                     \
 694         SAVEFREESV(RExC_rx_sv);                         \
 695     Simple_vFAIL2(m, a1);                               \
 696 } STMT_END
 697
 698
 699 /*
 700  * Like Simple_vFAIL(), but accepts three arguments.
 701  */
 702 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 703     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 704             REPORT_LOCATION_ARGS(RExC_parse));                  \
 705 } STMT_END
 706
 707 /*
 708  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 709  */
 710 #define vFAIL3(m,a1,a2) STMT_START {                    \
 711     if (!SIZE_ONLY)                                     \
 712         SAVEFREESV(RExC_rx_sv);                         \
 713     Simple_vFAIL3(m, a1, a2);                           \
 714 } STMT_END
 715
 716 /*
 717  * Like Simple_vFAIL(), but accepts four arguments.
 718  */
 719 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 720     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,      \
 721             REPORT_LOCATION_ARGS(RExC_parse));                  \
 722 } STMT_END
 723
 724 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 725     if (!SIZE_ONLY)                                     \
 726         SAVEFREESV(RExC_rx_sv);                         \
 727     Simple_vFAIL4(m, a1, a2, a3);                       \
 728 } STMT_END
 729
 730 /* A specialized version of vFAIL2 that works with UTF8f */
 731 #define vFAIL2utf8f(m, a1) STMT_START {             \
 732     if (!SIZE_ONLY)                                 \
 733         SAVEFREESV(RExC_rx_sv);                     \
 734     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
 735             REPORT_LOCATION_ARGS(RExC_parse));      \
 736 } STMT_END
 737
 738 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 739     if (!SIZE_ONLY)                                     \
 740         SAVEFREESV(RExC_rx_sv);                         \
 741     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 742             REPORT_LOCATION_ARGS(RExC_parse));          \
 743 } STMT_END
 744
 745 /* These have asserts in them because of [perl #122671] Many warnings in
 746  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 747  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 748  * would get output again.  So they should be output in pass2, and these
 749  * asserts make sure new warnings follow that paradigm. */
 750
 751 /* m is not necessarily a "literal string", in this macro */
 752 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 753     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 754                                        "%s" REPORT_LOCATION,            \
 755                                   m, REPORT_LOCATION_ARGS(loc));        \
 756 } STMT_END
 757
 758 #define ckWARNreg(loc,m) STMT_START {                                   \
 759     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),        \
 760                                           m REPORT_LOCATION,            \
 761                                           REPORT_LOCATION_ARGS(loc));   \
 762 } STMT_END
 763
 764 #define vWARN(loc, m) STMT_START {                                      \
 765     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 766                                        m REPORT_LOCATION,               \
 767                                        REPORT_LOCATION_ARGS(loc));      \
 768 } STMT_END
 769
 770 #define vWARN_dep(loc, m) STMT_START {                                  \
 771     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),       \
 772                                        m REPORT_LOCATION,               \
 773                                        REPORT_LOCATION_ARGS(loc));      \
 774 } STMT_END
 775
 776 #define ckWARNdep(loc,m) STMT_START {                                   \
 777     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),  \
 778                                             m REPORT_LOCATION,          \
 779                                             REPORT_LOCATION_ARGS(loc)); \
 780 } STMT_END
 781
 782 #define ckWARNregdep(loc,m) STMT_START {                                    \
 783     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,      \
 784                                                       WARN_REGEXP),         \
 785                                              m REPORT_LOCATION,             \
 786                                              REPORT_LOCATION_ARGS(loc));    \
 787 } STMT_END
 788
 789 #define ckWARN2reg_d(loc,m, a1) STMT_START {                                \
 790     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),          \
 791                                             m REPORT_LOCATION,              \
 792                                             a1, REPORT_LOCATION_ARGS(loc)); \
 793 } STMT_END
 794
 795 #define ckWARN2reg(loc, m, a1) STMT_START {                                 \
 796     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
 797                                           m REPORT_LOCATION,                \
 798                                           a1, REPORT_LOCATION_ARGS(loc));   \
 799 } STMT_END
 800
 801 #define vWARN3(loc, m, a1, a2) STMT_START {                                 \
 802     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),               \
 803                                        m REPORT_LOCATION,                   \
 804                                        a1, a2, REPORT_LOCATION_ARGS(loc));  \
 805 } STMT_END
 806
 807 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                             \
 808     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
 809                                           m REPORT_LOCATION,                \
 810                                           a1, a2,                           \
 811                                           REPORT_LOCATION_ARGS(loc));       \
 812 } STMT_END
 813
 814 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 815     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 816                                        m REPORT_LOCATION,               \
 817                                        a1, a2, a3,                      \
 818                                        REPORT_LOCATION_ARGS(loc));      \
 819 } STMT_END
 820
 821 #define vWARN4dep(loc, m, a1, a2, a3) STMT_START {                             \
 822     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN2(WARN_REGEXP,WARN_DEPRECATED), \
 823                                        m REPORT_LOCATION,                      \
 824                                        a1, a2, a3,                             \
 825                                        REPORT_LOCATION_ARGS(loc));             \
 826 } STMT_END
 827
 828 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 829     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),        \
 830                                           m REPORT_LOCATION,            \
 831                                           a1, a2, a3,                   \
 832                                           REPORT_LOCATION_ARGS(loc));   \
 833 } STMT_END
 834
 835 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 836     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 837                                        m REPORT_LOCATION,               \
 838                                        a1, a2, a3, a4,                  \
 839                                        REPORT_LOCATION_ARGS(loc));      \
 840 } STMT_END
 841
 842 /* Macros for recording node offsets.   20001227 mjd@plover.com
 843  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 844  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 845  * Element 0 holds the number n.
 846  * Position is 1 indexed.
 847  */
 848 #ifndef RE_TRACK_PATTERN_OFFSETS
 849 #define Set_Node_Offset_To_R(node,byte)
 850 #define Set_Node_Offset(node,byte)
 851 #define Set_Cur_Node_Offset
 852 #define Set_Node_Length_To_R(node,len)
 853 #define Set_Node_Length(node,len)
 854 #define Set_Node_Cur_Length(node,start)
 855 #define Node_Offset(n)
 856 #define Node_Length(n)
 857 #define Set_Node_Offset_Length(node,offset,len)
 858 #define ProgLen(ri) ri->u.proglen
 859 #define SetProgLen(ri,x) ri->u.proglen = x
 860 #else
 861 #define ProgLen(ri) ri->u.offsets[0]
 862 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 863 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 864     if (! SIZE_ONLY) {                                                  \
 865         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 866                     __LINE__, (int)(node), (int)(byte)));               \
 867         if((node) < 0) {                                                \
 868             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 869                                          (int)(node));                  \
 870         } else {                                                        \
 871             RExC_offsets[2*(node)-1] = (byte);                          \
 872         }                                                               \
 873     }                                                                   \
 874 } STMT_END
 875
 876 #define Set_Node_Offset(node,byte) \
 877     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 878 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 879
 880 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 881     if (! SIZE_ONLY) {                                                  \
 882         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 883                 __LINE__, (int)(node), (int)(len)));                    \
 884         if((node) < 0) {                                                \
 885             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 886                                          (int)(node));                  \
 887         } else {                                                        \
 888             RExC_offsets[2*(node)] = (len);                             \
 889         }                                                               \
 890     }                                                                   \
 891 } STMT_END
 892
 893 #define Set_Node_Length(node,len) \
 894     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 895 #define Set_Node_Cur_Length(node, start)                \
 896     Set_Node_Length(node, RExC_parse - start)
 897
 898 /* Get offsets and lengths */
 899 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 900 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 901
 902 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 903     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 904     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 905 } STMT_END
 906 #endif
 907
 908 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 909 #define EXPERIMENTAL_INPLACESCAN
 910 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 911
 912 #ifdef DEBUGGING
 913 int
 914 Perl_re_printf(pTHX_ const char *fmt, ...)
 915 {
 916     va_list ap;
 917     int result;
 918     PerlIO *f= Perl_debug_log;
 919     PERL_ARGS_ASSERT_RE_PRINTF;
 920     va_start(ap, fmt);
 921     result = PerlIO_vprintf(f, fmt, ap);
 922     va_end(ap);
 923     return result;
 924 }
 925
 926 int
 927 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
 928 {
 929     va_list ap;
 930     int result;
 931     PerlIO *f= Perl_debug_log;
 932     PERL_ARGS_ASSERT_RE_INDENTF;
 933     va_start(ap, depth);
 934     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
 935     result = PerlIO_vprintf(f, fmt, ap);
 936     va_end(ap);
 937     return result;
 938 }
 939 #endif /* DEBUGGING */
 940
 941 #define DEBUG_RExC_seen()                                                   \
 942         DEBUG_OPTIMISE_MORE_r({                                             \
 943             Perl_re_printf( aTHX_ "RExC_seen: ");                                       \
 944                                                                             \
 945             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 946                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                            \
 947                                                                             \
 948             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 949                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");                          \
 950                                                                             \
 951             if (RExC_seen & REG_GPOS_SEEN)                                  \
 952                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                                \
 953                                                                             \
 954             if (RExC_seen & REG_RECURSE_SEEN)                               \
 955                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                             \
 956                                                                             \
 957             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
 958                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");                  \
 959                                                                             \
 960             if (RExC_seen & REG_VERBARG_SEEN)                               \
 961                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                             \
 962                                                                             \
 963             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 964                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                            \
 965                                                                             \
 966             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 967                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");                      \
 968                                                                             \
 969             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 970                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");                      \
 971                                                                             \
 972             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
 973                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");                \
 974                                                                             \
 975             Perl_re_printf( aTHX_ "\n");                                                \
 976         });
 977
 978 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 979   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
 980
 981 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str)                    \
 982     if ( ( flags ) ) {                                                      \
 983         Perl_re_printf( aTHX_  "%s", open_str);                                         \
 984         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL);                     \
 985         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL);                     \
 986         DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF);                             \
 987         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR);                            \
 988         DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR);                             \
 989         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL);                           \
 990         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR);                         \
 991         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND);                    \
 992         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR);                     \
 993         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS);                        \
 994         DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS);                \
 995         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY);                      \
 996         DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT);                       \
 997         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY);                \
 998         DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE);                         \
 999         Perl_re_printf( aTHX_  "%s", close_str);                                        \
1000     }
1001
1002
1003 #define DEBUG_STUDYDATA(str,data,depth)                              \
1004 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
1005     Perl_re_indentf( aTHX_  "" str "Pos:%" IVdf "/%" IVdf            \
1006         " Flags: 0x%" UVXf,                                          \
1007         depth,                                                       \
1008         (IV)((data)->pos_min),                                       \
1009         (IV)((data)->pos_delta),                                     \
1010         (UV)((data)->flags)                                          \
1011     );                                                               \
1012     DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]");                 \
1013     Perl_re_printf( aTHX_                                            \
1014         " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",                    \
1015         (IV)((data)->whilem_c),                                      \
1016         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
1017         is_inf ? "INF " : ""                                         \
1018     );                                                               \
1019     if ((data)->last_found)                                          \
1020         Perl_re_printf( aTHX_                                        \
1021             "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf                   \
1022             " %sFixed:'%s' @ %" IVdf                                 \
1023             " %sFloat: '%s' @ %" IVdf "/%" IVdf,                     \
1024             SvPVX_const((data)->last_found),                         \
1025             (IV)((data)->last_end),                                  \
1026             (IV)((data)->last_start_min),                            \
1027             (IV)((data)->last_start_max),                            \
1028             ((data)->longest &&                                      \
1029              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
1030             SvPVX_const((data)->longest_fixed),                      \
1031             (IV)((data)->offset_fixed),                              \
1032             ((data)->longest &&                                      \
1033              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
1034             SvPVX_const((data)->longest_float),                      \
1035             (IV)((data)->offset_float_min),                          \
1036             (IV)((data)->offset_float_max)                           \
1037         );                                                           \
1038     Perl_re_printf( aTHX_ "\n");                                                 \
1039 });
1040
1041
1042 /* =========================================================
1043  * BEGIN edit_distance stuff.
1044  *
1045  * This calculates how many single character changes of any type are needed to
1046  * transform a string into another one.  It is taken from version 3.1 of
1047  *
1048  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1049  */
1050
1051 /* Our unsorted dictionary linked list.   */
1052 /* Note we use UVs, not chars. */
1053
1054 struct dictionary{
1055   UV key;
1056   UV value;
1057   struct dictionary* next;
1058 };
1059 typedef struct dictionary item;
1060
1061
1062 PERL_STATIC_INLINE item*
1063 push(UV key,item* curr)
1064 {
1065     item* head;
1066     Newxz(head, 1, item);
1067     head->key = key;
1068     head->value = 0;
1069     head->next = curr;
1070     return head;
1071 }
1072
1073
1074 PERL_STATIC_INLINE item*
1075 find(item* head, UV key)
1076 {
1077     item* iterator = head;
1078     while (iterator){
1079         if (iterator->key == key){
1080             return iterator;
1081         }
1082         iterator = iterator->next;
1083     }
1084
1085     return NULL;
1086 }
1087
1088 PERL_STATIC_INLINE item*
1089 uniquePush(item* head,UV key)
1090 {
1091     item* iterator = head;
1092
1093     while (iterator){
1094         if (iterator->key == key) {
1095             return head;
1096         }
1097         iterator = iterator->next;
1098     }
1099
1100     return push(key,head);
1101 }
1102
1103 PERL_STATIC_INLINE void
1104 dict_free(item* head)
1105 {
1106     item* iterator = head;
1107
1108     while (iterator) {
1109         item* temp = iterator;
1110         iterator = iterator->next;
1111         Safefree(temp);
1112     }
1113
1114     head = NULL;
1115 }
1116
1117 /* End of Dictionary Stuff */
1118
1119 /* All calculations/work are done here */
1120 STATIC int
1121 S_edit_distance(const UV* src,
1122                 const UV* tgt,
1123                 const STRLEN x,             /* length of src[] */
1124                 const STRLEN y,             /* length of tgt[] */
1125                 const SSize_t maxDistance
1126 )
1127 {
1128     item *head = NULL;
1129     UV swapCount,swapScore,targetCharCount,i,j;
1130     UV *scores;
1131     UV score_ceil = x + y;
1132
1133     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1134
1135     /* intialize matrix start values */
1136     Newxz(scores, ( (x + 2) * (y + 2)), UV);
1137     scores[0] = score_ceil;
1138     scores[1 * (y + 2) + 0] = score_ceil;
1139     scores[0 * (y + 2) + 1] = score_ceil;
1140     scores[1 * (y + 2) + 1] = 0;
1141     head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1142
1143     /* work loops    */
1144     /* i = src index */
1145     /* j = tgt index */
1146     for (i=1;i<=x;i++) {
1147         if (i < x)
1148             head = uniquePush(head,src[i]);
1149         scores[(i+1) * (y + 2) + 1] = i;
1150         scores[(i+1) * (y + 2) + 0] = score_ceil;
1151         swapCount = 0;
1152
1153         for (j=1;j<=y;j++) {
1154             if (i == 1) {
1155                 if(j < y)
1156                 head = uniquePush(head,tgt[j]);
1157                 scores[1 * (y + 2) + (j + 1)] = j;
1158                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1159             }
1160
1161             targetCharCount = find(head,tgt[j-1])->value;
1162             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1163
1164             if (src[i-1] != tgt[j-1]){
1165                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1166             }
1167             else {
1168                 swapCount = j;
1169                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1170             }
1171         }
1172
1173         find(head,src[i-1])->value = i;
1174     }
1175
1176     {
1177         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1178         dict_free(head);
1179         Safefree(scores);
1180         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1181     }
1182 }
1183
1184 /* END of edit_distance() stuff
1185  * ========================================================= */
1186
1187 /* is c a control character for which we have a mnemonic? */
1188 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1189
1190 STATIC const char *
1191 S_cntrl_to_mnemonic(const U8 c)
1192 {
1193     /* Returns the mnemonic string that represents character 'c', if one
1194      * exists; NULL otherwise.  The only ones that exist for the purposes of
1195      * this routine are a few control characters */
1196
1197     switch (c) {
1198         case '\a':       return "\\a";
1199         case '\b':       return "\\b";
1200         case ESC_NATIVE: return "\\e";
1201         case '\f':       return "\\f";
1202         case '\n':       return "\\n";
1203         case '\r':       return "\\r";
1204         case '\t':       return "\\t";
1205     }
1206
1207     return NULL;
1208 }
1209
1210 /* Mark that we cannot extend a found fixed substring at this point.
1211    Update the longest found anchored substring and the longest found
1212    floating substrings if needed. */
1213
1214 STATIC void
1215 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1216                     SSize_t *minlenp, int is_inf)
1217 {
1218     const STRLEN l = CHR_SVLEN(data->last_found);
1219     const STRLEN old_l = CHR_SVLEN(*data->longest);
1220     GET_RE_DEBUG_FLAGS_DECL;
1221
1222     PERL_ARGS_ASSERT_SCAN_COMMIT;
1223
1224     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1225         SvSetMagicSV(*data->longest, data->last_found);
1226         if (*data->longest == data->longest_fixed) {
1227             data->offset_fixed = l ? data->last_start_min : data->pos_min;
1228             if (data->flags & SF_BEFORE_EOL)
1229                 data->flags
1230                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
1231             else
1232                 data->flags &= ~SF_FIX_BEFORE_EOL;
1233             data->minlen_fixed=minlenp;
1234             data->lookbehind_fixed=0;
1235         }
1236         else { /* *data->longest == data->longest_float */
1237             data->offset_float_min = l ? data->last_start_min : data->pos_min;
1238             data->offset_float_max = (l
1239                           ? data->last_start_max
1240                           : (data->pos_delta > SSize_t_MAX - data->pos_min
1241                                          ? SSize_t_MAX
1242                                          : data->pos_min + data->pos_delta));
1243             if (is_inf
1244                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1245                 data->offset_float_max = SSize_t_MAX;
1246             if (data->flags & SF_BEFORE_EOL)
1247                 data->flags
1248                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1249             else
1250                 data->flags &= ~SF_FL_BEFORE_EOL;
1251             data->minlen_float=minlenp;
1252             data->lookbehind_float=0;
1253         }
1254     }
1255     SvCUR_set(data->last_found, 0);
1256     {
1257         SV * const sv = data->last_found;
1258         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1259             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1260             if (mg)
1261                 mg->mg_len = 0;
1262         }
1263     }
1264     data->last_end = -1;
1265     data->flags &= ~SF_BEFORE_EOL;
1266     DEBUG_STUDYDATA("commit: ",data,0);
1267 }
1268
1269 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1270  * list that describes which code points it matches */
1271
1272 STATIC void
1273 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1274 {
1275     /* Set the SSC 'ssc' to match an empty string or any code point */
1276
1277     PERL_ARGS_ASSERT_SSC_ANYTHING;
1278
1279     assert(is_ANYOF_SYNTHETIC(ssc));
1280
1281     /* mortalize so won't leak */
1282     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1283     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1284 }
1285
1286 STATIC int
1287 S_ssc_is_anything(const regnode_ssc *ssc)
1288 {
1289     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1290      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1291      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1292      * in any way, so there's no point in using it */
1293
1294     UV start, end;
1295     bool ret;
1296
1297     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1298
1299     assert(is_ANYOF_SYNTHETIC(ssc));
1300
1301     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1302         return FALSE;
1303     }
1304
1305     /* See if the list consists solely of the range 0 - Infinity */
1306     invlist_iterinit(ssc->invlist);
1307     ret = invlist_iternext(ssc->invlist, &start, &end)
1308           && start == 0
1309           && end == UV_MAX;
1310
1311     invlist_iterfinish(ssc->invlist);
1312
1313     if (ret) {
1314         return TRUE;
1315     }
1316
1317     /* If e.g., both \w and \W are set, matches everything */
1318     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1319         int i;
1320         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1321             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1322                 return TRUE;
1323             }
1324         }
1325     }
1326
1327     return FALSE;
1328 }
1329
1330 STATIC void
1331 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1332 {
1333     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1334      * string, any code point, or any posix class under locale */
1335
1336     PERL_ARGS_ASSERT_SSC_INIT;
1337
1338     Zero(ssc, 1, regnode_ssc);
1339     set_ANYOF_SYNTHETIC(ssc);
1340     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1341     ssc_anything(ssc);
1342
1343     /* If any portion of the regex is to operate under locale rules that aren't
1344      * fully known at compile time, initialization includes it.  The reason
1345      * this isn't done for all regexes is that the optimizer was written under
1346      * the assumption that locale was all-or-nothing.  Given the complexity and
1347      * lack of documentation in the optimizer, and that there are inadequate
1348      * test cases for locale, many parts of it may not work properly, it is
1349      * safest to avoid locale unless necessary. */
1350     if (RExC_contains_locale) {
1351         ANYOF_POSIXL_SETALL(ssc);
1352     }
1353     else {
1354         ANYOF_POSIXL_ZERO(ssc);
1355     }
1356 }
1357
1358 STATIC int
1359 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1360                         const regnode_ssc *ssc)
1361 {
1362     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1363      * to the list of code points matched, and locale posix classes; hence does
1364      * not check its flags) */
1365
1366     UV start, end;
1367     bool ret;
1368
1369     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1370
1371     assert(is_ANYOF_SYNTHETIC(ssc));
1372
1373     invlist_iterinit(ssc->invlist);
1374     ret = invlist_iternext(ssc->invlist, &start, &end)
1375           && start == 0
1376           && end == UV_MAX;
1377
1378     invlist_iterfinish(ssc->invlist);
1379
1380     if (! ret) {
1381         return FALSE;
1382     }
1383
1384     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1385         return FALSE;
1386     }
1387
1388     return TRUE;
1389 }
1390
1391 STATIC SV*
1392 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1393                                const regnode_charclass* const node)
1394 {
1395     /* Returns a mortal inversion list defining which code points are matched
1396      * by 'node', which is of type ANYOF.  Handles complementing the result if
1397      * appropriate.  If some code points aren't knowable at this time, the
1398      * returned list must, and will, contain every code point that is a
1399      * possibility. */
1400
1401     SV* invlist = NULL;
1402     SV* only_utf8_locale_invlist = NULL;
1403     unsigned int i;
1404     const U32 n = ARG(node);
1405     bool new_node_has_latin1 = FALSE;
1406
1407     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1408
1409     /* Look at the data structure created by S_set_ANYOF_arg() */
1410     if (n != ANYOF_ONLY_HAS_BITMAP) {
1411         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1412         AV * const av = MUTABLE_AV(SvRV(rv));
1413         SV **const ary = AvARRAY(av);
1414         assert(RExC_rxi->data->what[n] == 's');
1415
1416         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1417             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1418         }
1419         else if (ary[0] && ary[0] != &PL_sv_undef) {
1420
1421             /* Here, no compile-time swash, and there are things that won't be
1422              * known until runtime -- we have to assume it could be anything */
1423             invlist = sv_2mortal(_new_invlist(1));
1424             return _add_range_to_invlist(invlist, 0, UV_MAX);
1425         }
1426         else if (ary[3] && ary[3] != &PL_sv_undef) {
1427
1428             /* Here no compile-time swash, and no run-time only data.  Use the
1429              * node's inversion list */
1430             invlist = sv_2mortal(invlist_clone(ary[3]));
1431         }
1432
1433         /* Get the code points valid only under UTF-8 locales */
1434         if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1435             && ary[2] && ary[2] != &PL_sv_undef)
1436         {
1437             only_utf8_locale_invlist = ary[2];
1438         }
1439     }
1440
1441     if (! invlist) {
1442         invlist = sv_2mortal(_new_invlist(0));
1443     }
1444
1445     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1446      * code points, and an inversion list for the others, but if there are code
1447      * points that should match only conditionally on the target string being
1448      * UTF-8, those are placed in the inversion list, and not the bitmap.
1449      * Since there are circumstances under which they could match, they are
1450      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1451      * to exclude them here, so that when we invert below, the end result
1452      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1453      * have to do this here before we add the unconditionally matched code
1454      * points */
1455     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1456         _invlist_intersection_complement_2nd(invlist,
1457                                              PL_UpperLatin1,
1458                                              &invlist);
1459     }
1460
1461     /* Add in the points from the bit map */
1462     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1463         if (ANYOF_BITMAP_TEST(node, i)) {
1464             unsigned int start = i++;
1465
1466             for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1467                 /* empty */
1468             }
1469             invlist = _add_range_to_invlist(invlist, start, i-1);
1470             new_node_has_latin1 = TRUE;
1471         }
1472     }
1473
1474     /* If this can match all upper Latin1 code points, have to add them
1475      * as well.  But don't add them if inverting, as when that gets done below,
1476      * it would exclude all these characters, including the ones it shouldn't
1477      * that were added just above */
1478     if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
1479         && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1480     {
1481         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1482     }
1483
1484     /* Similarly for these */
1485     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1486         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1487     }
1488
1489     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1490         _invlist_invert(invlist);
1491     }
1492     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1493
1494         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1495          * locale.  We can skip this if there are no 0-255 at all. */
1496         _invlist_union(invlist, PL_Latin1, &invlist);
1497     }
1498
1499     /* Similarly add the UTF-8 locale possible matches.  These have to be
1500      * deferred until after the non-UTF-8 locale ones are taken care of just
1501      * above, or it leads to wrong results under ANYOF_INVERT */
1502     if (only_utf8_locale_invlist) {
1503         _invlist_union_maybe_complement_2nd(invlist,
1504                                             only_utf8_locale_invlist,
1505                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1506                                             &invlist);
1507     }
1508
1509     return invlist;
1510 }
1511
1512 /* These two functions currently do the exact same thing */
1513 #define ssc_init_zero           ssc_init
1514
1515 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1516 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1517
1518 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1519  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1520  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1521
1522 STATIC void
1523 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1524                 const regnode_charclass *and_with)
1525 {
1526     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1527      * another SSC or a regular ANYOF class.  Can create false positives. */
1528
1529     SV* anded_cp_list;
1530     U8  anded_flags;
1531
1532     PERL_ARGS_ASSERT_SSC_AND;
1533
1534     assert(is_ANYOF_SYNTHETIC(ssc));
1535
1536     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1537      * the code point inversion list and just the relevant flags */
1538     if (is_ANYOF_SYNTHETIC(and_with)) {
1539         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1540         anded_flags = ANYOF_FLAGS(and_with);
1541
1542         /* XXX This is a kludge around what appears to be deficiencies in the
1543          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1544          * there are paths through the optimizer where it doesn't get weeded
1545          * out when it should.  And if we don't make some extra provision for
1546          * it like the code just below, it doesn't get added when it should.
1547          * This solution is to add it only when AND'ing, which is here, and
1548          * only when what is being AND'ed is the pristine, original node
1549          * matching anything.  Thus it is like adding it to ssc_anything() but
1550          * only when the result is to be AND'ed.  Probably the same solution
1551          * could be adopted for the same problem we have with /l matching,
1552          * which is solved differently in S_ssc_init(), and that would lead to
1553          * fewer false positives than that solution has.  But if this solution
1554          * creates bugs, the consequences are only that a warning isn't raised
1555          * that should be; while the consequences for having /l bugs is
1556          * incorrect matches */
1557         if (ssc_is_anything((regnode_ssc *)and_with)) {
1558             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1559         }
1560     }
1561     else {
1562         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1563         if (OP(and_with) == ANYOFD) {
1564             anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1565         }
1566         else {
1567             anded_flags = ANYOF_FLAGS(and_with)
1568             &( ANYOF_COMMON_FLAGS
1569               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1570               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1571             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1572                 anded_flags &=
1573                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1574             }
1575         }
1576     }
1577
1578     ANYOF_FLAGS(ssc) &= anded_flags;
1579
1580     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1581      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1582      * 'and_with' may be inverted.  When not inverted, we have the situation of
1583      * computing:
1584      *  (C1 | P1) & (C2 | P2)
1585      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1586      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1587      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1588      *                    <=  ((C1 & C2) | P1 | P2)
1589      * Alternatively, the last few steps could be:
1590      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1591      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1592      *                    <=  (C1 | C2 | (P1 & P2))
1593      * We favor the second approach if either P1 or P2 is non-empty.  This is
1594      * because these components are a barrier to doing optimizations, as what
1595      * they match cannot be known until the moment of matching as they are
1596      * dependent on the current locale, 'AND"ing them likely will reduce or
1597      * eliminate them.
1598      * But we can do better if we know that C1,P1 are in their initial state (a
1599      * frequent occurrence), each matching everything:
1600      *  (<everything>) & (C2 | P2) =  C2 | P2
1601      * Similarly, if C2,P2 are in their initial state (again a frequent
1602      * occurrence), the result is a no-op
1603      *  (C1 | P1) & (<everything>) =  C1 | P1
1604      *
1605      * Inverted, we have
1606      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1607      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1608      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1609      * */
1610
1611     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1612         && ! is_ANYOF_SYNTHETIC(and_with))
1613     {
1614         unsigned int i;
1615
1616         ssc_intersection(ssc,
1617                          anded_cp_list,
1618                          FALSE /* Has already been inverted */
1619                          );
1620
1621         /* If either P1 or P2 is empty, the intersection will be also; can skip
1622          * the loop */
1623         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1624             ANYOF_POSIXL_ZERO(ssc);
1625         }
1626         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1627
1628             /* Note that the Posix class component P from 'and_with' actually
1629              * looks like:
1630              *      P = Pa | Pb | ... | Pn
1631              * where each component is one posix class, such as in [\w\s].
1632              * Thus
1633              *      ~P = ~(Pa | Pb | ... | Pn)
1634              *         = ~Pa & ~Pb & ... & ~Pn
1635              *        <= ~Pa | ~Pb | ... | ~Pn
1636              * The last is something we can easily calculate, but unfortunately
1637              * is likely to have many false positives.  We could do better
1638              * in some (but certainly not all) instances if two classes in
1639              * P have known relationships.  For example
1640              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1641              * So
1642              *      :lower: & :print: = :lower:
1643              * And similarly for classes that must be disjoint.  For example,
1644              * since \s and \w can have no elements in common based on rules in
1645              * the POSIX standard,
1646              *      \w & ^\S = nothing
1647              * Unfortunately, some vendor locales do not meet the Posix
1648              * standard, in particular almost everything by Microsoft.
1649              * The loop below just changes e.g., \w into \W and vice versa */
1650
1651             regnode_charclass_posixl temp;
1652             int add = 1;    /* To calculate the index of the complement */
1653
1654             ANYOF_POSIXL_ZERO(&temp);
1655             for (i = 0; i < ANYOF_MAX; i++) {
1656                 assert(i % 2 != 0
1657                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1658                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1659
1660                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1661                     ANYOF_POSIXL_SET(&temp, i + add);
1662                 }
1663                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1664             }
1665             ANYOF_POSIXL_AND(&temp, ssc);
1666
1667         } /* else ssc already has no posixes */
1668     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1669          in its initial state */
1670     else if (! is_ANYOF_SYNTHETIC(and_with)
1671              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1672     {
1673         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1674          * copy it over 'ssc' */
1675         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1676             if (is_ANYOF_SYNTHETIC(and_with)) {
1677                 StructCopy(and_with, ssc, regnode_ssc);
1678             }
1679             else {
1680                 ssc->invlist = anded_cp_list;
1681                 ANYOF_POSIXL_ZERO(ssc);
1682                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1683                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1684                 }
1685             }
1686         }
1687         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1688                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1689         {
1690             /* One or the other of P1, P2 is non-empty. */
1691             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1692                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1693             }
1694             ssc_union(ssc, anded_cp_list, FALSE);
1695         }
1696         else { /* P1 = P2 = empty */
1697             ssc_intersection(ssc, anded_cp_list, FALSE);
1698         }
1699     }
1700 }
1701
1702 STATIC void
1703 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1704                const regnode_charclass *or_with)
1705 {
1706     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1707      * another SSC or a regular ANYOF class.  Can create false positives if
1708      * 'or_with' is to be inverted. */
1709
1710     SV* ored_cp_list;
1711     U8 ored_flags;
1712
1713     PERL_ARGS_ASSERT_SSC_OR;
1714
1715     assert(is_ANYOF_SYNTHETIC(ssc));
1716
1717     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1718      * the code point inversion list and just the relevant flags */
1719     if (is_ANYOF_SYNTHETIC(or_with)) {
1720         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1721         ored_flags = ANYOF_FLAGS(or_with);
1722     }
1723     else {
1724         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1725         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1726         if (OP(or_with) != ANYOFD) {
1727             ored_flags
1728             |= ANYOF_FLAGS(or_with)
1729              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1730                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1731             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1732                 ored_flags |=
1733                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1734             }
1735         }
1736     }
1737
1738     ANYOF_FLAGS(ssc) |= ored_flags;
1739
1740     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1741      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1742      * 'or_with' may be inverted.  When not inverted, we have the simple
1743      * situation of computing:
1744      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1745      * If P1|P2 yields a situation with both a class and its complement are
1746      * set, like having both \w and \W, this matches all code points, and we
1747      * can delete these from the P component of the ssc going forward.  XXX We
1748      * might be able to delete all the P components, but I (khw) am not certain
1749      * about this, and it is better to be safe.
1750      *
1751      * Inverted, we have
1752      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1753      *                         <=  (C1 | P1) | ~C2
1754      *                         <=  (C1 | ~C2) | P1
1755      * (which results in actually simpler code than the non-inverted case)
1756      * */
1757
1758     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1759         && ! is_ANYOF_SYNTHETIC(or_with))
1760     {
1761         /* We ignore P2, leaving P1 going forward */
1762     }   /* else  Not inverted */
1763     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1764         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1765         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1766             unsigned int i;
1767             for (i = 0; i < ANYOF_MAX; i += 2) {
1768                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1769                 {
1770                     ssc_match_all_cp(ssc);
1771                     ANYOF_POSIXL_CLEAR(ssc, i);
1772                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1773                 }
1774             }
1775         }
1776     }
1777
1778     ssc_union(ssc,
1779               ored_cp_list,
1780               FALSE /* Already has been inverted */
1781               );
1782 }
1783
1784 PERL_STATIC_INLINE void
1785 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1786 {
1787     PERL_ARGS_ASSERT_SSC_UNION;
1788
1789     assert(is_ANYOF_SYNTHETIC(ssc));
1790
1791     _invlist_union_maybe_complement_2nd(ssc->invlist,
1792                                         invlist,
1793                                         invert2nd,
1794                                         &ssc->invlist);
1795 }
1796
1797 PERL_STATIC_INLINE void
1798 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1799                          SV* const invlist,
1800                          const bool invert2nd)
1801 {
1802     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1803
1804     assert(is_ANYOF_SYNTHETIC(ssc));
1805
1806     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1807                                                invlist,
1808                                                invert2nd,
1809                                                &ssc->invlist);
1810 }
1811
1812 PERL_STATIC_INLINE void
1813 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1814 {
1815     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1816
1817     assert(is_ANYOF_SYNTHETIC(ssc));
1818
1819     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1820 }
1821
1822 PERL_STATIC_INLINE void
1823 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1824 {
1825     /* AND just the single code point 'cp' into the SSC 'ssc' */
1826
1827     SV* cp_list = _new_invlist(2);
1828
1829     PERL_ARGS_ASSERT_SSC_CP_AND;
1830
1831     assert(is_ANYOF_SYNTHETIC(ssc));
1832
1833     cp_list = add_cp_to_invlist(cp_list, cp);
1834     ssc_intersection(ssc, cp_list,
1835                      FALSE /* Not inverted */
1836                      );
1837     SvREFCNT_dec_NN(cp_list);
1838 }
1839
1840 PERL_STATIC_INLINE void
1841 S_ssc_clear_locale(regnode_ssc *ssc)
1842 {
1843     /* Set the SSC 'ssc' to not match any locale things */
1844     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1845
1846     assert(is_ANYOF_SYNTHETIC(ssc));
1847
1848     ANYOF_POSIXL_ZERO(ssc);
1849     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1850 }
1851
1852 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1853
1854 STATIC bool
1855 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1856 {
1857     /* The synthetic start class is used to hopefully quickly winnow down
1858      * places where a pattern could start a match in the target string.  If it
1859      * doesn't really narrow things down that much, there isn't much point to
1860      * having the overhead of using it.  This function uses some very crude
1861      * heuristics to decide if to use the ssc or not.
1862      *
1863      * It returns TRUE if 'ssc' rules out more than half what it considers to
1864      * be the "likely" possible matches, but of course it doesn't know what the
1865      * actual things being matched are going to be; these are only guesses
1866      *
1867      * For /l matches, it assumes that the only likely matches are going to be
1868      *      in the 0-255 range, uniformly distributed, so half of that is 127
1869      * For /a and /d matches, it assumes that the likely matches will be just
1870      *      the ASCII range, so half of that is 63
1871      * For /u and there isn't anything matching above the Latin1 range, it
1872      *      assumes that that is the only range likely to be matched, and uses
1873      *      half that as the cut-off: 127.  If anything matches above Latin1,
1874      *      it assumes that all of Unicode could match (uniformly), except for
1875      *      non-Unicode code points and things in the General Category "Other"
1876      *      (unassigned, private use, surrogates, controls and formats).  This
1877      *      is a much large number. */
1878
1879     U32 count = 0;      /* Running total of number of code points matched by
1880                            'ssc' */
1881     UV start, end;      /* Start and end points of current range in inversion
1882                            list */
1883     const U32 max_code_points = (LOC)
1884                                 ?  256
1885                                 : ((   ! UNI_SEMANTICS
1886                                      || invlist_highest(ssc->invlist) < 256)
1887                                   ? 128
1888                                   : NON_OTHER_COUNT);
1889     const U32 max_match = max_code_points / 2;
1890
1891     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1892
1893     invlist_iterinit(ssc->invlist);
1894     while (invlist_iternext(ssc->invlist, &start, &end)) {
1895         if (start >= max_code_points) {
1896             break;
1897         }
1898         end = MIN(end, max_code_points - 1);
1899         count += end - start + 1;
1900         if (count >= max_match) {
1901             invlist_iterfinish(ssc->invlist);
1902             return FALSE;
1903         }
1904     }
1905
1906     return TRUE;
1907 }
1908
1909
1910 STATIC void
1911 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1912 {
1913     /* The inversion list in the SSC is marked mortal; now we need a more
1914      * permanent copy, which is stored the same way that is done in a regular
1915      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1916      * map */
1917
1918     SV* invlist = invlist_clone(ssc->invlist);
1919
1920     PERL_ARGS_ASSERT_SSC_FINALIZE;
1921
1922     assert(is_ANYOF_SYNTHETIC(ssc));
1923
1924     /* The code in this file assumes that all but these flags aren't relevant
1925      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1926      * by the time we reach here */
1927     assert(! (ANYOF_FLAGS(ssc)
1928         & ~( ANYOF_COMMON_FLAGS
1929             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1930             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
1931
1932     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1933
1934     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1935                                 NULL, NULL, NULL, FALSE);
1936
1937     /* Make sure is clone-safe */
1938     ssc->invlist = NULL;
1939
1940     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1941         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1942     }
1943
1944     if (RExC_contains_locale) {
1945         OP(ssc) = ANYOFL;
1946     }
1947
1948     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1949 }
1950
1951 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1952 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1953 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1954 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1955                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1956                                : 0 )
1957
1958
1959 #ifdef DEBUGGING
1960 /*
1961    dump_trie(trie,widecharmap,revcharmap)
1962    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1963    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1964
1965    These routines dump out a trie in a somewhat readable format.
1966    The _interim_ variants are used for debugging the interim
1967    tables that are used to generate the final compressed
1968    representation which is what dump_trie expects.
1969
1970    Part of the reason for their existence is to provide a form
1971    of documentation as to how the different representations function.
1972
1973 */
1974
1975 /*
1976   Dumps the final compressed table form of the trie to Perl_debug_log.
1977   Used for debugging make_trie().
1978 */
1979
1980 STATIC void
1981 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1982             AV *revcharmap, U32 depth)
1983 {
1984     U32 state;
1985     SV *sv=sv_newmortal();
1986     int colwidth= widecharmap ? 6 : 4;
1987     U16 word;
1988     GET_RE_DEBUG_FLAGS_DECL;
1989
1990     PERL_ARGS_ASSERT_DUMP_TRIE;
1991
1992     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
1993         depth+1, "Match","Base","Ofs" );
1994
1995     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1996         SV ** const tmp = av_fetch( revcharmap, state, 0);
1997         if ( tmp ) {
1998             Perl_re_printf( aTHX_  "%*s",
1999                 colwidth,
2000                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2001                             PL_colors[0], PL_colors[1],
2002                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2003                             PERL_PV_ESCAPE_FIRSTCHAR
2004                 )
2005             );
2006         }
2007     }
2008     Perl_re_printf( aTHX_  "\n");
2009     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2010
2011     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2012         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2013     Perl_re_printf( aTHX_  "\n");
2014
2015     for( state = 1 ; state < trie->statecount ; state++ ) {
2016         const U32 base = trie->states[ state ].trans.base;
2017
2018         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2019
2020         if ( trie->states[ state ].wordnum ) {
2021             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2022         } else {
2023             Perl_re_printf( aTHX_  "%6s", "" );
2024         }
2025
2026         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2027
2028         if ( base ) {
2029             U32 ofs = 0;
2030
2031             while( ( base + ofs  < trie->uniquecharcount ) ||
2032                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2033                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2034                                                                     != state))
2035                     ofs++;
2036
2037             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2038
2039             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2040                 if ( ( base + ofs >= trie->uniquecharcount )
2041                         && ( base + ofs - trie->uniquecharcount
2042                                                         < trie->lasttrans )
2043                         && trie->trans[ base + ofs
2044                                     - trie->uniquecharcount ].check == state )
2045                 {
2046                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2047                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2048                    );
2049                 } else {
2050                     Perl_re_printf( aTHX_  "%*s",colwidth,"   ." );
2051                 }
2052             }
2053
2054             Perl_re_printf( aTHX_  "]");
2055
2056         }
2057         Perl_re_printf( aTHX_  "\n" );
2058     }
2059     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2060                                 depth);
2061     for (word=1; word <= trie->wordcount; word++) {
2062         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2063             (int)word, (int)(trie->wordinfo[word].prev),
2064             (int)(trie->wordinfo[word].len));
2065     }
2066     Perl_re_printf( aTHX_  "\n" );
2067 }
2068 /*
2069   Dumps a fully constructed but uncompressed trie in list form.
2070   List tries normally only are used for construction when the number of
2071   possible chars (trie->uniquecharcount) is very high.
2072   Used for debugging make_trie().
2073 */
2074 STATIC void
2075 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2076                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
2077                          U32 depth)
2078 {
2079     U32 state;
2080     SV *sv=sv_newmortal();
2081     int colwidth= widecharmap ? 6 : 4;
2082     GET_RE_DEBUG_FLAGS_DECL;
2083
2084     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2085
2086     /* print out the table precompression.  */
2087     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2088             depth+1 );
2089     Perl_re_indentf( aTHX_  "%s",
2090             depth+1, "------:-----+-----------------\n" );
2091
2092     for( state=1 ; state < next_alloc ; state ++ ) {
2093         U16 charid;
2094
2095         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2096             depth+1, (UV)state  );
2097         if ( ! trie->states[ state ].wordnum ) {
2098             Perl_re_printf( aTHX_  "%5s| ","");
2099         } else {
2100             Perl_re_printf( aTHX_  "W%4x| ",
2101                 trie->states[ state ].wordnum
2102             );
2103         }
2104         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2105             SV ** const tmp = av_fetch( revcharmap,
2106                                         TRIE_LIST_ITEM(state,charid).forid, 0);
2107             if ( tmp ) {
2108                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2109                     colwidth,
2110                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2111                               colwidth,
2112                               PL_colors[0], PL_colors[1],
2113                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2114                               | PERL_PV_ESCAPE_FIRSTCHAR
2115                     ) ,
2116                     TRIE_LIST_ITEM(state,charid).forid,
2117                     (UV)TRIE_LIST_ITEM(state,charid).newstate
2118                 );
2119                 if (!(charid % 10))
2120                     Perl_re_printf( aTHX_  "\n%*s| ",
2121                         (int)((depth * 2) + 14), "");
2122             }
2123         }
2124         Perl_re_printf( aTHX_  "\n");
2125     }
2126 }
2127
2128 /*
2129   Dumps a fully constructed but uncompressed trie in table form.
2130   This is the normal DFA style state transition table, with a few
2131   twists to facilitate compression later.
2132   Used for debugging make_trie().
2133 */
2134 STATIC void
2135 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2136                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
2137                           U32 depth)
2138 {
2139     U32 state;
2140     U16 charid;
2141     SV *sv=sv_newmortal();
2142     int colwidth= widecharmap ? 6 : 4;
2143     GET_RE_DEBUG_FLAGS_DECL;
2144
2145     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2146
2147     /*
2148        print out the table precompression so that we can do a visual check
2149        that they are identical.
2150      */
2151
2152     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2153
2154     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2155         SV ** const tmp = av_fetch( revcharmap, charid, 0);
2156         if ( tmp ) {
2157             Perl_re_printf( aTHX_  "%*s",
2158                 colwidth,
2159                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2160                             PL_colors[0], PL_colors[1],
2161                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2162                             PERL_PV_ESCAPE_FIRSTCHAR
2163                 )
2164             );
2165         }
2166     }
2167
2168     Perl_re_printf( aTHX_ "\n");
2169     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2170
2171     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2172         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2173     }
2174
2175     Perl_re_printf( aTHX_  "\n" );
2176
2177     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2178
2179         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2180             depth+1,
2181             (UV)TRIE_NODENUM( state ) );
2182
2183         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2184             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2185             if (v)
2186                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2187             else
2188                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2189         }
2190         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2191             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2192                                             (UV)trie->trans[ state ].check );
2193         } else {
2194             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2195                                             (UV)trie->trans[ state ].check,
2196             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2197         }
2198     }
2199 }
2200
2201 #endif
2202
2203
2204 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2205   startbranch: the first branch in the whole branch sequence
2206   first      : start branch of sequence of branch-exact nodes.
2207                May be the same as startbranch
2208   last       : Thing following the last branch.
2209                May be the same as tail.
2210   tail       : item following the branch sequence
2211   count      : words in the sequence
2212   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2213   depth      : indent depth
2214
2215 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2216
2217 A trie is an N'ary tree where the branches are determined by digital
2218 decomposition of the key. IE, at the root node you look up the 1st character and
2219 follow that branch repeat until you find the end of the branches. Nodes can be
2220 marked as "accepting" meaning they represent a complete word. Eg:
2221
2222   /he|she|his|hers/
2223
2224 would convert into the following structure. Numbers represent states, letters
2225 following numbers represent valid transitions on the letter from that state, if
2226 the number is in square brackets it represents an accepting state, otherwise it
2227 will be in parenthesis.
2228
2229       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2230       |    |
2231       |   (2)
2232       |    |
2233      (1)   +-i->(6)-+-s->[7]
2234       |
2235       +-s->(3)-+-h->(4)-+-e->[5]
2236
2237       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2238
2239 This shows that when matching against the string 'hers' we will begin at state 1
2240 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2241 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2242 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2243 single traverse. We store a mapping from accepting to state to which word was
2244 matched, and then when we have multiple possibilities we try to complete the
2245 rest of the regex in the order in which they occurred in the alternation.
2246
2247 The only prior NFA like behaviour that would be changed by the TRIE support is
2248 the silent ignoring of duplicate alternations which are of the form:
2249
2250  / (DUPE|DUPE) X? (?{ ... }) Y /x
2251
2252 Thus EVAL blocks following a trie may be called a different number of times with
2253 and without the optimisation. With the optimisations dupes will be silently
2254 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2255 the following demonstrates:
2256
2257  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2258
2259 which prints out 'word' three times, but
2260
2261  'words'=~/(word|word|word)(?{ print $1 })S/
2262
2263 which doesnt print it out at all. This is due to other optimisations kicking in.
2264
2265 Example of what happens on a structural level:
2266
2267 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2268
2269    1: CURLYM[1] {1,32767}(18)
2270    5:   BRANCH(8)
2271    6:     EXACT <ac>(16)
2272    8:   BRANCH(11)
2273    9:     EXACT <ad>(16)
2274   11:   BRANCH(14)
2275   12:     EXACT <ab>(16)
2276   16:   SUCCEED(0)
2277   17:   NOTHING(18)
2278   18: END(0)
2279
2280 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2281 and should turn into:
2282
2283    1: CURLYM[1] {1,32767}(18)
2284    5:   TRIE(16)
2285         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2286           <ac>
2287           <ad>
2288           <ab>
2289   16:   SUCCEED(0)
2290   17:   NOTHING(18)
2291   18: END(0)
2292
2293 Cases where tail != last would be like /(?foo|bar)baz/:
2294
2295    1: BRANCH(4)
2296    2:   EXACT <foo>(8)
2297    4: BRANCH(7)
2298    5:   EXACT <bar>(8)
2299    7: TAIL(8)
2300    8: EXACT <baz>(10)
2301   10: END(0)
2302
2303 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2304 and would end up looking like:
2305
2306     1: TRIE(8)
2307       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2308         <foo>
2309         <bar>
2310    7: TAIL(8)
2311    8: EXACT <baz>(10)
2312   10: END(0)
2313
2314     d = uvchr_to_utf8_flags(d, uv, 0);
2315
2316 is the recommended Unicode-aware way of saying
2317
2318     *(d++) = uv;
2319 */
2320
2321 #define TRIE_STORE_REVCHAR(val)                                            \
2322     STMT_START {                                                           \
2323         if (UTF) {                                                         \
2324             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2325             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2326             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2327             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2328             SvPOK_on(zlopp);                                               \
2329             SvUTF8_on(zlopp);                                              \
2330             av_push(revcharmap, zlopp);                                    \
2331         } else {                                                           \
2332             char ooooff = (char)val;                                           \
2333             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2334         }                                                                  \
2335         } STMT_END
2336
2337 /* This gets the next character from the input, folding it if not already
2338  * folded. */
2339 #define TRIE_READ_CHAR STMT_START {                                           \
2340     wordlen++;                                                                \
2341     if ( UTF ) {                                                              \
2342         /* if it is UTF then it is either already folded, or does not need    \
2343          * folding */                                                         \
2344         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2345     }                                                                         \
2346     else if (folder == PL_fold_latin1) {                                      \
2347         /* This folder implies Unicode rules, which in the range expressible  \
2348          *  by not UTF is the lower case, with the two exceptions, one of     \
2349          *  which should have been taken care of before calling this */       \
2350         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2351         uvc = toLOWER_L1(*uc);                                                \
2352         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2353         len = 1;                                                              \
2354     } else {                                                                  \
2355         /* raw data, will be folded later if needed */                        \
2356         uvc = (U32)*uc;                                                       \
2357         len = 1;                                                              \
2358     }                                                                         \
2359 } STMT_END
2360
2361
2362
2363 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2364     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2365         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
2366         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2367     }                                                           \
2368     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2369     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2370     TRIE_LIST_CUR( state )++;                                   \
2371 } STMT_END
2372
2373 #define TRIE_LIST_NEW(state) STMT_START {                       \
2374     Newxz( trie->states[ state ].trans.list,               \
2375         4, reg_trie_trans_le );                                 \
2376      TRIE_LIST_CUR( state ) = 1;                                \
2377      TRIE_LIST_LEN( state ) = 4;                                \
2378 } STMT_END
2379
2380 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2381     U16 dupe= trie->states[ state ].wordnum;                    \
2382     regnode * const noper_next = regnext( noper );              \
2383                                                                 \
2384     DEBUG_r({                                                   \
2385         /* store the word for dumping */                        \
2386         SV* tmp;                                                \
2387         if (OP(noper) != NOTHING)                               \
2388             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2389         else                                                    \
2390             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2391         av_push( trie_words, tmp );                             \
2392     });                                                         \
2393                                                                 \
2394     curword++;                                                  \
2395     trie->wordinfo[curword].prev   = 0;                         \
2396     trie->wordinfo[curword].len    = wordlen;                   \
2397     trie->wordinfo[curword].accept = state;                     \
2398                                                                 \
2399     if ( noper_next < tail ) {                                  \
2400         if (!trie->jump)                                        \
2401             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2402                                                  sizeof(U16) ); \
2403         trie->jump[curword] = (U16)(noper_next - convert);      \
2404         if (!jumper)                                            \
2405             jumper = noper_next;                                \
2406         if (!nextbranch)                                        \
2407             nextbranch= regnext(cur);                           \
2408     }                                                           \
2409                                                                 \
2410     if ( dupe ) {                                               \
2411         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2412         /* chain, so that when the bits of chain are later    */\
2413         /* linked together, the dups appear in the chain      */\
2414         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2415         trie->wordinfo[dupe].prev = curword;                    \
2416     } else {                                                    \
2417         /* we haven't inserted this word yet.                */ \
2418         trie->states[ state ].wordnum = curword;                \
2419     }                                                           \
2420 } STMT_END
2421
2422
2423 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2424      ( ( base + charid >=  ucharcount                                   \
2425          && base + charid < ubound                                      \
2426          && state == trie->trans[ base - ucharcount + charid ].check    \
2427          && trie->trans[ base - ucharcount + charid ].next )            \
2428            ? trie->trans[ base - ucharcount + charid ].next             \
2429            : ( state==1 ? special : 0 )                                 \
2430       )
2431
2432 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2433 STMT_START {                                                \
2434     TRIE_BITMAP_SET(trie, uvc);                             \
2435     /* store the folded codepoint */                        \
2436     if ( folder )                                           \
2437         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2438                                                             \
2439     if ( !UTF ) {                                           \
2440         /* store first byte of utf8 representation of */    \
2441         /* variant codepoints */                            \
2442         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2443             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2444         }                                                   \
2445     }                                                       \
2446 } STMT_END
2447 #define MADE_TRIE       1
2448 #define MADE_JUMP_TRIE  2
2449 #define MADE_EXACT_TRIE 4
2450
2451 STATIC I32
2452 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2453                   regnode *first, regnode *last, regnode *tail,
2454                   U32 word_count, U32 flags, U32 depth)
2455 {
2456     /* first pass, loop through and scan words */
2457     reg_trie_data *trie;
2458     HV *widecharmap = NULL;
2459     AV *revcharmap = newAV();
2460     regnode *cur;
2461     STRLEN len = 0;
2462     UV uvc = 0;
2463     U16 curword = 0;
2464     U32 next_alloc = 0;
2465     regnode *jumper = NULL;
2466     regnode *nextbranch = NULL;
2467     regnode *convert = NULL;
2468     U32 *prev_states; /* temp array mapping each state to previous one */
2469     /* we just use folder as a flag in utf8 */
2470     const U8 * folder = NULL;
2471
2472 #ifdef DEBUGGING
2473     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2474     AV *trie_words = NULL;
2475     /* along with revcharmap, this only used during construction but both are
2476      * useful during debugging so we store them in the struct when debugging.
2477      */
2478 #else
2479     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2480     STRLEN trie_charcount=0;
2481 #endif
2482     SV *re_trie_maxbuff;
2483     GET_RE_DEBUG_FLAGS_DECL;
2484
2485     PERL_ARGS_ASSERT_MAKE_TRIE;
2486 #ifndef DEBUGGING
2487     PERL_UNUSED_ARG(depth);
2488 #endif
2489
2490     switch (flags) {
2491         case EXACT: case EXACTL: break;
2492         case EXACTFA:
2493         case EXACTFU_SS:
2494         case EXACTFU:
2495         case EXACTFLU8: folder = PL_fold_latin1; break;
2496         case EXACTF:  folder = PL_fold; break;
2497         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2498     }
2499
2500     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2501     trie->refcount = 1;
2502     trie->startstate = 1;
2503     trie->wordcount = word_count;
2504     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2505     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2506     if (flags == EXACT || flags == EXACTL)
2507         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2508     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2509                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2510
2511     DEBUG_r({
2512         trie_words = newAV();
2513     });
2514
2515     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2516     assert(re_trie_maxbuff);
2517     if (!SvIOK(re_trie_maxbuff)) {
2518         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2519     }
2520     DEBUG_TRIE_COMPILE_r({
2521         Perl_re_indentf( aTHX_
2522           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2523           depth+1,
2524           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2525           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2526     });
2527
2528    /* Find the node we are going to overwrite */
2529     if ( first == startbranch && OP( last ) != BRANCH ) {
2530         /* whole branch chain */
2531         convert = first;
2532     } else {
2533         /* branch sub-chain */
2534         convert = NEXTOPER( first );
2535     }
2536
2537     /*  -- First loop and Setup --
2538
2539        We first traverse the branches and scan each word to determine if it
2540        contains widechars, and how many unique chars there are, this is
2541        important as we have to build a table with at least as many columns as we
2542        have unique chars.
2543
2544        We use an array of integers to represent the character codes 0..255
2545        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2546        the native representation of the character value as the key and IV's for
2547        the coded index.
2548
2549        *TODO* If we keep track of how many times each character is used we can
2550        remap the columns so that the table compression later on is more
2551        efficient in terms of memory by ensuring the most common value is in the
2552        middle and the least common are on the outside.  IMO this would be better
2553        than a most to least common mapping as theres a decent chance the most
2554        common letter will share a node with the least common, meaning the node
2555        will not be compressible. With a middle is most common approach the worst
2556        case is when we have the least common nodes twice.
2557
2558      */
2559
2560     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2561         regnode *noper = NEXTOPER( cur );
2562         const U8 *uc;
2563         const U8 *e;
2564         int foldlen = 0;
2565         U32 wordlen      = 0;         /* required init */
2566         STRLEN minchars = 0;
2567         STRLEN maxchars = 0;
2568         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2569                                                bitmap?*/
2570
2571         if (OP(noper) == NOTHING) {
2572             /* skip past a NOTHING at the start of an alternation
2573              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2574              */
2575             regnode *noper_next= regnext(noper);
2576             if (noper_next < tail)
2577                 noper= noper_next;
2578         }
2579
2580         if ( noper < tail &&
2581                 (
2582                     OP(noper) == flags ||
2583                     (
2584                         flags == EXACTFU &&
2585                         OP(noper) == EXACTFU_SS
2586                     )
2587                 )
2588         ) {
2589             uc= (U8*)STRING(noper);
2590             e= uc + STR_LEN(noper);
2591         } else {
2592             trie->minlen= 0;
2593             continue;
2594         }
2595
2596
2597         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2598             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2599                                           regardless of encoding */
2600             if (OP( noper ) == EXACTFU_SS) {
2601                 /* false positives are ok, so just set this */
2602                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2603             }
2604         }
2605
2606         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2607                                            branch */
2608             TRIE_CHARCOUNT(trie)++;
2609             TRIE_READ_CHAR;
2610
2611             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2612              * is in effect.  Under /i, this character can match itself, or
2613              * anything that folds to it.  If not under /i, it can match just
2614              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2615              * all fold to k, and all are single characters.   But some folds
2616              * expand to more than one character, so for example LATIN SMALL
2617              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2618              * the string beginning at 'uc' is 'ffi', it could be matched by
2619              * three characters, or just by the one ligature character. (It
2620              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2621              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2622              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2623              * match.)  The trie needs to know the minimum and maximum number
2624              * of characters that could match so that it can use size alone to
2625              * quickly reject many match attempts.  The max is simple: it is
2626              * the number of folded characters in this branch (since a fold is
2627              * never shorter than what folds to it. */
2628
2629             maxchars++;
2630
2631             /* And the min is equal to the max if not under /i (indicated by
2632              * 'folder' being NULL), or there are no multi-character folds.  If
2633              * there is a multi-character fold, the min is incremented just
2634              * once, for the character that folds to the sequence.  Each
2635              * character in the sequence needs to be added to the list below of
2636              * characters in the trie, but we count only the first towards the
2637              * min number of characters needed.  This is done through the
2638              * variable 'foldlen', which is returned by the macros that look
2639              * for these sequences as the number of bytes the sequence
2640              * occupies.  Each time through the loop, we decrement 'foldlen' by
2641              * how many bytes the current char occupies.  Only when it reaches
2642              * 0 do we increment 'minchars' or look for another multi-character
2643              * sequence. */
2644             if (folder == NULL) {
2645                 minchars++;
2646             }
2647             else if (foldlen > 0) {
2648                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2649             }
2650             else {
2651                 minchars++;
2652
2653                 /* See if *uc is the beginning of a multi-character fold.  If
2654                  * so, we decrement the length remaining to look at, to account
2655                  * for the current character this iteration.  (We can use 'uc'
2656                  * instead of the fold returned by TRIE_READ_CHAR because for
2657                  * non-UTF, the latin1_safe macro is smart enough to account
2658                  * for all the unfolded characters, and because for UTF, the
2659                  * string will already have been folded earlier in the
2660                  * compilation process */
2661                 if (UTF) {
2662                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2663                         foldlen -= UTF8SKIP(uc);
2664                     }
2665                 }
2666                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2667                     foldlen--;
2668                 }
2669             }
2670
2671             /* The current character (and any potential folds) should be added
2672              * to the possible matching characters for this position in this
2673              * branch */
2674             if ( uvc < 256 ) {
2675                 if ( folder ) {
2676                     U8 folded= folder[ (U8) uvc ];
2677                     if ( !trie->charmap[ folded ] ) {
2678                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2679                         TRIE_STORE_REVCHAR( folded );
2680                     }
2681                 }
2682                 if ( !trie->charmap[ uvc ] ) {
2683                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2684                     TRIE_STORE_REVCHAR( uvc );
2685                 }
2686                 if ( set_bit ) {
2687                     /* store the codepoint in the bitmap, and its folded
2688                      * equivalent. */
2689                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2690                     set_bit = 0; /* We've done our bit :-) */
2691                 }
2692             } else {
2693
2694                 /* XXX We could come up with the list of code points that fold
2695                  * to this using PL_utf8_foldclosures, except not for
2696                  * multi-char folds, as there may be multiple combinations
2697                  * there that could work, which needs to wait until runtime to
2698                  * resolve (The comment about LIGATURE FFI above is such an
2699                  * example */
2700
2701                 SV** svpp;
2702                 if ( !widecharmap )
2703                     widecharmap = newHV();
2704
2705                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2706
2707                 if ( !svpp )
2708                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2709
2710                 if ( !SvTRUE( *svpp ) ) {
2711                     sv_setiv( *svpp, ++trie->uniquecharcount );
2712                     TRIE_STORE_REVCHAR(uvc);
2713                 }
2714             }
2715         } /* end loop through characters in this branch of the trie */
2716
2717         /* We take the min and max for this branch and combine to find the min
2718          * and max for all branches processed so far */
2719         if( cur == first ) {
2720             trie->minlen = minchars;
2721             trie->maxlen = maxchars;
2722         } else if (minchars < trie->minlen) {
2723             trie->minlen = minchars;
2724         } else if (maxchars > trie->maxlen) {
2725             trie->maxlen = maxchars;
2726         }
2727     } /* end first pass */
2728     DEBUG_TRIE_COMPILE_r(
2729         Perl_re_indentf( aTHX_
2730                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2731                 depth+1,
2732                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2733                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2734                 (int)trie->minlen, (int)trie->maxlen )
2735     );
2736
2737     /*
2738         We now know what we are dealing with in terms of unique chars and
2739         string sizes so we can calculate how much memory a naive
2740         representation using a flat table  will take. If it's over a reasonable
2741         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2742         conservative but potentially much slower representation using an array
2743         of lists.
2744
2745         At the end we convert both representations into the same compressed
2746         form that will be used in regexec.c for matching with. The latter
2747         is a form that cannot be used to construct with but has memory
2748         properties similar to the list form and access properties similar
2749         to the table form making it both suitable for fast searches and
2750         small enough that its feasable to store for the duration of a program.
2751
2752         See the comment in the code where the compressed table is produced
2753         inplace from the flat tabe representation for an explanation of how
2754         the compression works.
2755
2756     */
2757
2758
2759     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2760     prev_states[1] = 0;
2761
2762     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2763                                                     > SvIV(re_trie_maxbuff) )
2764     {
2765         /*
2766             Second Pass -- Array Of Lists Representation
2767
2768             Each state will be represented by a list of charid:state records
2769             (reg_trie_trans_le) the first such element holds the CUR and LEN
2770             points of the allocated array. (See defines above).
2771
2772             We build the initial structure using the lists, and then convert
2773             it into the compressed table form which allows faster lookups
2774             (but cant be modified once converted).
2775         */
2776
2777         STRLEN transcount = 1;
2778
2779         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
2780             depth+1));
2781
2782         trie->states = (reg_trie_state *)
2783             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2784                                   sizeof(reg_trie_state) );
2785         TRIE_LIST_NEW(1);
2786         next_alloc = 2;
2787
2788         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2789
2790             regnode *noper   = NEXTOPER( cur );
2791             U32 state        = 1;         /* required init */
2792             U16 charid       = 0;         /* sanity init */
2793             U32 wordlen      = 0;         /* required init */
2794
2795             if (OP(noper) == NOTHING) {
2796                 regnode *noper_next= regnext(noper);
2797                 if (noper_next < tail)
2798                     noper= noper_next;
2799             }
2800
2801             if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2802                 const U8 *uc= (U8*)STRING(noper);
2803                 const U8 *e= uc + STR_LEN(noper);
2804
2805                 for ( ; uc < e ; uc += len ) {
2806
2807                     TRIE_READ_CHAR;
2808
2809                     if ( uvc < 256 ) {
2810                         charid = trie->charmap[ uvc ];
2811                     } else {
2812                         SV** const svpp = hv_fetch( widecharmap,
2813                                                     (char*)&uvc,
2814                                                     sizeof( UV ),
2815                                                     0);
2816                         if ( !svpp ) {
2817                             charid = 0;
2818                         } else {
2819                             charid=(U16)SvIV( *svpp );
2820                         }
2821                     }
2822                     /* charid is now 0 if we dont know the char read, or
2823                      * nonzero if we do */
2824                     if ( charid ) {
2825
2826                         U16 check;
2827                         U32 newstate = 0;
2828
2829                         charid--;
2830                         if ( !trie->states[ state ].trans.list ) {
2831                             TRIE_LIST_NEW( state );
2832                         }
2833                         for ( check = 1;
2834                               check <= TRIE_LIST_USED( state );
2835                               check++ )
2836                         {
2837                             if ( TRIE_LIST_ITEM( state, check ).forid
2838                                                                     == charid )
2839                             {
2840                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2841                                 break;
2842                             }
2843                         }
2844                         if ( ! newstate ) {
2845                             newstate = next_alloc++;
2846                             prev_states[newstate] = state;
2847                             TRIE_LIST_PUSH( state, charid, newstate );
2848                             transcount++;
2849                         }
2850                         state = newstate;
2851                     } else {
2852                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
2853                     }
2854                 }
2855             }
2856             TRIE_HANDLE_WORD(state);
2857
2858         } /* end second pass */
2859
2860         /* next alloc is the NEXT state to be allocated */
2861         trie->statecount = next_alloc;
2862         trie->states = (reg_trie_state *)
2863             PerlMemShared_realloc( trie->states,
2864                                    next_alloc
2865                                    * sizeof(reg_trie_state) );
2866
2867         /* and now dump it out before we compress it */
2868         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2869                                                          revcharmap, next_alloc,
2870                                                          depth+1)
2871         );
2872
2873         trie->trans = (reg_trie_trans *)
2874             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2875         {
2876             U32 state;
2877             U32 tp = 0;
2878             U32 zp = 0;
2879
2880
2881             for( state=1 ; state < next_alloc ; state ++ ) {
2882                 U32 base=0;
2883
2884                 /*
2885                 DEBUG_TRIE_COMPILE_MORE_r(
2886                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
2887                 );
2888                 */
2889
2890                 if (trie->states[state].trans.list) {
2891                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2892                     U16 maxid=minid;
2893                     U16 idx;
2894
2895                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2896                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2897                         if ( forid < minid ) {
2898                             minid=forid;
2899                         } else if ( forid > maxid ) {
2900                             maxid=forid;
2901                         }
2902                     }
2903                     if ( transcount < tp + maxid - minid + 1) {
2904                         transcount *= 2;
2905                         trie->trans = (reg_trie_trans *)
2906                             PerlMemShared_realloc( trie->trans,
2907                                                      transcount
2908                                                      * sizeof(reg_trie_trans) );
2909                         Zero( trie->trans + (transcount / 2),
2910                               transcount / 2,
2911                               reg_trie_trans );
2912                     }
2913                     base = trie->uniquecharcount + tp - minid;
2914                     if ( maxid == minid ) {
2915                         U32 set = 0;
2916                         for ( ; zp < tp ; zp++ ) {
2917                             if ( ! trie->trans[ zp ].next ) {
2918                                 base = trie->uniquecharcount + zp - minid;
2919                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2920                                                                    1).newstate;
2921                                 trie->trans[ zp ].check = state;
2922                                 set = 1;
2923                                 break;
2924                             }
2925                         }
2926                         if ( !set ) {
2927                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2928                                                                    1).newstate;
2929                             trie->trans[ tp ].check = state;
2930                             tp++;
2931                             zp = tp;
2932                         }
2933                     } else {
2934                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2935                             const U32 tid = base
2936                                            - trie->uniquecharcount
2937                                            + TRIE_LIST_ITEM( state, idx ).forid;
2938                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2939                                                                 idx ).newstate;
2940                             trie->trans[ tid ].check = state;
2941                         }
2942                         tp += ( maxid - minid + 1 );
2943                     }
2944                     Safefree(trie->states[ state ].trans.list);
2945                 }
2946                 /*
2947                 DEBUG_TRIE_COMPILE_MORE_r(
2948                     Perl_re_printf( aTHX_  " base: %d\n",base);
2949                 );
2950                 */
2951                 trie->states[ state ].trans.base=base;
2952             }
2953             trie->lasttrans = tp + 1;
2954         }
2955     } else {
2956         /*
2957            Second Pass -- Flat Table Representation.
2958
2959            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2960            each.  We know that we will need Charcount+1 trans at most to store
2961            the data (one row per char at worst case) So we preallocate both
2962            structures assuming worst case.
2963
2964            We then construct the trie using only the .next slots of the entry
2965            structs.
2966
2967            We use the .check field of the first entry of the node temporarily
2968            to make compression both faster and easier by keeping track of how
2969            many non zero fields are in the node.
2970
2971            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2972            transition.
2973
2974            There are two terms at use here: state as a TRIE_NODEIDX() which is
2975            a number representing the first entry of the node, and state as a
2976            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2977            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2978            if there are 2 entrys per node. eg:
2979
2980              A B       A B
2981           1. 2 4    1. 3 7
2982           2. 0 3    3. 0 5
2983           3. 0 0    5. 0 0
2984           4. 0 0    7. 0 0
2985
2986            The table is internally in the right hand, idx form. However as we
2987            also have to deal with the states array which is indexed by nodenum
2988            we have to use TRIE_NODENUM() to convert.
2989
2990         */
2991         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
2992             depth+1));
2993
2994         trie->trans = (reg_trie_trans *)
2995             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2996                                   * trie->uniquecharcount + 1,
2997                                   sizeof(reg_trie_trans) );
2998         trie->states = (reg_trie_state *)
2999             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3000                                   sizeof(reg_trie_state) );
3001         next_alloc = trie->uniquecharcount + 1;
3002
3003
3004         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3005
3006             regnode *noper   = NEXTOPER( cur );
3007
3008             U32 state        = 1;         /* required init */
3009
3010             U16 charid       = 0;         /* sanity init */
3011             U32 accept_state = 0;         /* sanity init */
3012
3013             U32 wordlen      = 0;         /* required init */
3014
3015             if (OP(noper) == NOTHING) {
3016                 regnode *noper_next= regnext(noper);
3017                 if (noper_next < tail)
3018                     noper= noper_next;
3019             }
3020
3021             if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3022                 const U8 *uc= (U8*)STRING(noper);
3023                 const U8 *e= uc + STR_LEN(noper);
3024
3025                 for ( ; uc < e ; uc += len ) {
3026
3027                     TRIE_READ_CHAR;
3028
3029                     if ( uvc < 256 ) {
3030                         charid = trie->charmap[ uvc ];
3031                     } else {
3032                         SV* const * const svpp = hv_fetch( widecharmap,
3033                                                            (char*)&uvc,
3034                                                            sizeof( UV ),
3035                                                            0);
3036                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3037                     }
3038                     if ( charid ) {
3039                         charid--;
3040                         if ( !trie->trans[ state + charid ].next ) {
3041                             trie->trans[ state + charid ].next = next_alloc;
3042                             trie->trans[ state ].check++;
3043                             prev_states[TRIE_NODENUM(next_alloc)]
3044                                     = TRIE_NODENUM(state);
3045                             next_alloc += trie->uniquecharcount;
3046                         }
3047                         state = trie->trans[ state + charid ].next;
3048                     } else {
3049                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3050                     }
3051                     /* charid is now 0 if we dont know the char read, or
3052                      * nonzero if we do */
3053                 }
3054             }
3055             accept_state = TRIE_NODENUM( state );
3056             TRIE_HANDLE_WORD(accept_state);
3057
3058         } /* end second pass */
3059
3060         /* and now dump it out before we compress it */
3061         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3062                                                           revcharmap,
3063                                                           next_alloc, depth+1));
3064
3065         {
3066         /*
3067            * Inplace compress the table.*
3068
3069            For sparse data sets the table constructed by the trie algorithm will
3070            be mostly 0/FAIL transitions or to put it another way mostly empty.
3071            (Note that leaf nodes will not contain any transitions.)
3072
3073            This algorithm compresses the tables by eliminating most such
3074            transitions, at the cost of a modest bit of extra work during lookup:
3075
3076            - Each states[] entry contains a .base field which indicates the
3077            index in the state[] array wheres its transition data is stored.
3078
3079            - If .base is 0 there are no valid transitions from that node.
3080
3081            - If .base is nonzero then charid is added to it to find an entry in
3082            the trans array.
3083
3084            -If trans[states[state].base+charid].check!=state then the
3085            transition is taken to be a 0/Fail transition. Thus if there are fail
3086            transitions at the front of the node then the .base offset will point
3087            somewhere inside the previous nodes data (or maybe even into a node
3088            even earlier), but the .check field determines if the transition is
3089            valid.
3090
3091            XXX - wrong maybe?
3092            The following process inplace converts the table to the compressed
3093            table: We first do not compress the root node 1,and mark all its
3094            .check pointers as 1 and set its .base pointer as 1 as well. This
3095            allows us to do a DFA construction from the compressed table later,
3096            and ensures that any .base pointers we calculate later are greater
3097            than 0.
3098
3099            - We set 'pos' to indicate the first entry of the second node.
3100
3101            - We then iterate over the columns of the node, finding the first and
3102            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3103            and set the .check pointers accordingly, and advance pos
3104            appropriately and repreat for the next node. Note that when we copy
3105            the next pointers we have to convert them from the original
3106            NODEIDX form to NODENUM form as the former is not valid post
3107            compression.
3108
3109            - If a node has no transitions used we mark its base as 0 and do not
3110            advance the pos pointer.
3111
3112            - If a node only has one transition we use a second pointer into the
3113            structure to fill in allocated fail transitions from other states.
3114            This pointer is independent of the main pointer and scans forward
3115            looking for null transitions that are allocated to a state. When it
3116            finds one it writes the single transition into the "hole".  If the
3117            pointer doesnt find one the single transition is appended as normal.
3118
3119            - Once compressed we can Renew/realloc the structures to release the
3120            excess space.
3121
3122            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3123            specifically Fig 3.47 and the associated pseudocode.
3124
3125            demq
3126         */
3127         const U32 laststate = TRIE_NODENUM( next_alloc );
3128         U32 state, charid;
3129         U32 pos = 0, zp=0;
3130         trie->statecount = laststate;
3131
3132         for ( state = 1 ; state < laststate ; state++ ) {
3133             U8 flag = 0;
3134             const U32 stateidx = TRIE_NODEIDX( state );
3135             const U32 o_used = trie->trans[ stateidx ].check;
3136             U32 used = trie->trans[ stateidx ].check;
3137             trie->trans[ stateidx ].check = 0;
3138
3139             for ( charid = 0;
3140                   used && charid < trie->uniquecharcount;
3141                   charid++ )
3142             {
3143                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3144                     if ( trie->trans[ stateidx + charid ].next ) {
3145                         if (o_used == 1) {
3146                             for ( ; zp < pos ; zp++ ) {
3147                                 if ( ! trie->trans[ zp ].next ) {
3148                                     break;
3149                                 }
3150                             }
3151                             trie->states[ state ].trans.base
3152                                                     = zp
3153                                                       + trie->uniquecharcount
3154                                                       - charid ;
3155                             trie->trans[ zp ].next
3156                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3157                                                              + charid ].next );
3158                             trie->trans[ zp ].check = state;
3159                             if ( ++zp > pos ) pos = zp;
3160                             break;
3161                         }
3162                         used--;
3163                     }
3164                     if ( !flag ) {
3165                         flag = 1;
3166                         trie->states[ state ].trans.base
3167                                        = pos + trie->uniquecharcount - charid ;
3168                     }
3169                     trie->trans[ pos ].next
3170                         = SAFE_TRIE_NODENUM(
3171                                        trie->trans[ stateidx + charid ].next );
3172                     trie->trans[ pos ].check = state;
3173                     pos++;
3174                 }
3175             }
3176         }
3177         trie->lasttrans = pos + 1;
3178         trie->states = (reg_trie_state *)
3179             PerlMemShared_realloc( trie->states, laststate
3180                                    * sizeof(reg_trie_state) );
3181         DEBUG_TRIE_COMPILE_MORE_r(
3182             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3183                 depth+1,
3184                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3185                        + 1 ),
3186                 (IV)next_alloc,
3187                 (IV)pos,
3188                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3189             );
3190
3191         } /* end table compress */
3192     }
3193     DEBUG_TRIE_COMPILE_MORE_r(
3194             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3195                 depth+1,
3196                 (UV)trie->statecount,
3197                 (UV)trie->lasttrans)
3198     );
3199     /* resize the trans array to remove unused space */
3200     trie->trans = (reg_trie_trans *)
3201         PerlMemShared_realloc( trie->trans, trie->lasttrans
3202                                * sizeof(reg_trie_trans) );
3203
3204     {   /* Modify the program and insert the new TRIE node */
3205         U8 nodetype =(U8)(flags & 0xFF);
3206         char *str=NULL;
3207
3208 #ifdef DEBUGGING
3209         regnode *optimize = NULL;
3210 #ifdef RE_TRACK_PATTERN_OFFSETS
3211
3212         U32 mjd_offset = 0;
3213         U32 mjd_nodelen = 0;
3214 #endif /* RE_TRACK_PATTERN_OFFSETS */
3215 #endif /* DEBUGGING */
3216         /*
3217            This means we convert either the first branch or the first Exact,
3218            depending on whether the thing following (in 'last') is a branch
3219            or not and whther first is the startbranch (ie is it a sub part of
3220            the alternation or is it the whole thing.)
3221            Assuming its a sub part we convert the EXACT otherwise we convert
3222            the whole branch sequence, including the first.
3223          */
3224         /* Find the node we are going to overwrite */
3225         if ( first != startbranch || OP( last ) == BRANCH ) {
3226             /* branch sub-chain */
3227             NEXT_OFF( first ) = (U16)(last - first);
3228 #ifdef RE_TRACK_PATTERN_OFFSETS
3229             DEBUG_r({
3230                 mjd_offset= Node_Offset((convert));
3231                 mjd_nodelen= Node_Length((convert));
3232             });
3233 #endif
3234             /* whole branch chain */
3235         }
3236 #ifdef RE_TRACK_PATTERN_OFFSETS
3237         else {
3238             DEBUG_r({
3239                 const  regnode *nop = NEXTOPER( convert );
3240                 mjd_offset= Node_Offset((nop));
3241                 mjd_nodelen= Node_Length((nop));
3242             });
3243         }
3244         DEBUG_OPTIMISE_r(
3245             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3246                 depth+1,
3247                 (UV)mjd_offset, (UV)mjd_nodelen)
3248         );
3249 #endif
3250         /* But first we check to see if there is a common prefix we can
3251            split out as an EXACT and put in front of the TRIE node.  */
3252         trie->startstate= 1;
3253         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3254             /* we want to find the first state that has more than
3255              * one transition, if that state is not the first state
3256              * then we have a common prefix which we can remove.
3257              */
3258             U32 state;
3259             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3260                 U32 ofs = 0;
3261                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3262                                        transition, -1 means none */
3263                 U32 count = 0;
3264                 const U32 base = trie->states[ state ].trans.base;
3265
3266                 /* does this state terminate an alternation? */
3267                 if ( trie->states[state].wordnum )
3268                         count = 1;
3269
3270                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3271                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3272                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3273                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3274                     {
3275                         if ( ++count > 1 ) {
3276                             /* we have more than one transition */
3277                             SV **tmp;
3278                             U8 *ch;
3279                             /* if this is the first state there is no common prefix
3280                              * to extract, so we can exit */
3281                             if ( state == 1 ) break;
3282                             tmp = av_fetch( revcharmap, ofs, 0);
3283                             ch = (U8*)SvPV_nolen_const( *tmp );
3284
3285                             /* if we are on count 2 then we need to initialize the
3286                              * bitmap, and store the previous char if there was one
3287                              * in it*/
3288                             if ( count == 2 ) {
3289                                 /* clear the bitmap */
3290                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3291                                 DEBUG_OPTIMISE_r(
3292                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3293                                         depth+1,
3294                                         (UV)state));
3295                                 if (first_ofs >= 0) {
3296                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3297                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3298
3299                                     TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
3300                                     DEBUG_OPTIMISE_r(
3301                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3302                                     );
3303                                 }
3304                             }
3305                             /* store the current firstchar in the bitmap */
3306                             TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
3307                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3308                         }
3309                         first_ofs = ofs;
3310                     }
3311                 }
3312                 if ( count == 1 ) {
3313                     /* This state has only one transition, its transition is part
3314                      * of a common prefix - we need to concatenate the char it
3315                      * represents to what we have so far. */
3316                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3317                     STRLEN len;
3318                     char *ch = SvPV( *tmp, len );
3319                     DEBUG_OPTIMISE_r({
3320                         SV *sv=sv_newmortal();
3321                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3322                             depth+1,
3323                             (UV)state, (UV)first_ofs,
3324                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3325                                 PL_colors[0], PL_colors[1],
3326                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3327                                 PERL_PV_ESCAPE_FIRSTCHAR
3328                             )
3329                         );
3330                     });
3331                     if ( state==1 ) {
3332                         OP( convert ) = nodetype;
3333                         str=STRING(convert);
3334                         STR_LEN(convert)=0;
3335                     }
3336                     STR_LEN(convert) += len;
3337                     while (len--)
3338                         *str++ = *ch++;
3339                 } else {
3340 #ifdef DEBUGGING
3341                     if (state>1)
3342                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3343 #endif
3344                     break;
3345                 }
3346             }
3347             trie->prefixlen = (state-1);
3348             if (str) {
3349                 regnode *n = convert+NODE_SZ_STR(convert);
3350                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3351                 trie->startstate = state;
3352                 trie->minlen -= (state - 1);
3353                 trie->maxlen -= (state - 1);
3354 #ifdef DEBUGGING
3355                /* At least the UNICOS C compiler choked on this
3356                 * being argument to DEBUG_r(), so let's just have
3357                 * it right here. */
3358                if (
3359 #ifdef PERL_EXT_RE_BUILD
3360                    1
3361 #else
3362                    DEBUG_r_TEST
3363 #endif
3364                    ) {
3365                    regnode *fix = convert;
3366                    U32 word = trie->wordcount;
3367                    mjd_nodelen++;
3368                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3369                    while( ++fix < n ) {
3370                        Set_Node_Offset_Length(fix, 0, 0);
3371                    }
3372                    while (word--) {
3373                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3374                        if (tmp) {
3375                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3376                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3377                            else
3378                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3379                        }
3380                    }
3381                }
3382 #endif
3383                 if (trie->maxlen) {
3384                     convert = n;
3385                 } else {
3386                     NEXT_OFF(convert) = (U16)(tail - convert);
3387                     DEBUG_r(optimize= n);
3388                 }
3389             }
3390         }
3391         if (!jumper)
3392             jumper = last;
3393         if ( trie->maxlen ) {
3394             NEXT_OFF( convert ) = (U16)(tail - convert);
3395             ARG_SET( convert, data_slot );
3396             /* Store the offset to the first unabsorbed branch in
3397                jump[0], which is otherwise unused by the jump logic.
3398                We use this when dumping a trie and during optimisation. */
3399             if (trie->jump)
3400                 trie->jump[0] = (U16)(nextbranch - convert);
3401
3402             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3403              *   and there is a bitmap
3404              *   and the first "jump target" node we found leaves enough room
3405              * then convert the TRIE node into a TRIEC node, with the bitmap
3406              * embedded inline in the opcode - this is hypothetically faster.
3407              */
3408             if ( !trie->states[trie->startstate].wordnum
3409                  && trie->bitmap
3410                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3411             {
3412                 OP( convert ) = TRIEC;
3413                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3414                 PerlMemShared_free(trie->bitmap);
3415                 trie->bitmap= NULL;
3416             } else
3417                 OP( convert ) = TRIE;
3418
3419             /* store the type in the flags */
3420             convert->flags = nodetype;
3421             DEBUG_r({
3422             optimize = convert
3423                       + NODE_STEP_REGNODE
3424                       + regarglen[ OP( convert ) ];
3425             });
3426             /* XXX We really should free up the resource in trie now,
3427                    as we won't use them - (which resources?) dmq */
3428         }
3429         /* needed for dumping*/
3430         DEBUG_r(if (optimize) {
3431             regnode *opt = convert;
3432
3433             while ( ++opt < optimize) {
3434                 Set_Node_Offset_Length(opt,0,0);
3435             }
3436             /*
3437                 Try to clean up some of the debris left after the
3438                 optimisation.
3439              */
3440             while( optimize < jumper ) {
3441                 mjd_nodelen += Node_Length((optimize));
3442                 OP( optimize ) = OPTIMIZED;
3443                 Set_Node_Offset_Length(optimize,0,0);
3444                 optimize++;
3445             }
3446             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3447         });
3448     } /* end node insert */
3449
3450     /*  Finish populating the prev field of the wordinfo array.  Walk back
3451      *  from each accept state until we find another accept state, and if
3452      *  so, point the first word's .prev field at the second word. If the
3453      *  second already has a .prev field set, stop now. This will be the
3454      *  case either if we've already processed that word's accept state,
3455      *  or that state had multiple words, and the overspill words were
3456      *  already linked up earlier.
3457      */
3458     {
3459         U16 word;
3460         U32 state;
3461         U16 prev;
3462
3463         for (word=1; word <= trie->wordcount; word++) {
3464             prev = 0;
3465             if (trie->wordinfo[word].prev)
3466                 continue;
3467             state = trie->wordinfo[word].accept;
3468             while (state) {
3469                 state = prev_states[state];
3470                 if (!state)
3471                     break;
3472                 prev = trie->states[state].wordnum;
3473                 if (prev)
3474                     break;
3475             }
3476             trie->wordinfo[word].prev = prev;
3477         }
3478         Safefree(prev_states);
3479     }
3480
3481
3482     /* and now dump out the compressed format */
3483     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3484
3485     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3486 #ifdef DEBUGGING
3487     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3488     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3489 #else
3490     SvREFCNT_dec_NN(revcharmap);
3491 #endif
3492     return trie->jump
3493            ? MADE_JUMP_TRIE
3494            : trie->startstate>1
3495              ? MADE_EXACT_TRIE
3496              : MADE_TRIE;
3497 }
3498
3499 STATIC regnode *
3500 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3501 {
3502 /* The Trie is constructed and compressed now so we can build a fail array if
3503  * it's needed
3504
3505    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3506    3.32 in the
3507    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3508    Ullman 1985/88
3509    ISBN 0-201-10088-6
3510
3511    We find the fail state for each state in the trie, this state is the longest
3512    proper suffix of the current state's 'word' that is also a proper prefix of
3513    another word in our trie. State 1 represents the word '' and is thus the
3514    default fail state. This allows the DFA not to have to restart after its
3515    tried and failed a word at a given point, it simply continues as though it
3516    had been matching the other word in the first place.
3517    Consider
3518       'abcdgu'=~/abcdefg|cdgu/
3519    When we get to 'd' we are still matching the first word, we would encounter
3520    'g' which would fail, which would bring us to the state representing 'd' in
3521    the second word where we would try 'g' and succeed, proceeding to match
3522    'cdgu'.
3523  */
3524  /* add a fail transition */
3525     const U32 trie_offset = ARG(source);
3526     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3527     U32 *q;
3528     const U32 ucharcount = trie->uniquecharcount;
3529     const U32 numstates = trie->statecount;
3530     const U32 ubound = trie->lasttrans + ucharcount;
3531     U32 q_read = 0;
3532     U32 q_write = 0;
3533     U32 charid;
3534     U32 base = trie->states[ 1 ].trans.base;
3535     U32 *fail;
3536     reg_ac_data *aho;
3537     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3538     regnode *stclass;
3539     GET_RE_DEBUG_FLAGS_DECL;
3540
3541     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3542     PERL_UNUSED_CONTEXT;
3543 #ifndef DEBUGGING
3544     PERL_UNUSED_ARG(depth);
3545 #endif
3546
3547     if ( OP(source) == TRIE ) {
3548         struct regnode_1 *op = (struct regnode_1 *)
3549             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3550         StructCopy(source,op,struct regnode_1);
3551         stclass = (regnode *)op;
3552     } else {
3553         struct regnode_charclass *op = (struct regnode_charclass *)
3554             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3555         StructCopy(source,op,struct regnode_charclass);
3556         stclass = (regnode *)op;
3557     }
3558     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3559
3560     ARG_SET( stclass, data_slot );
3561     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3562     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3563     aho->trie=trie_offset;
3564     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3565     Copy( trie->states, aho->states, numstates, reg_trie_state );
3566     Newxz( q, numstates, U32);
3567     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3568     aho->refcount = 1;
3569     fail = aho->fail;
3570     /* initialize fail[0..1] to be 1 so that we always have
3571        a valid final fail state */
3572     fail[ 0 ] = fail[ 1 ] = 1;
3573
3574     for ( charid = 0; charid < ucharcount ; charid++ ) {
3575         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3576         if ( newstate ) {
3577             q[ q_write ] = newstate;
3578             /* set to point at the root */
3579             fail[ q[ q_write++ ] ]=1;
3580         }
3581     }
3582     while ( q_read < q_write) {
3583         const U32 cur = q[ q_read++ % numstates ];
3584         base = trie->states[ cur ].trans.base;
3585
3586         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3587             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3588             if (ch_state) {
3589                 U32 fail_state = cur;
3590                 U32 fail_base;
3591                 do {
3592                     fail_state = fail[ fail_state ];
3593                     fail_base = aho->states[ fail_state ].trans.base;
3594                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3595
3596                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3597                 fail[ ch_state ] = fail_state;
3598                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3599                 {
3600                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3601                 }
3602                 q[ q_write++ % numstates] = ch_state;
3603             }
3604         }
3605     }
3606     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3607        when we fail in state 1, this allows us to use the
3608        charclass scan to find a valid start char. This is based on the principle
3609        that theres a good chance the string being searched contains lots of stuff
3610        that cant be a start char.
3611      */
3612     fail[ 0 ] = fail[ 1 ] = 0;
3613     DEBUG_TRIE_COMPILE_r({
3614         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3615                       depth, (UV)numstates
3616         );
3617         for( q_read=1; q_read<numstates; q_read++ ) {
3618             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3619         }
3620         Perl_re_printf( aTHX_  "\n");
3621     });
3622     Safefree(q);
3623     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3624     return stclass;
3625 }
3626
3627
3628 #define DEBUG_PEEP(str,scan,depth)         \
3629     DEBUG_OPTIMISE_r({if (scan){           \
3630        regnode *Next = regnext(scan);      \
3631        regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);\
3632        Perl_re_indentf( aTHX_  "" str ">%3d: %s (%d)", \
3633            depth, REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3634            Next ? (REG_NODE_NUM(Next)) : 0 );\
3635        DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3636        Perl_re_printf( aTHX_  "\n");                   \
3637    }});
3638
3639 /* The below joins as many adjacent EXACTish nodes as possible into a single
3640  * one.  The regop may be changed if the node(s) contain certain sequences that
3641  * require special handling.  The joining is only done if:
3642  * 1) there is room in the current conglomerated node to entirely contain the
3643  *    next one.
3644  * 2) they are the exact same node type
3645  *
3646  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3647  * these get optimized out
3648  *
3649  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3650  * as possible, even if that means splitting an existing node so that its first
3651  * part is moved to the preceeding node.  This would maximise the efficiency of
3652  * memEQ during matching.  Elsewhere in this file, khw proposes splitting
3653  * EXACTFish nodes into portions that don't change under folding vs those that
3654  * do.  Those portions that don't change may be the only things in the pattern that
3655  * could be used to find fixed and floating strings.
3656  *
3657  * If a node is to match under /i (folded), the number of characters it matches
3658  * can be different than its character length if it contains a multi-character
3659  * fold.  *min_subtract is set to the total delta number of characters of the
3660  * input nodes.
3661  *
3662  * And *unfolded_multi_char is set to indicate whether or not the node contains
3663  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3664  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3665  * SMALL LETTER SHARP S, as only if the target string being matched against
3666  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3667  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3668  * whose components are all above the Latin1 range are not run-time locale
3669  * dependent, and have already been folded by the time this function is
3670  * called.)
3671  *
3672  * This is as good a place as any to discuss the design of handling these
3673  * multi-character fold sequences.  It's been wrong in Perl for a very long
3674  * time.  There are three code points in Unicode whose multi-character folds
3675  * were long ago discovered to mess things up.  The previous designs for
3676  * dealing with these involved assigning a special node for them.  This
3677  * approach doesn't always work, as evidenced by this example:
3678  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3679  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3680  * would match just the \xDF, it won't be able to handle the case where a
3681  * successful match would have to cross the node's boundary.  The new approach
3682  * that hopefully generally solves the problem generates an EXACTFU_SS node
3683  * that is "sss" in this case.
3684  *
3685  * It turns out that there are problems with all multi-character folds, and not
3686  * just these three.  Now the code is general, for all such cases.  The
3687  * approach taken is:
3688  * 1)   This routine examines each EXACTFish node that could contain multi-
3689  *      character folded sequences.  Since a single character can fold into
3690  *      such a sequence, the minimum match length for this node is less than
3691  *      the number of characters in the node.  This routine returns in
3692  *      *min_subtract how many characters to subtract from the the actual
3693  *      length of the string to get a real minimum match length; it is 0 if
3694  *      there are no multi-char foldeds.  This delta is used by the caller to
3695  *      adjust the min length of the match, and the delta between min and max,
3696  *      so that the optimizer doesn't reject these possibilities based on size
3697  *      constraints.
3698  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3699  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3700  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3701  *      there is a possible fold length change.  That means that a regular
3702  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3703  *      with length changes, and so can be processed faster.  regexec.c takes
3704  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3705  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3706  *      known until runtime).  This saves effort in regex matching.  However,
3707  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3708  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3709  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3710  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3711  *      possibilities for the non-UTF8 patterns are quite simple, except for
3712  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3713  *      members of a fold-pair, and arrays are set up for all of them so that
3714  *      the other member of the pair can be found quickly.  Code elsewhere in
3715  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3716  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3717  *      described in the next item.
3718  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3719  *      validity of the fold won't be known until runtime, and so must remain
3720  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3721  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3722  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3723  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3724  *      The reason this is a problem is that the optimizer part of regexec.c
3725  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3726  *      that a character in the pattern corresponds to at most a single
3727  *      character in the target string.  (And I do mean character, and not byte
3728  *      here, unlike other parts of the documentation that have never been
3729  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3730  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3731  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3732  *      nodes, violate the assumption, and they are the only instances where it
3733  *      is violated.  I'm reluctant to try to change the assumption, as the
3734  *      code involved is impenetrable to me (khw), so instead the code here
3735  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3736  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3737  *      boolean indicating whether or not the node contains such a fold.  When
3738  *      it is true, the caller sets a flag that later causes the optimizer in
3739  *      this file to not set values for the floating and fixed string lengths,
3740  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3741  *      assumption.  Thus, there is no optimization based on string lengths for
3742  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3743  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3744  *      assumption is wrong only in these cases is that all other non-UTF-8
3745  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3746  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3747  *      EXACTF nodes because we don't know at compile time if it actually
3748  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3749  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3750  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3751  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3752  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3753  *      string would require the pattern to be forced into UTF-8, the overhead
3754  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3755  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3756  *      locale.)
3757  *
3758  *      Similarly, the code that generates tries doesn't currently handle
3759  *      not-already-folded multi-char folds, and it looks like a pain to change
3760  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3761  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3762  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3763  *      using /iaa matching will be doing so almost entirely with ASCII
3764  *      strings, so this should rarely be encountered in practice */
3765
3766 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3767     if (PL_regkind[OP(scan)] == EXACT) \
3768         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3769
3770 STATIC U32
3771 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3772                    UV *min_subtract, bool *unfolded_multi_char,
3773                    U32 flags,regnode *val, U32 depth)
3774 {
3775     /* Merge several consecutive EXACTish nodes into one. */
3776     regnode *n = regnext(scan);
3777     U32 stringok = 1;
3778     regnode *next = scan + NODE_SZ_STR(scan);
3779     U32 merged = 0;
3780     U32 stopnow = 0;
3781 #ifdef DEBUGGING
3782     regnode *stop = scan;
3783     GET_RE_DEBUG_FLAGS_DECL;
3784 #else
3785     PERL_UNUSED_ARG(depth);
3786 #endif
3787
3788     PERL_ARGS_ASSERT_JOIN_EXACT;
3789 #ifndef EXPERIMENTAL_INPLACESCAN
3790     PERL_UNUSED_ARG(flags);
3791     PERL_UNUSED_ARG(val);
3792 #endif
3793     DEBUG_PEEP("join",scan,depth);
3794
3795     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3796      * EXACT ones that are mergeable to the current one. */
3797     while (n
3798            && (PL_regkind[OP(n)] == NOTHING
3799                || (stringok && OP(n) == OP(scan)))
3800            && NEXT_OFF(n)
3801            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3802     {
3803
3804         if (OP(n) == TAIL || n > next)
3805             stringok = 0;
3806         if (PL_regkind[OP(n)] == NOTHING) {
3807             DEBUG_PEEP("skip:",n,depth);
3808             NEXT_OFF(scan) += NEXT_OFF(n);
3809             next = n + NODE_STEP_REGNODE;
3810 #ifdef DEBUGGING
3811             if (stringok)
3812                 stop = n;
3813 #endif
3814             n = regnext(n);
3815         }
3816         else if (stringok) {
3817             const unsigned int oldl = STR_LEN(scan);
3818             regnode * const nnext = regnext(n);
3819
3820             /* XXX I (khw) kind of doubt that this works on platforms (should
3821              * Perl ever run on one) where U8_MAX is above 255 because of lots
3822              * of other assumptions */
3823             /* Don't join if the sum can't fit into a single node */
3824             if (oldl + STR_LEN(n) > U8_MAX)
3825                 break;
3826
3827             DEBUG_PEEP("merg",n,depth);
3828             merged++;
3829
3830             NEXT_OFF(scan) += NEXT_OFF(n);
3831             STR_LEN(scan) += STR_LEN(n);
3832             next = n + NODE_SZ_STR(n);
3833             /* Now we can overwrite *n : */
3834             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3835 #ifdef DEBUGGING
3836             stop = next - 1;
3837 #endif
3838             n = nnext;
3839             if (stopnow) break;
3840         }
3841
3842 #ifdef EXPERIMENTAL_INPLACESCAN
3843         if (flags && !NEXT_OFF(n)) {
3844             DEBUG_PEEP("atch", val, depth);
3845             if (reg_off_by_arg[OP(n)]) {
3846                 ARG_SET(n, val - n);
3847             }
3848             else {
3849                 NEXT_OFF(n) = val - n;
3850             }
3851             stopnow = 1;
3852         }
3853 #endif
3854     }
3855
3856     *min_subtract = 0;
3857     *unfolded_multi_char = FALSE;
3858
3859     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3860      * can now analyze for sequences of problematic code points.  (Prior to
3861      * this final joining, sequences could have been split over boundaries, and
3862      * hence missed).  The sequences only happen in folding, hence for any
3863      * non-EXACT EXACTish node */
3864     if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3865         U8* s0 = (U8*) STRING(scan);
3866         U8* s = s0;
3867         U8* s_end = s0 + STR_LEN(scan);
3868
3869         int total_count_delta = 0;  /* Total delta number of characters that
3870                                        multi-char folds expand to */
3871
3872         /* One pass is made over the node's string looking for all the
3873          * possibilities.  To avoid some tests in the loop, there are two main
3874          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3875          * non-UTF-8 */
3876         if (UTF) {
3877             U8* folded = NULL;
3878
3879             if (OP(scan) == EXACTFL) {
3880                 U8 *d;
3881
3882                 /* An EXACTFL node would already have been changed to another
3883                  * node type unless there is at least one character in it that
3884                  * is problematic; likely a character whose fold definition
3885                  * won't be known until runtime, and so has yet to be folded.
3886                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3887                  * to handle the UTF-8 case, we need to create a temporary
3888                  * folded copy using UTF-8 locale rules in order to analyze it.
3889                  * This is because our macros that look to see if a sequence is
3890                  * a multi-char fold assume everything is folded (otherwise the
3891                  * tests in those macros would be too complicated and slow).
3892                  * Note that here, the non-problematic folds will have already
3893                  * been done, so we can just copy such characters.  We actually
3894                  * don't completely fold the EXACTFL string.  We skip the
3895                  * unfolded multi-char folds, as that would just create work
3896                  * below to figure out the size they already are */
3897
3898                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3899                 d = folded;
3900                 while (s < s_end) {
3901                     STRLEN s_len = UTF8SKIP(s);
3902                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3903                         Copy(s, d, s_len, U8);
3904                         d += s_len;
3905                     }
3906                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3907                         *unfolded_multi_char = TRUE;
3908                         Copy(s, d, s_len, U8);
3909                         d += s_len;
3910                     }
3911                     else if (isASCII(*s)) {
3912                         *(d++) = toFOLD(*s);
3913                     }
3914                     else {
3915                         STRLEN len;
3916                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
3917                         d += len;
3918                     }
3919                     s += s_len;
3920                 }
3921
3922                 /* Point the remainder of the routine to look at our temporary
3923                  * folded copy */
3924                 s = folded;
3925                 s_end = d;
3926             } /* End of creating folded copy of EXACTFL string */
3927
3928             /* Examine the string for a multi-character fold sequence.  UTF-8
3929              * patterns have all characters pre-folded by the time this code is
3930              * executed */
3931             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3932                                      length sequence we are looking for is 2 */
3933             {
3934                 int count = 0;  /* How many characters in a multi-char fold */
3935                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3936                 if (! len) {    /* Not a multi-char fold: get next char */
3937                     s += UTF8SKIP(s);
3938                     continue;
3939                 }
3940
3941                 /* Nodes with 'ss' require special handling, except for
3942                  * EXACTFA-ish for which there is no multi-char fold to this */
3943                 if (len == 2 && *s == 's' && *(s+1) == 's'
3944                     && OP(scan) != EXACTFA
3945                     && OP(scan) != EXACTFA_NO_TRIE)
3946                 {
3947                     count = 2;
3948                     if (OP(scan) != EXACTFL) {
3949                         OP(scan) = EXACTFU_SS;
3950                     }
3951                     s += 2;
3952                 }
3953                 else { /* Here is a generic multi-char fold. */
3954                     U8* multi_end  = s + len;
3955
3956                     /* Count how many characters are in it.  In the case of
3957                      * /aa, no folds which contain ASCII code points are
3958                      * allowed, so check for those, and skip if found. */
3959                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3960                         count = utf8_length(s, multi_end);
3961                         s = multi_end;
3962                     }
3963                     else {
3964                         while (s < multi_end) {
3965                             if (isASCII(*s)) {
3966                                 s++;
3967                                 goto next_iteration;
3968                             }
3969                             else {
3970                                 s += UTF8SKIP(s);
3971                             }
3972                             count++;
3973                         }
3974                     }
3975                 }
3976
3977                 /* The delta is how long the sequence is minus 1 (1 is how long
3978                  * the character that folds to the sequence is) */
3979                 total_count_delta += count - 1;
3980               next_iteration: ;
3981             }
3982
3983             /* We created a temporary folded copy of the string in EXACTFL
3984              * nodes.  Therefore we need to be sure it doesn't go below zero,
3985              * as the real string could be shorter */
3986             if (OP(scan) == EXACTFL) {
3987                 int total_chars = utf8_length((U8*) STRING(scan),
3988                                            (U8*) STRING(scan) + STR_LEN(scan));
3989                 if (total_count_delta > total_chars) {
3990                     total_count_delta = total_chars;
3991                 }
3992             }
3993
3994             *min_subtract += total_count_delta;
3995             Safefree(folded);
3996         }
3997         else if (OP(scan) == EXACTFA) {
3998
3999             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
4000              * fold to the ASCII range (and there are no existing ones in the
4001              * upper latin1 range).  But, as outlined in the comments preceding
4002              * this function, we need to flag any occurrences of the sharp s.
4003              * This character forbids trie formation (because of added
4004              * complexity) */
4005 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4006    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4007                                       || UNICODE_DOT_DOT_VERSION > 0)
4008             while (s < s_end) {
4009                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4010                     OP(scan) = EXACTFA_NO_TRIE;
4011                     *unfolded_multi_char = TRUE;
4012                     break;
4013                 }
4014                 s++;
4015             }
4016         }
4017         else {
4018
4019             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
4020              * folds that are all Latin1.  As explained in the comments
4021              * preceding this function, we look also for the sharp s in EXACTF
4022              * and EXACTFL nodes; it can be in the final position.  Otherwise
4023              * we can stop looking 1 byte earlier because have to find at least
4024              * two characters for a multi-fold */
4025             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4026                               ? s_end
4027                               : s_end -1;
4028
4029             while (s < upper) {
4030                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4031                 if (! len) {    /* Not a multi-char fold. */
4032                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4033                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4034                     {
4035                         *unfolded_multi_char = TRUE;
4036                     }
4037                     s++;
4038                     continue;
4039                 }
4040
4041                 if (len == 2
4042                     && isALPHA_FOLD_EQ(*s, 's')
4043                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4044                 {
4045
4046                     /* EXACTF nodes need to know that the minimum length
4047                      * changed so that a sharp s in the string can match this
4048                      * ss in the pattern, but they remain EXACTF nodes, as they
4049                      * won't match this unless the target string is is UTF-8,
4050                      * which we don't know until runtime.  EXACTFL nodes can't
4051                      * transform into EXACTFU nodes */
4052                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4053                         OP(scan) = EXACTFU_SS;
4054                     }
4055                 }
4056
4057                 *min_subtract += len - 1;
4058                 s += len;
4059             }
4060 #endif
4061         }
4062     }
4063
4064 #ifdef DEBUGGING
4065     /* Allow dumping but overwriting the collection of skipped
4066      * ops and/or strings with fake optimized ops */
4067     n = scan + NODE_SZ_STR(scan);
4068     while (n <= stop) {
4069         OP(n) = OPTIMIZED;
4070         FLAGS(n) = 0;
4071         NEXT_OFF(n) = 0;
4072         n++;
4073     }
4074 #endif
4075     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
4076     return stopnow;
4077 }
4078
4079 /* REx optimizer.  Converts nodes into quicker variants "in place".
4080    Finds fixed substrings.  */
4081
4082 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4083    to the position after last scanned or to NULL. */
4084
4085 #define INIT_AND_WITHP \
4086     assert(!and_withp); \
4087     Newx(and_withp,1, regnode_ssc); \
4088     SAVEFREEPV(and_withp)
4089
4090
4091 static void
4092 S_unwind_scan_frames(pTHX_ const void *p)
4093 {
4094     scan_frame *f= (scan_frame *)p;
4095     do {
4096         scan_frame *n= f->next_frame;
4097         Safefree(f);
4098         f= n;
4099     } while (f);
4100 }
4101
4102
4103 STATIC SSize_t
4104 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4105                         SSize_t *minlenp, SSize_t *deltap,
4106                         regnode *last,
4107                         scan_data_t *data,
4108                         I32 stopparen,
4109                         U32 recursed_depth,
4110                         regnode_ssc *and_withp,
4111                         U32 flags, U32 depth)
4112                         /* scanp: Start here (read-write). */
4113                         /* deltap: Write maxlen-minlen here. */
4114                         /* last: Stop before this one. */
4115                         /* data: string data about the pattern */
4116                         /* stopparen: treat close N as END */
4117                         /* recursed: which subroutines have we recursed into */
4118                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4119 {
4120     /* There must be at least this number of characters to match */
4121     SSize_t min = 0;
4122     I32 pars = 0, code;
4123     regnode *scan = *scanp, *next;
4124     SSize_t delta = 0;
4125     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4126     int is_inf_internal = 0;            /* The studied chunk is infinite */
4127     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4128     scan_data_t data_fake;
4129     SV *re_trie_maxbuff = NULL;
4130     regnode *first_non_open = scan;
4131     SSize_t stopmin = SSize_t_MAX;
4132     scan_frame *frame = NULL;
4133     GET_RE_DEBUG_FLAGS_DECL;
4134
4135     PERL_ARGS_ASSERT_STUDY_CHUNK;
4136     RExC_study_started= 1;
4137
4138
4139     if ( depth == 0 ) {
4140         while (first_non_open && OP(first_non_open) == OPEN)
4141             first_non_open=regnext(first_non_open);
4142     }
4143
4144
4145   fake_study_recurse:
4146     DEBUG_r(
4147         RExC_study_chunk_recursed_count++;
4148     );
4149     DEBUG_OPTIMISE_MORE_r(
4150     {
4151         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4152             depth, (long)stopparen,
4153             (unsigned long)RExC_study_chunk_recursed_count,
4154             (unsigned long)depth, (unsigned long)recursed_depth,
4155             scan,
4156             last);
4157         if (recursed_depth) {
4158             U32 i;
4159             U32 j;
4160             for ( j = 0 ; j < recursed_depth ; j++ ) {
4161                 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
4162                     if (
4163                         PAREN_TEST(RExC_study_chunk_recursed +
4164                                    ( j * RExC_study_chunk_recursed_bytes), i )
4165                         && (
4166                             !j ||
4167                             !PAREN_TEST(RExC_study_chunk_recursed +
4168                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4169                         )
4170                     ) {
4171                         Perl_re_printf( aTHX_ " %d",(int)i);
4172                         break;
4173                     }
4174                 }
4175                 if ( j + 1 < recursed_depth ) {
4176                     Perl_re_printf( aTHX_  ",");
4177                 }
4178             }
4179         }
4180         Perl_re_printf( aTHX_ "\n");
4181     }
4182     );
4183     while ( scan && OP(scan) != END && scan < last ){
4184         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4185                                    node length to get a real minimum (because
4186                                    the folded version may be shorter) */
4187         bool unfolded_multi_char = FALSE;
4188         /* Peephole optimizer: */
4189         DEBUG_STUDYDATA("Peep:", data, depth);
4190         DEBUG_PEEP("Peep", scan, depth);
4191
4192
4193         /* The reason we do this here is that we need to deal with things like
4194          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4195          * parsing code, as each (?:..) is handled by a different invocation of
4196          * reg() -- Yves
4197          */
4198         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4199
4200         /* Follow the next-chain of the current node and optimize
4201            away all the NOTHINGs from it.  */
4202         if (OP(scan) != CURLYX) {
4203             const int max = (reg_off_by_arg[OP(scan)]
4204                        ? I32_MAX
4205                        /* I32 may be smaller than U16 on CRAYs! */
4206                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4207             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4208             int noff;
4209             regnode *n = scan;
4210
4211             /* Skip NOTHING and LONGJMP. */
4212             while ((n = regnext(n))
4213                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4214                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4215                    && off + noff < max)
4216                 off += noff;
4217             if (reg_off_by_arg[OP(scan)])
4218                 ARG(scan) = off;
4219             else
4220                 NEXT_OFF(scan) = off;
4221         }
4222
4223         /* The principal pseudo-switch.  Cannot be a switch, since we
4224            look into several different things.  */
4225         if ( OP(scan) == DEFINEP ) {
4226             SSize_t minlen = 0;
4227             SSize_t deltanext = 0;
4228             SSize_t fake_last_close = 0;
4229             I32 f = SCF_IN_DEFINE;
4230
4231             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4232             scan = regnext(scan);
4233             assert( OP(scan) == IFTHEN );
4234             DEBUG_PEEP("expect IFTHEN", scan, depth);
4235
4236             data_fake.last_closep= &fake_last_close;
4237             minlen = *minlenp;
4238             next = regnext(scan);
4239             scan = NEXTOPER(NEXTOPER(scan));
4240             DEBUG_PEEP("scan", scan, depth);
4241             DEBUG_PEEP("next", next, depth);
4242
4243             /* we suppose the run is continuous, last=next...
4244              * NOTE we dont use the return here! */
4245             (void)study_chunk(pRExC_state, &scan, &minlen,
4246                               &deltanext, next, &data_fake, stopparen,
4247                               recursed_depth, NULL, f, depth+1);
4248
4249             scan = next;
4250         } else
4251         if (
4252             OP(scan) == BRANCH  ||
4253             OP(scan) == BRANCHJ ||
4254             OP(scan) == IFTHEN
4255         ) {
4256             next = regnext(scan);
4257             code = OP(scan);
4258
4259             /* The op(next)==code check below is to see if we
4260              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4261              * IFTHEN is special as it might not appear in pairs.
4262              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4263              * we dont handle it cleanly. */
4264             if (OP(next) == code || code == IFTHEN) {
4265                 /* NOTE - There is similar code to this block below for
4266                  * handling TRIE nodes on a re-study.  If you change stuff here
4267                  * check there too. */
4268                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4269                 regnode_ssc accum;
4270                 regnode * const startbranch=scan;
4271
4272                 if (flags & SCF_DO_SUBSTR) {
4273                     /* Cannot merge strings after this. */
4274                     scan_commit(pRExC_state, data, minlenp, is_inf);
4275                 }
4276
4277                 if (flags & SCF_DO_STCLASS)
4278                     ssc_init_zero(pRExC_state, &accum);
4279
4280                 while (OP(scan) == code) {
4281                     SSize_t deltanext, minnext, fake;
4282                     I32 f = 0;
4283                     regnode_ssc this_class;
4284
4285                     DEBUG_PEEP("Branch", scan, depth);
4286
4287                     num++;
4288                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4289                     if (data) {
4290                         data_fake.whilem_c = data->whilem_c;
4291                         data_fake.last_closep = data->last_closep;
4292                     }
4293                     else
4294                         data_fake.last_closep = &fake;
4295
4296                     data_fake.pos_delta = delta;
4297                     next = regnext(scan);
4298
4299                     scan = NEXTOPER(scan); /* everything */
4300                     if (code != BRANCH)    /* everything but BRANCH */
4301                         scan = NEXTOPER(scan);
4302
4303                     if (flags & SCF_DO_STCLASS) {
4304                         ssc_init(pRExC_state, &this_class);
4305                         data_fake.start_class = &this_class;
4306                         f = SCF_DO_STCLASS_AND;
4307                     }
4308                     if (flags & SCF_WHILEM_VISITED_POS)
4309                         f |= SCF_WHILEM_VISITED_POS;
4310
4311                     /* we suppose the run is continuous, last=next...*/
4312                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4313                                       &deltanext, next, &data_fake, stopparen,
4314                                       recursed_depth, NULL, f,depth+1);
4315
4316                     if (min1 > minnext)
4317                         min1 = minnext;
4318                     if (deltanext == SSize_t_MAX) {
4319                         is_inf = is_inf_internal = 1;
4320                         max1 = SSize_t_MAX;
4321                     } else if (max1 < minnext + deltanext)
4322                         max1 = minnext + deltanext;
4323                     scan = next;
4324                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4325                         pars++;
4326                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4327                         if ( stopmin > minnext)
4328                             stopmin = min + min1;
4329                         flags &= ~SCF_DO_SUBSTR;
4330                         if (data)
4331                             data->flags |= SCF_SEEN_ACCEPT;
4332                     }
4333                     if (data) {
4334                         if (data_fake.flags & SF_HAS_EVAL)
4335                             data->flags |= SF_HAS_EVAL;
4336                         data->whilem_c = data_fake.whilem_c;
4337                     }
4338                     if (flags & SCF_DO_STCLASS)
4339                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4340                 }
4341                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4342                     min1 = 0;
4343                 if (flags & SCF_DO_SUBSTR) {
4344                     data->pos_min += min1;
4345                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4346                         data->pos_delta = SSize_t_MAX;
4347                     else
4348                         data->pos_delta += max1 - min1;
4349                     if (max1 != min1 || is_inf)
4350                         data->longest = &(data->longest_float);
4351                 }
4352                 min += min1;
4353                 if (delta == SSize_t_MAX
4354                  || SSize_t_MAX - delta - (max1 - min1) < 0)
4355                     delta = SSize_t_MAX;
4356                 else
4357                     delta += max1 - min1;
4358                 if (flags & SCF_DO_STCLASS_OR) {
4359                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4360                     if (min1) {
4361                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4362                         flags &= ~SCF_DO_STCLASS;
4363                     }
4364                 }
4365                 else if (flags & SCF_DO_STCLASS_AND) {
4366                     if (min1) {
4367                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4368                         flags &= ~SCF_DO_STCLASS;
4369                     }
4370                     else {
4371                         /* Switch to OR mode: cache the old value of
4372                          * data->start_class */
4373                         INIT_AND_WITHP;
4374                         StructCopy(data->start_class, and_withp, regnode_ssc);
4375                         flags &= ~SCF_DO_STCLASS_AND;
4376                         StructCopy(&accum, data->start_class, regnode_ssc);
4377                         flags |= SCF_DO_STCLASS_OR;
4378                     }
4379                 }
4380
4381                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4382                         OP( startbranch ) == BRANCH )
4383                 {
4384                 /* demq.
4385
4386                    Assuming this was/is a branch we are dealing with: 'scan'
4387                    now points at the item that follows the branch sequence,
4388                    whatever it is. We now start at the beginning of the
4389                    sequence and look for subsequences of
4390
4391                    BRANCH->EXACT=>x1
4392                    BRANCH->EXACT=>x2
4393                    tail
4394
4395                    which would be constructed from a pattern like
4396                    /A|LIST|OF|WORDS/
4397
4398                    If we can find such a subsequence we need to turn the first
4399                    element into a trie and then add the subsequent branch exact
4400                    strings to the trie.
4401
4402                    We have two cases
4403
4404                      1. patterns where the whole set of branches can be
4405                         converted.
4406
4407                      2. patterns where only a subset can be converted.
4408
4409                    In case 1 we can replace the whole set with a single regop
4410                    for the trie. In case 2 we need to keep the start and end
4411                    branches so
4412
4413                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4414                      becomes BRANCH TRIE; BRANCH X;
4415
4416                   There is an additional case, that being where there is a
4417                   common prefix, which gets split out into an EXACT like node
4418                   preceding the TRIE node.
4419
4420                   If x(1..n)==tail then we can do a simple trie, if not we make
4421                   a "jump" trie, such that when we match the appropriate word
4422                   we "jump" to the appropriate tail node. Essentially we turn
4423                   a nested if into a case structure of sorts.
4424
4425                 */
4426
4427                     int made=0;
4428                     if (!re_trie_maxbuff) {
4429                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4430                         if (!SvIOK(re_trie_maxbuff))
4431                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4432                     }
4433                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4434                         regnode *cur;
4435                         regnode *first = (regnode *)NULL;
4436                         regnode *last = (regnode *)NULL;
4437                         regnode *tail = scan;
4438                         U8 trietype = 0;
4439                         U32 count=0;
4440
4441                         /* var tail is used because there may be a TAIL
4442                            regop in the way. Ie, the exacts will point to the
4443                            thing following the TAIL, but the last branch will
4444                            point at the TAIL. So we advance tail. If we
4445                            have nested (?:) we may have to move through several
4446                            tails.
4447                          */
4448
4449                         while ( OP( tail ) == TAIL ) {
4450                             /* this is the TAIL generated by (?:) */
4451                             tail = regnext( tail );
4452                         }
4453
4454
4455                         DEBUG_TRIE_COMPILE_r({
4456                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4457                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4458                               depth+1,
4459                               "Looking for TRIE'able sequences. Tail node is ",
4460                               (UV)(tail - RExC_emit_start),
4461                               SvPV_nolen_const( RExC_mysv )
4462                             );
4463                         });
4464
4465                         /*
4466
4467                             Step through the branches
4468                                 cur represents each branch,
4469                                 noper is the first thing to be matched as part
4470                                       of that branch
4471                                 noper_next is the regnext() of that node.
4472
4473                             We normally handle a case like this
4474                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4475                             support building with NOJUMPTRIE, which restricts
4476                             the trie logic to structures like /FOO|BAR/.
4477
4478                             If noper is a trieable nodetype then the branch is
4479                             a possible optimization target. If we are building
4480                             under NOJUMPTRIE then we require that noper_next is
4481                             the same as scan (our current position in the regex
4482                             program).
4483
4484                             Once we have two or more consecutive such branches
4485                             we can create a trie of the EXACT's contents and
4486                             stitch it in place into the program.
4487
4488                             If the sequence represents all of the branches in
4489                             the alternation we replace the entire thing with a
4490                             single TRIE node.
4491
4492                             Otherwise when it is a subsequence we need to
4493                             stitch it in place and replace only the relevant
4494                             branches. This means the first branch has to remain
4495                             as it is used by the alternation logic, and its
4496                             next pointer, and needs to be repointed at the item
4497                             on the branch chain following the last branch we
4498                             have optimized away.
4499
4500                             This could be either a BRANCH, in which case the
4501                             subsequence is internal, or it could be the item
4502                             following the branch sequence in which case the
4503                             subsequence is at the end (which does not
4504                             necessarily mean the first node is the start of the
4505                             alternation).
4506
4507                             TRIE_TYPE(X) is a define which maps the optype to a
4508                             trietype.
4509
4510                                 optype          |  trietype
4511                                 ----------------+-----------
4512                                 NOTHING         | NOTHING
4513                                 EXACT           | EXACT
4514                                 EXACTFU         | EXACTFU
4515                                 EXACTFU_SS      | EXACTFU
4516                                 EXACTFA         | EXACTFA
4517                                 EXACTL          | EXACTL
4518                                 EXACTFLU8       | EXACTFLU8
4519
4520
4521                         */
4522 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4523                        ? NOTHING                                            \
4524                        : ( EXACT == (X) )                                   \
4525                          ? EXACT                                            \
4526                          : ( EXACTFU == (X) || EXACTFU_SS == (X) )          \
4527                            ? EXACTFU                                        \
4528                            : ( EXACTFA == (X) )                             \
4529                              ? EXACTFA                                      \
4530                              : ( EXACTL == (X) )                            \
4531                                ? EXACTL                                     \
4532                                : ( EXACTFLU8 == (X) )                        \
4533                                  ? EXACTFLU8                                 \
4534                                  : 0 )
4535
4536                         /* dont use tail as the end marker for this traverse */
4537                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4538                             regnode * const noper = NEXTOPER( cur );
4539                             U8 noper_type = OP( noper );
4540                             U8 noper_trietype = TRIE_TYPE( noper_type );
4541 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4542                             regnode * const noper_next = regnext( noper );
4543                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4544                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4545 #endif
4546
4547                             DEBUG_TRIE_COMPILE_r({
4548                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4549                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
4550                                    depth+1,
4551                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4552
4553                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4554                                 Perl_re_printf( aTHX_  " -> %d:%s",
4555                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4556
4557                                 if ( noper_next ) {
4558                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4559                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4560                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4561                                 }
4562                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4563                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4564                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4565                                 );
4566                             });
4567
4568                             /* Is noper a trieable nodetype that can be merged
4569                              * with the current trie (if there is one)? */
4570                             if ( noper_trietype
4571                                   &&
4572                                   (
4573                                         ( noper_trietype == NOTHING )
4574                                         || ( trietype == NOTHING )
4575                                         || ( trietype == noper_trietype )
4576                                   )
4577 #ifdef NOJUMPTRIE
4578                                   && noper_next >= tail
4579 #endif
4580                                   && count < U16_MAX)
4581                             {
4582                                 /* Handle mergable triable node Either we are
4583                                  * the first node in a new trieable sequence,
4584                                  * in which case we do some bookkeeping,
4585                                  * otherwise we update the end pointer. */
4586                                 if ( !first ) {
4587                                     first = cur;
4588                                     if ( noper_trietype == NOTHING ) {
4589 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4590                                         regnode * const noper_next = regnext( noper );
4591                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4592                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4593 #endif
4594
4595                                         if ( noper_next_trietype ) {
4596                                             trietype = noper_next_trietype;
4597                                         } else if (noper_next_type)  {
4598                                             /* a NOTHING regop is 1 regop wide.
4599                                              * We need at least two for a trie
4600                                              * so we can't merge this in */
4601                                             first = NULL;
4602                                         }
4603                                     } else {
4604                                         trietype = noper_trietype;
4605                                     }
4606                                 } else {
4607                                     if ( trietype == NOTHING )
4608                                         trietype = noper_trietype;
4609                                     last = cur;
4610                                 }
4611                                 if (first)
4612                                     count++;
4613                             } /* end handle mergable triable node */
4614                             else {
4615                                 /* handle unmergable node -
4616                                  * noper may either be a triable node which can
4617                                  * not be tried together with the current trie,
4618                                  * or a non triable node */
4619                                 if ( last ) {
4620                                     /* If last is set and trietype is not
4621                                      * NOTHING then we have found at least two
4622                                      * triable branch sequences in a row of a
4623                                      * similar trietype so we can turn them
4624                                      * into a trie. If/when we allow NOTHING to
4625                                      * start a trie sequence this condition
4626                                      * will be required, and it isn't expensive
4627                                      * so we leave it in for now. */
4628                                     if ( trietype && trietype != NOTHING )
4629                                         make_trie( pRExC_state,
4630                                                 startbranch, first, cur, tail,
4631                                                 count, trietype, depth+1 );
4632                                     last = NULL; /* note: we clear/update
4633                                                     first, trietype etc below,
4634                                                     so we dont do it here */
4635                                 }
4636                                 if ( noper_trietype
4637 #ifdef NOJUMPTRIE
4638                                      && noper_next >= tail
4639 #endif
4640                                 ){
4641                                     /* noper is triable, so we can start a new
4642                                      * trie sequence */
4643                                     count = 1;
4644                                     first = cur;
4645                                     trietype = noper_trietype;
4646                                 } else if (first) {
4647                                     /* if we already saw a first but the
4648                                      * current node is not triable then we have
4649                                      * to reset the first information. */
4650                                     count = 0;
4651                                     first = NULL;
4652                                     trietype = 0;
4653                                 }
4654                             } /* end handle unmergable node */
4655                         } /* loop over branches */
4656                         DEBUG_TRIE_COMPILE_r({
4657                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4658                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
4659                               depth+1, SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4660                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
4661                                REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4662                                PL_reg_name[trietype]
4663                             );
4664
4665                         });
4666                         if ( last && trietype ) {
4667                             if ( trietype != NOTHING ) {
4668                                 /* the last branch of the sequence was part of
4669                                  * a trie, so we have to construct it here
4670                                  * outside of the loop */
4671                                 made= make_trie( pRExC_state, startbranch,
4672                                                  first, scan, tail, count,
4673                                                  trietype, depth+1 );
4674 #ifdef TRIE_STUDY_OPT
4675                                 if ( ((made == MADE_EXACT_TRIE &&
4676                                      startbranch == first)
4677                                      || ( first_non_open == first )) &&
4678                                      depth==0 ) {
4679                                     flags |= SCF_TRIE_RESTUDY;
4680                                     if ( startbranch == first
4681                                          && scan >= tail )
4682                                     {
4683                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4684                                     }
4685                                 }
4686 #endif
4687                             } else {
4688                                 /* at this point we know whatever we have is a
4689                                  * NOTHING sequence/branch AND if 'startbranch'
4690                                  * is 'first' then we can turn the whole thing
4691                                  * into a NOTHING
4692                                  */
4693                                 if ( startbranch == first ) {
4694                                     regnode *opt;
4695                                     /* the entire thing is a NOTHING sequence,
4696                                      * something like this: (?:|) So we can
4697                                      * turn it into a plain NOTHING op. */
4698                                     DEBUG_TRIE_COMPILE_r({
4699                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4700                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
4701                                           depth+1,
4702                                           SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4703
4704                                     });
4705                                     OP(startbranch)= NOTHING;
4706                                     NEXT_OFF(startbranch)= tail - startbranch;
4707                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4708                                         OP(opt)= OPTIMIZED;
4709                                 }
4710                             }
4711                         } /* end if ( last) */
4712                     } /* TRIE_MAXBUF is non zero */
4713
4714                 } /* do trie */
4715
4716             }
4717             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4718                 scan = NEXTOPER(NEXTOPER(scan));
4719             } else                      /* single branch is optimized. */
4720                 scan = NEXTOPER(scan);
4721             continue;
4722         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
4723             I32 paren = 0;
4724             regnode *start = NULL;
4725             regnode *end = NULL;
4726             U32 my_recursed_depth= recursed_depth;
4727
4728             if (OP(scan) != SUSPEND) { /* GOSUB */
4729                 /* Do setup, note this code has side effects beyond
4730                  * the rest of this block. Specifically setting
4731                  * RExC_recurse[] must happen at least once during
4732                  * study_chunk(). */
4733                 paren = ARG(scan);
4734                 RExC_recurse[ARG2L(scan)] = scan;
4735                 start = RExC_open_parens[paren];
4736                 end   = RExC_close_parens[paren];
4737
4738                 /* NOTE we MUST always execute the above code, even
4739                  * if we do nothing with a GOSUB */
4740                 if (
4741                     ( flags & SCF_IN_DEFINE )
4742                     ||
4743                     (
4744                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4745                         &&
4746                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4747                     )
4748                 ) {
4749                     /* no need to do anything here if we are in a define. */
4750                     /* or we are after some kind of infinite construct
4751                      * so we can skip recursing into this item.
4752                      * Since it is infinite we will not change the maxlen
4753                      * or delta, and if we miss something that might raise
4754                      * the minlen it will merely pessimise a little.
4755                      *
4756                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4757                      * might result in a minlen of 1 and not of 4,
4758                      * but this doesn't make us mismatch, just try a bit
4759                      * harder than we should.
4760                      * */
4761                     scan= regnext(scan);
4762                     continue;
4763                 }
4764
4765                 if (
4766                     !recursed_depth
4767                     ||
4768                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4769                 ) {
4770                     /* it is quite possible that there are more efficient ways
4771                      * to do this. We maintain a bitmap per level of recursion
4772                      * of which patterns we have entered so we can detect if a
4773                      * pattern creates a possible infinite loop. When we
4774                      * recurse down a level we copy the previous levels bitmap
4775                      * down. When we are at recursion level 0 we zero the top
4776                      * level bitmap. It would be nice to implement a different
4777                      * more efficient way of doing this. In particular the top
4778                      * level bitmap may be unnecessary.
4779                      */
4780                     if (!recursed_depth) {
4781                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4782                     } else {
4783                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4784                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4785                              RExC_study_chunk_recursed_bytes, U8);
4786                     }
4787                     /* we havent recursed into this paren yet, so recurse into it */
4788                     DEBUG_STUDYDATA("gosub-set:", data,depth);
4789                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4790                     my_recursed_depth= recursed_depth + 1;
4791                 } else {
4792                     DEBUG_STUDYDATA("gosub-inf:", data,depth);
4793                     /* some form of infinite recursion, assume infinite length
4794                      * */
4795                     if (flags & SCF_DO_SUBSTR) {
4796                         scan_commit(pRExC_state, data, minlenp, is_inf);
4797                         data->longest = &(data->longest_float);
4798                     }
4799                     is_inf = is_inf_internal = 1;
4800                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4801                         ssc_anything(data->start_class);
4802                     flags &= ~SCF_DO_STCLASS;
4803
4804                     start= NULL; /* reset start so we dont recurse later on. */
4805                 }
4806             } else {
4807                 paren = stopparen;
4808                 start = scan + 2;
4809                 end = regnext(scan);
4810             }
4811             if (start) {
4812                 scan_frame *newframe;
4813                 assert(end);
4814                 if (!RExC_frame_last) {
4815                     Newxz(newframe, 1, scan_frame);
4816                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4817                     RExC_frame_head= newframe;
4818                     RExC_frame_count++;
4819                 } else if (!RExC_frame_last->next_frame) {
4820                     Newxz(newframe,1,scan_frame);
4821                     RExC_frame_last->next_frame= newframe;
4822                     newframe->prev_frame= RExC_frame_last;
4823                     RExC_frame_count++;
4824                 } else {
4825                     newframe= RExC_frame_last->next_frame;
4826                 }
4827                 RExC_frame_last= newframe;
4828
4829                 newframe->next_regnode = regnext(scan);
4830                 newframe->last_regnode = last;
4831                 newframe->stopparen = stopparen;
4832                 newframe->prev_recursed_depth = recursed_depth;
4833                 newframe->this_prev_frame= frame;
4834
4835                 DEBUG_STUDYDATA("frame-new:",data,depth);
4836                 DEBUG_PEEP("fnew", scan, depth);
4837
4838                 frame = newframe;
4839                 scan =  start;
4840                 stopparen = paren;
4841                 last = end;
4842                 depth = depth + 1;
4843                 recursed_depth= my_recursed_depth;
4844
4845                 continue;
4846             }
4847         }
4848         else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4849             SSize_t l = STR_LEN(scan);
4850             UV uc;
4851             if (UTF) {
4852                 const U8 * const s = (U8*)STRING(scan);
4853                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4854                 l = utf8_length(s, s + l);
4855             } else {
4856                 uc = *((U8*)STRING(scan));
4857             }
4858             min += l;
4859             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4860                 /* The code below prefers earlier match for fixed
4861                    offset, later match for variable offset.  */
4862                 if (data->last_end == -1) { /* Update the start info. */
4863                     data->last_start_min = data->pos_min;
4864                     data->last_start_max = is_inf
4865                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4866                 }
4867                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4868                 if (UTF)
4869                     SvUTF8_on(data->last_found);
4870                 {
4871                     SV * const sv = data->last_found;
4872                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4873                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4874                     if (mg && mg->mg_len >= 0)
4875                         mg->mg_len += utf8_length((U8*)STRING(scan),
4876                                               (U8*)STRING(scan)+STR_LEN(scan));
4877                 }
4878                 data->last_end = data->pos_min + l;
4879                 data->pos_min += l; /* As in the first entry. */
4880                 data->flags &= ~SF_BEFORE_EOL;
4881             }
4882
4883             /* ANDing the code point leaves at most it, and not in locale, and
4884              * can't match null string */
4885             if (flags & SCF_DO_STCLASS_AND) {
4886                 ssc_cp_and(data->start_class, uc);
4887                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4888                 ssc_clear_locale(data->start_class);
4889             }
4890             else if (flags & SCF_DO_STCLASS_OR) {
4891                 ssc_add_cp(data->start_class, uc);
4892                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4893
4894                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4895                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4896             }
4897             flags &= ~SCF_DO_STCLASS;
4898         }
4899         else if (PL_regkind[OP(scan)] == EXACT) {
4900             /* But OP != EXACT!, so is EXACTFish */
4901             SSize_t l = STR_LEN(scan);
4902             const U8 * s = (U8*)STRING(scan);
4903
4904             /* Search for fixed substrings supports EXACT only. */
4905             if (flags & SCF_DO_SUBSTR) {
4906                 assert(data);
4907                 scan_commit(pRExC_state, data, minlenp, is_inf);
4908             }
4909             if (UTF) {
4910                 l = utf8_length(s, s + l);
4911             }
4912             if (unfolded_multi_char) {
4913                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4914             }
4915             min += l - min_subtract;
4916             assert (min >= 0);
4917             delta += min_subtract;
4918             if (flags & SCF_DO_SUBSTR) {
4919                 data->pos_min += l - min_subtract;
4920                 if (data->pos_min < 0) {
4921                     data->pos_min = 0;
4922                 }
4923                 data->pos_delta += min_subtract;
4924                 if (min_subtract) {
4925                     data->longest = &(data->longest_float);
4926                 }
4927             }
4928
4929             if (flags & SCF_DO_STCLASS) {
4930                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4931
4932                 assert(EXACTF_invlist);
4933                 if (flags & SCF_DO_STCLASS_AND) {
4934                     if (OP(scan) != EXACTFL)
4935                         ssc_clear_locale(data->start_class);
4936                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4937                     ANYOF_POSIXL_ZERO(data->start_class);
4938                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4939                 }
4940                 else {  /* SCF_DO_STCLASS_OR */
4941                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4942                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4943
4944                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4945                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4946                 }
4947                 flags &= ~SCF_DO_STCLASS;
4948                 SvREFCNT_dec(EXACTF_invlist);
4949             }
4950         }
4951         else if (REGNODE_VARIES(OP(scan))) {
4952             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4953             I32 fl = 0, f = flags;
4954             regnode * const oscan = scan;
4955             regnode_ssc this_class;
4956             regnode_ssc *oclass = NULL;
4957             I32 next_is_eval = 0;
4958
4959             switch (PL_regkind[OP(scan)]) {
4960             case WHILEM:                /* End of (?:...)* . */
4961                 scan = NEXTOPER(scan);
4962                 goto finish;
4963             case PLUS:
4964                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4965                     next = NEXTOPER(scan);
4966                     if (OP(next) == EXACT
4967                         || OP(next) == EXACTL
4968                         || (flags & SCF_DO_STCLASS))
4969                     {
4970                         mincount = 1;
4971                         maxcount = REG_INFTY;
4972                         next = regnext(scan);
4973                         scan = NEXTOPER(scan);
4974                         goto do_curly;
4975                     }
4976                 }
4977                 if (flags & SCF_DO_SUBSTR)
4978                     data->pos_min++;
4979                 min++;
4980                 /* FALLTHROUGH */
4981             case STAR:
4982                 if (flags & SCF_DO_STCLASS) {
4983                     mincount = 0;
4984                     maxcount = REG_INFTY;
4985                     next = regnext(scan);
4986                     scan = NEXTOPER(scan);
4987                     goto do_curly;
4988                 }
4989                 if (flags & SCF_DO_SUBSTR) {
4990                     scan_commit(pRExC_state, data, minlenp, is_inf);
4991                     /* Cannot extend fixed substrings */
4992                     data->longest = &(data->longest_float);
4993                 }
4994                 is_inf = is_inf_internal = 1;
4995                 scan = regnext(scan);
4996                 goto optimize_curly_tail;
4997             case CURLY:
4998                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4999                     && (scan->flags == stopparen))
5000                 {
5001                     mincount = 1;
5002                     maxcount = 1;
5003                 } else {
5004                     mincount = ARG1(scan);
5005                     maxcount = ARG2(scan);
5006                 }
5007                 next = regnext(scan);
5008                 if (OP(scan) == CURLYX) {
5009                     I32 lp = (data ? *(data->last_closep) : 0);
5010                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5011                 }
5012                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5013                 next_is_eval = (OP(scan) == EVAL);
5014               do_curly:
5015                 if (flags & SCF_DO_SUBSTR) {
5016                     if (mincount == 0)
5017                         scan_commit(pRExC_state, data, minlenp, is_inf);
5018                     /* Cannot extend fixed substrings */
5019                     pos_before = data->pos_min;
5020                 }
5021                 if (data) {
5022                     fl = data->flags;
5023                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5024                     if (is_inf)
5025                         data->flags |= SF_IS_INF;
5026                 }
5027                 if (flags & SCF_DO_STCLASS) {
5028                     ssc_init(pRExC_state, &this_class);
5029                     oclass = data->start_class;
5030                     data->start_class = &this_class;
5031                     f |= SCF_DO_STCLASS_AND;
5032                     f &= ~SCF_DO_STCLASS_OR;
5033                 }
5034                 /* Exclude from super-linear cache processing any {n,m}
5035                    regops for which the combination of input pos and regex
5036                    pos is not enough information to determine if a match
5037                    will be possible.
5038
5039                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
5040                    regex pos at the \s*, the prospects for a match depend not
5041                    only on the input position but also on how many (bar\s*)
5042                    repeats into the {4,8} we are. */
5043                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5044                     f &= ~SCF_WHILEM_VISITED_POS;
5045
5046                 /* This will finish on WHILEM, setting scan, or on NULL: */
5047                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5048                                   last, data, stopparen, recursed_depth, NULL,
5049                                   (mincount == 0
5050                                    ? (f & ~SCF_DO_SUBSTR)
5051                                    : f)
5052                                   ,depth+1);
5053
5054                 if (flags & SCF_DO_STCLASS)
5055                     data->start_class = oclass;
5056                 if (mincount == 0 || minnext == 0) {
5057                     if (flags & SCF_DO_STCLASS_OR) {
5058                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5059                     }
5060                     else if (flags & SCF_DO_STCLASS_AND) {
5061                         /* Switch to OR mode: cache the old value of
5062                          * data->start_class */
5063                         INIT_AND_WITHP;
5064                         StructCopy(data->start_class, and_withp, regnode_ssc);
5065                         flags &= ~SCF_DO_STCLASS_AND;
5066                         StructCopy(&this_class, data->start_class, regnode_ssc);
5067                         flags |= SCF_DO_STCLASS_OR;
5068                         ANYOF_FLAGS(data->start_class)
5069                                                 |= SSC_MATCHES_EMPTY_STRING;
5070                     }
5071                 } else {                /* Non-zero len */
5072                     if (flags & SCF_DO_STCLASS_OR) {
5073                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5074                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5075                     }
5076                     else if (flags & SCF_DO_STCLASS_AND)
5077                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5078                     flags &= ~SCF_DO_STCLASS;
5079                 }
5080                 if (!scan)              /* It was not CURLYX, but CURLY. */
5081                     scan = next;
5082                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
5083                     /* ? quantifier ok, except for (?{ ... }) */
5084                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
5085                     && (minnext == 0) && (deltanext == 0)
5086                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5087                     && maxcount <= REG_INFTY/3) /* Complement check for big
5088                                                    count */
5089                 {
5090                     /* Fatal warnings may leak the regexp without this: */
5091                     SAVEFREESV(RExC_rx_sv);
5092                     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5093                         "Quantifier unexpected on zero-length expression "
5094                         "in regex m/%" UTF8f "/",
5095                          UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5096                                   RExC_precomp));
5097                     (void)ReREFCNT_inc(RExC_rx_sv);
5098                 }
5099
5100                 min += minnext * mincount;
5101                 is_inf_internal |= deltanext == SSize_t_MAX
5102                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5103                 is_inf |= is_inf_internal;
5104                 if (is_inf) {
5105                     delta = SSize_t_MAX;
5106                 } else {
5107                     delta += (minnext + deltanext) * maxcount
5108                              - minnext * mincount;
5109                 }
5110                 /* Try powerful optimization CURLYX => CURLYN. */
5111                 if (  OP(oscan) == CURLYX && data
5112                       && data->flags & SF_IN_PAR
5113                       && !(data->flags & SF_HAS_EVAL)
5114                       && !deltanext && minnext == 1 ) {
5115                     /* Try to optimize to CURLYN.  */
5116                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5117                     regnode * const nxt1 = nxt;
5118 #ifdef DEBUGGING
5119                     regnode *nxt2;
5120 #endif
5121
5122                     /* Skip open. */
5123                     nxt = regnext(nxt);
5124                     if (!REGNODE_SIMPLE(OP(nxt))
5125                         && !(PL_regkind[OP(nxt)] == EXACT
5126                              && STR_LEN(nxt) == 1))
5127                         goto nogo;
5128 #ifdef DEBUGGING
5129                     nxt2 = nxt;
5130 #endif
5131                     nxt = regnext(nxt);
5132                     if (OP(nxt) != CLOSE)
5133                         goto nogo;
5134                     if (RExC_open_parens) {
5135                         RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5136                         RExC_close_parens[ARG(nxt1)]=nxt+2; /*close->while*/
5137                     }
5138                     /* Now we know that nxt2 is the only contents: */
5139                     oscan->flags = (U8)ARG(nxt);
5140                     OP(oscan) = CURLYN;
5141                     OP(nxt1) = NOTHING; /* was OPEN. */
5142
5143 #ifdef DEBUGGING
5144                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5145                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5146                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5147                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
5148                     OP(nxt + 1) = OPTIMIZED; /* was count. */
5149                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5150 #endif
5151                 }
5152               nogo:
5153
5154                 /* Try optimization CURLYX => CURLYM. */
5155                 if (  OP(oscan) == CURLYX && data
5156                       && !(data->flags & SF_HAS_PAR)
5157                       && !(data->flags & SF_HAS_EVAL)
5158                       && !deltanext     /* atom is fixed width */
5159                       && minnext != 0   /* CURLYM can't handle zero width */
5160
5161                          /* Nor characters whose fold at run-time may be
5162                           * multi-character */
5163                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5164                 ) {
5165                     /* XXXX How to optimize if data == 0? */
5166                     /* Optimize to a simpler form.  */
5167                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5168                     regnode *nxt2;
5169
5170                     OP(oscan) = CURLYM;
5171                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5172                             && (OP(nxt2) != WHILEM))
5173                         nxt = nxt2;
5174                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5175                     /* Need to optimize away parenths. */
5176                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5177                         /* Set the parenth number.  */
5178                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5179
5180                         oscan->flags = (U8)ARG(nxt);
5181                         if (RExC_open_parens) {
5182                             RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5183                             RExC_close_parens[ARG(nxt1)]=nxt2+1; /*close->NOTHING*/
5184                         }
5185                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
5186                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
5187
5188 #ifdef DEBUGGING
5189                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5190                         OP(nxt + 1) = OPTIMIZED; /* was count. */
5191                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5192                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5193 #endif
5194 #if 0
5195                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
5196                             regnode *nnxt = regnext(nxt1);
5197                             if (nnxt == nxt) {
5198                                 if (reg_off_by_arg[OP(nxt1)])
5199                                     ARG_SET(nxt1, nxt2 - nxt1);
5200                                 else if (nxt2 - nxt1 < U16_MAX)
5201                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
5202                                 else
5203                                     OP(nxt) = NOTHING;  /* Cannot beautify */
5204                             }
5205                             nxt1 = nnxt;
5206                         }
5207 #endif
5208                         /* Optimize again: */
5209                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5210                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
5211                     }
5212                     else
5213                         oscan->flags = 0;
5214                 }
5215                 else if ((OP(oscan) == CURLYX)
5216                          && (flags & SCF_WHILEM_VISITED_POS)
5217                          /* See the comment on a similar expression above.
5218                             However, this time it's not a subexpression
5219                             we care about, but the expression itself. */
5220                          && (maxcount == REG_INFTY)
5221                          && data) {
5222                     /* This stays as CURLYX, we can put the count/of pair. */
5223                     /* Find WHILEM (as in regexec.c) */
5224                     regnode *nxt = oscan + NEXT_OFF(oscan);
5225
5226                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5227                         nxt += ARG(nxt);
5228                     nxt = PREVOPER(nxt);
5229                     if (nxt->flags & 0xf) {
5230                         /* we've already set whilem count on this node */
5231                     } else if (++data->whilem_c < 16) {
5232                         assert(data->whilem_c <= RExC_whilem_seen);
5233                         nxt->flags = (U8)(data->whilem_c
5234                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5235                     }
5236                 }
5237                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5238                     pars++;
5239                 if (flags & SCF_DO_SUBSTR) {
5240                     SV *last_str = NULL;
5241                     STRLEN last_chrs = 0;
5242                     int counted = mincount != 0;
5243
5244                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5245                                                                   string. */
5246                         SSize_t b = pos_before >= data->last_start_min
5247                             ? pos_before : data->last_start_min;
5248                         STRLEN l;
5249                         const char * const s = SvPV_const(data->last_found, l);
5250                         SSize_t old = b - data->last_start_min;
5251
5252                         if (UTF)
5253                             old = utf8_hop((U8*)s, old) - (U8*)s;
5254                         l -= old;
5255                         /* Get the added string: */
5256                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5257                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5258                                             (U8*)(s + old + l)) : l;
5259                         if (deltanext == 0 && pos_before == b) {
5260                             /* What was added is a constant string */
5261                             if (mincount > 1) {
5262
5263                                 SvGROW(last_str, (mincount * l) + 1);
5264                                 repeatcpy(SvPVX(last_str) + l,
5265                                           SvPVX_const(last_str), l,
5266                                           mincount - 1);
5267                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5268                                 /* Add additional parts. */
5269                                 SvCUR_set(data->last_found,
5270                                           SvCUR(data->last_found) - l);
5271                                 sv_catsv(data->last_found, last_str);
5272                                 {
5273                                     SV * sv = data->last_found;
5274                                     MAGIC *mg =
5275                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5276                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5277                                     if (mg && mg->mg_len >= 0)
5278                                         mg->mg_len += last_chrs * (mincount-1);
5279                                 }
5280                                 last_chrs *= mincount;
5281                                 data->last_end += l * (mincount - 1);
5282                             }
5283                         } else {
5284                             /* start offset must point into the last copy */
5285                             data->last_start_min += minnext * (mincount - 1);
5286                             data->last_start_max =
5287                               is_inf
5288                                ? SSize_t_MAX
5289                                : data->last_start_max +
5290                                  (maxcount - 1) * (minnext + data->pos_delta);
5291                         }
5292                     }
5293                     /* It is counted once already... */
5294                     data->pos_min += minnext * (mincount - counted);
5295 #if 0
5296 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5297                               " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5298                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5299     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5300     (UV)mincount);
5301 if (deltanext != SSize_t_MAX)
5302 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5303     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5304           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5305 #endif
5306                     if (deltanext == SSize_t_MAX
5307                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5308                         data->pos_delta = SSize_t_MAX;
5309                     else
5310                         data->pos_delta += - counted * deltanext +
5311                         (minnext + deltanext) * maxcount - minnext * mincount;
5312                     if (mincount != maxcount) {
5313                          /* Cannot extend fixed substrings found inside
5314                             the group.  */
5315                         scan_commit(pRExC_state, data, minlenp, is_inf);
5316                         if (mincount && last_str) {
5317                             SV * const sv = data->last_found;
5318                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5319                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5320
5321                             if (mg)
5322                                 mg->mg_len = -1;
5323                             sv_setsv(sv, last_str);
5324                             data->last_end = data->pos_min;
5325                             data->last_start_min = data->pos_min - last_chrs;
5326                             data->last_start_max = is_inf
5327                                 ? SSize_t_MAX
5328                                 : data->pos_min + data->pos_delta - last_chrs;
5329                         }
5330                         data->longest = &(data->longest_float);
5331                     }
5332                     SvREFCNT_dec(last_str);
5333                 }
5334                 if (data && (fl & SF_HAS_EVAL))
5335                     data->flags |= SF_HAS_EVAL;
5336               optimize_curly_tail:
5337                 if (OP(oscan) != CURLYX) {
5338                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5339                            && NEXT_OFF(next))
5340                         NEXT_OFF(oscan) += NEXT_OFF(next);
5341                 }
5342                 continue;
5343
5344             default:
5345 #ifdef DEBUGGING
5346                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5347                                                                     OP(scan));
5348 #endif
5349             case REF:
5350             case CLUMP:
5351                 if (flags & SCF_DO_SUBSTR) {
5352                     /* Cannot expect anything... */
5353                     scan_commit(pRExC_state, data, minlenp, is_inf);
5354                     data->longest = &(data->longest_float);
5355                 }
5356                 is_inf = is_inf_internal = 1;
5357                 if (flags & SCF_DO_STCLASS_OR) {
5358                     if (OP(scan) == CLUMP) {
5359                         /* Actually is any start char, but very few code points
5360                          * aren't start characters */
5361                         ssc_match_all_cp(data->start_class);
5362                     }
5363                     else {
5364                         ssc_anything(data->start_class);
5365                     }
5366                 }
5367                 flags &= ~SCF_DO_STCLASS;
5368                 break;
5369             }
5370         }
5371         else if (OP(scan) == LNBREAK) {
5372             if (flags & SCF_DO_STCLASS) {
5373                 if (flags & SCF_DO_STCLASS_AND) {
5374                     ssc_intersection(data->start_class,
5375                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5376                     ssc_clear_locale(data->start_class);
5377                     ANYOF_FLAGS(data->start_class)
5378                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5379                 }
5380                 else if (flags & SCF_DO_STCLASS_OR) {
5381                     ssc_union(data->start_class,
5382                               PL_XPosix_ptrs[_CC_VERTSPACE],
5383                               FALSE);
5384                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5385
5386                     /* See commit msg for
5387                      * 749e076fceedeb708a624933726e7989f2302f6a */
5388                     ANYOF_FLAGS(data->start_class)
5389                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5390                 }
5391                 flags &= ~SCF_DO_STCLASS;
5392             }
5393             min++;
5394             if (delta != SSize_t_MAX)
5395                 delta++;    /* Because of the 2 char string cr-lf */
5396             if (flags & SCF_DO_SUBSTR) {
5397                 /* Cannot expect anything... */
5398                 scan_commit(pRExC_state, data, minlenp, is_inf);
5399                 data->pos_min += 1;
5400                 data->pos_delta += 1;
5401                 data->longest = &(data->longest_float);
5402             }
5403         }
5404         else if (REGNODE_SIMPLE(OP(scan))) {
5405
5406             if (flags & SCF_DO_SUBSTR) {
5407                 scan_commit(pRExC_state, data, minlenp, is_inf);
5408                 data->pos_min++;
5409             }
5410             min++;
5411             if (flags & SCF_DO_STCLASS) {
5412                 bool invert = 0;
5413                 SV* my_invlist = NULL;
5414                 U8 namedclass;
5415
5416                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5417                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5418
5419                 /* Some of the logic below assumes that switching
5420                    locale on will only add false positives. */
5421                 switch (OP(scan)) {
5422
5423                 default:
5424 #ifdef DEBUGGING
5425                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5426                                                                      OP(scan));
5427 #endif
5428                 case SANY:
5429                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5430                         ssc_match_all_cp(data->start_class);
5431                     break;
5432
5433                 case REG_ANY:
5434                     {
5435                         SV* REG_ANY_invlist = _new_invlist(2);
5436                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5437                                                             '\n');
5438                         if (flags & SCF_DO_STCLASS_OR) {
5439                             ssc_union(data->start_class,
5440                                       REG_ANY_invlist,
5441                                       TRUE /* TRUE => invert, hence all but \n
5442                                             */
5443                                       );
5444                         }
5445                         else if (flags & SCF_DO_STCLASS_AND) {
5446                             ssc_intersection(data->start_class,
5447                                              REG_ANY_invlist,
5448                                              TRUE  /* TRUE => invert */
5449                                              );
5450                             ssc_clear_locale(data->start_class);
5451                         }
5452                         SvREFCNT_dec_NN(REG_ANY_invlist);
5453                     }
5454                     break;
5455
5456                 case ANYOFD:
5457                 case ANYOFL:
5458                 case ANYOF:
5459                     if (flags & SCF_DO_STCLASS_AND)
5460                         ssc_and(pRExC_state, data->start_class,
5461                                 (regnode_charclass *) scan);
5462                     else
5463                         ssc_or(pRExC_state, data->start_class,
5464                                                           (regnode_charclass *) scan);
5465                     break;
5466
5467                 case NPOSIXL:
5468                     invert = 1;
5469                     /* FALLTHROUGH */
5470
5471                 case POSIXL:
5472                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5473                     if (flags & SCF_DO_STCLASS_AND) {
5474                         bool was_there = cBOOL(
5475                                           ANYOF_POSIXL_TEST(data->start_class,
5476                                                                  namedclass));
5477                         ANYOF_POSIXL_ZERO(data->start_class);
5478                         if (was_there) {    /* Do an AND */
5479                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5480                         }
5481                         /* No individual code points can now match */
5482                         data->start_class->invlist
5483                                                 = sv_2mortal(_new_invlist(0));
5484                     }
5485                     else {
5486                         int complement = namedclass + ((invert) ? -1 : 1);
5487
5488                         assert(flags & SCF_DO_STCLASS_OR);
5489
5490                         /* If the complement of this class was already there,
5491                          * the result is that they match all code points,
5492                          * (\d + \D == everything).  Remove the classes from
5493                          * future consideration.  Locale is not relevant in
5494                          * this case */
5495                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5496                             ssc_match_all_cp(data->start_class);
5497                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5498                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5499                         }
5500                         else {  /* The usual case; just add this class to the
5501                                    existing set */
5502                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5503                         }
5504                     }
5505                     break;
5506
5507                 case NPOSIXA:   /* For these, we always know the exact set of
5508                                    what's matched */
5509                     invert = 1;
5510                     /* FALLTHROUGH */
5511                 case POSIXA:
5512                     if (FLAGS(scan) == _CC_ASCII) {
5513                         my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5514                     }
5515                     else {
5516                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5517                                               PL_XPosix_ptrs[_CC_ASCII],
5518                                               &my_invlist);
5519                     }
5520                     goto join_posix;
5521
5522                 case NPOSIXD:
5523                 case NPOSIXU:
5524                     invert = 1;
5525                     /* FALLTHROUGH */
5526                 case POSIXD:
5527                 case POSIXU:
5528                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5529
5530                     /* NPOSIXD matches all upper Latin1 code points unless the
5531                      * target string being matched is UTF-8, which is
5532                      * unknowable until match time.  Since we are going to
5533                      * invert, we want to get rid of all of them so that the
5534                      * inversion will match all */
5535                     if (OP(scan) == NPOSIXD) {
5536                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5537                                           &my_invlist);
5538                     }
5539
5540                   join_posix:
5541
5542                     if (flags & SCF_DO_STCLASS_AND) {
5543                         ssc_intersection(data->start_class, my_invlist, invert);
5544                         ssc_clear_locale(data->start_class);
5545                     }
5546                     else {
5547                         assert(flags & SCF_DO_STCLASS_OR);
5548                         ssc_union(data->start_class, my_invlist, invert);
5549                     }
5550                     SvREFCNT_dec(my_invlist);
5551                 }
5552                 if (flags & SCF_DO_STCLASS_OR)
5553                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5554                 flags &= ~SCF_DO_STCLASS;
5555             }
5556         }
5557         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5558             data->flags |= (OP(scan) == MEOL
5559                             ? SF_BEFORE_MEOL
5560                             : SF_BEFORE_SEOL);
5561             scan_commit(pRExC_state, data, minlenp, is_inf);
5562
5563         }
5564         else if (  PL_regkind[OP(scan)] == BRANCHJ
5565                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5566                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5567                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5568         {
5569             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5570                 || OP(scan) == UNLESSM )
5571             {
5572                 /* Negative Lookahead/lookbehind
5573                    In this case we can't do fixed string optimisation.
5574                 */
5575
5576                 SSize_t deltanext, minnext, fake = 0;
5577                 regnode *nscan;
5578                 regnode_ssc intrnl;
5579                 int f = 0;
5580
5581                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5582                 if (data) {
5583                     data_fake.whilem_c = data->whilem_c;
5584                     data_fake.last_closep = data->last_closep;
5585                 }
5586                 else
5587                     data_fake.last_closep = &fake;
5588                 data_fake.pos_delta = delta;
5589                 if ( flags & SCF_DO_STCLASS && !scan->flags
5590                      && OP(scan) == IFMATCH ) { /* Lookahead */
5591                     ssc_init(pRExC_state, &intrnl);
5592                     data_fake.start_class = &intrnl;
5593                     f |= SCF_DO_STCLASS_AND;
5594                 }
5595                 if (flags & SCF_WHILEM_VISITED_POS)
5596                     f |= SCF_WHILEM_VISITED_POS;
5597                 next = regnext(scan);
5598                 nscan = NEXTOPER(NEXTOPER(scan));
5599                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5600                                       last, &data_fake, stopparen,
5601                                       recursed_depth, NULL, f, depth+1);
5602                 if (scan->flags) {
5603                     if (deltanext) {
5604                         FAIL("Variable length lookbehind not implemented");
5605                     }
5606                     else if (minnext > (I32)U8_MAX) {
5607                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
5608                               (UV)U8_MAX);
5609                     }
5610                     scan->flags = (U8)minnext;
5611                 }
5612                 if (data) {
5613                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5614                         pars++;
5615                     if (data_fake.flags & SF_HAS_EVAL)
5616                         data->flags |= SF_HAS_EVAL;
5617                     data->whilem_c = data_fake.whilem_c;
5618                 }
5619                 if (f & SCF_DO_STCLASS_AND) {
5620                     if (flags & SCF_DO_STCLASS_OR) {
5621                         /* OR before, AND after: ideally we would recurse with
5622                          * data_fake to get the AND applied by study of the
5623                          * remainder of the pattern, and then derecurse;
5624                          * *** HACK *** for now just treat as "no information".
5625                          * See [perl #56690].
5626                          */
5627                         ssc_init(pRExC_state, data->start_class);
5628                     }  else {
5629                         /* AND before and after: combine and continue.  These
5630                          * assertions are zero-length, so can match an EMPTY
5631                          * string */
5632                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5633                         ANYOF_FLAGS(data->start_class)
5634                                                    |= SSC_MATCHES_EMPTY_STRING;
5635                     }
5636                 }
5637             }
5638 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5639             else {
5640                 /* Positive Lookahead/lookbehind
5641                    In this case we can do fixed string optimisation,
5642                    but we must be careful about it. Note in the case of
5643                    lookbehind the positions will be offset by the minimum
5644                    length of the pattern, something we won't know about
5645                    until after the recurse.
5646                 */
5647                 SSize_t deltanext, fake = 0;
5648                 regnode *nscan;
5649                 regnode_ssc intrnl;
5650                 int f = 0;
5651                 /* We use SAVEFREEPV so that when the full compile
5652                     is finished perl will clean up the allocated
5653                     minlens when it's all done. This way we don't
5654                     have to worry about freeing them when we know
5655                     they wont be used, which would be a pain.
5656                  */
5657                 SSize_t *minnextp;
5658                 Newx( minnextp, 1, SSize_t );
5659                 SAVEFREEPV(minnextp);
5660
5661                 if (data) {
5662                     StructCopy(data, &data_fake, scan_data_t);
5663                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5664                         f |= SCF_DO_SUBSTR;
5665                         if (scan->flags)
5666                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5667                         data_fake.last_found=newSVsv(data->last_found);
5668                     }
5669                 }
5670                 else
5671                     data_fake.last_closep = &fake;
5672                 data_fake.flags = 0;
5673                 data_fake.pos_delta = delta;
5674                 if (is_inf)
5675                     data_fake.flags |= SF_IS_INF;
5676                 if ( flags & SCF_DO_STCLASS && !scan->flags
5677                      && OP(scan) == IFMATCH ) { /* Lookahead */
5678                     ssc_init(pRExC_state, &intrnl);
5679                     data_fake.start_class = &intrnl;
5680                     f |= SCF_DO_STCLASS_AND;
5681                 }
5682                 if (flags & SCF_WHILEM_VISITED_POS)
5683                     f |= SCF_WHILEM_VISITED_POS;
5684                 next = regnext(scan);
5685                 nscan = NEXTOPER(NEXTOPER(scan));
5686
5687                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5688                                         &deltanext, last, &data_fake,
5689                                         stopparen, recursed_depth, NULL,
5690                                         f,depth+1);
5691                 if (scan->flags) {
5692                     if (deltanext) {
5693                         FAIL("Variable length lookbehind not implemented");
5694                     }
5695                     else if (*minnextp > (I32)U8_MAX) {
5696                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
5697                               (UV)U8_MAX);
5698                     }
5699                     scan->flags = (U8)*minnextp;
5700                 }
5701
5702                 *minnextp += min;
5703
5704                 if (f & SCF_DO_STCLASS_AND) {
5705                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5706                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5707                 }
5708                 if (data) {
5709                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5710                         pars++;
5711                     if (data_fake.flags & SF_HAS_EVAL)
5712                         data->flags |= SF_HAS_EVAL;
5713                     data->whilem_c = data_fake.whilem_c;
5714                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5715                         if (RExC_rx->minlen<*minnextp)
5716                             RExC_rx->minlen=*minnextp;
5717                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5718                         SvREFCNT_dec_NN(data_fake.last_found);
5719
5720                         if ( data_fake.minlen_fixed != minlenp )
5721                         {
5722                             data->offset_fixed= data_fake.offset_fixed;
5723                             data->minlen_fixed= data_fake.minlen_fixed;
5724                             data->lookbehind_fixed+= scan->flags;
5725                         }
5726                         if ( data_fake.minlen_float != minlenp )
5727                         {
5728                             data->minlen_float= data_fake.minlen_float;
5729                             data->offset_float_min=data_fake.offset_float_min;
5730                             data->offset_float_max=data_fake.offset_float_max;
5731                             data->lookbehind_float+= scan->flags;
5732                         }
5733                     }
5734                 }
5735             }
5736 #endif
5737         }
5738         else if (OP(scan) == OPEN) {
5739             if (stopparen != (I32)ARG(scan))
5740                 pars++;
5741         }
5742         else if (OP(scan) == CLOSE) {
5743             if (stopparen == (I32)ARG(scan)) {
5744                 break;
5745             }
5746             if ((I32)ARG(scan) == is_par) {
5747                 next = regnext(scan);
5748
5749                 if ( next && (OP(next) != WHILEM) && next < last)
5750                     is_par = 0;         /* Disable optimization */
5751             }
5752             if (data)
5753                 *(data->last_closep) = ARG(scan);
5754         }
5755         else if (OP(scan) == EVAL) {
5756                 if (data)
5757                     data->flags |= SF_HAS_EVAL;
5758         }
5759         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5760             if (flags & SCF_DO_SUBSTR) {
5761                 scan_commit(pRExC_state, data, minlenp, is_inf);
5762                 flags &= ~SCF_DO_SUBSTR;
5763             }
5764             if (data && OP(scan)==ACCEPT) {
5765                 data->flags |= SCF_SEEN_ACCEPT;
5766                 if (stopmin > min)
5767                     stopmin = min;
5768             }
5769         }
5770         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5771         {
5772                 if (flags & SCF_DO_SUBSTR) {
5773                     scan_commit(pRExC_state, data, minlenp, is_inf);
5774                     data->longest = &(data->longest_float);
5775                 }
5776                 is_inf = is_inf_internal = 1;
5777                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5778                     ssc_anything(data->start_class);
5779                 flags &= ~SCF_DO_STCLASS;
5780         }
5781         else if (OP(scan) == GPOS) {
5782             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5783                 !(delta || is_inf || (data && data->pos_delta)))
5784             {
5785                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5786                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5787                 if (RExC_rx->gofs < (STRLEN)min)
5788                     RExC_rx->gofs = min;
5789             } else {
5790                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5791                 RExC_rx->gofs = 0;
5792             }
5793         }
5794 #ifdef TRIE_STUDY_OPT
5795 #ifdef FULL_TRIE_STUDY
5796         else if (PL_regkind[OP(scan)] == TRIE) {
5797             /* NOTE - There is similar code to this block above for handling
5798                BRANCH nodes on the initial study.  If you change stuff here
5799                check there too. */
5800             regnode *trie_node= scan;
5801             regnode *tail= regnext(scan);
5802             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5803             SSize_t max1 = 0, min1 = SSize_t_MAX;
5804             regnode_ssc accum;
5805
5806             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5807                 /* Cannot merge strings after this. */
5808                 scan_commit(pRExC_state, data, minlenp, is_inf);
5809             }
5810             if (flags & SCF_DO_STCLASS)
5811                 ssc_init_zero(pRExC_state, &accum);
5812
5813             if (!trie->jump) {
5814                 min1= trie->minlen;
5815                 max1= trie->maxlen;
5816             } else {
5817                 const regnode *nextbranch= NULL;
5818                 U32 word;
5819
5820                 for ( word=1 ; word <= trie->wordcount ; word++)
5821                 {
5822                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5823                     regnode_ssc this_class;
5824
5825                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5826                     if (data) {
5827                         data_fake.whilem_c = data->whilem_c;
5828                         data_fake.last_closep = data->last_closep;
5829                     }
5830                     else
5831                         data_fake.last_closep = &fake;
5832                     data_fake.pos_delta = delta;
5833                     if (flags & SCF_DO_STCLASS) {
5834                         ssc_init(pRExC_state, &this_class);
5835                         data_fake.start_class = &this_class;
5836                         f = SCF_DO_STCLASS_AND;
5837                     }
5838                     if (flags & SCF_WHILEM_VISITED_POS)
5839                         f |= SCF_WHILEM_VISITED_POS;
5840
5841                     if (trie->jump[word]) {
5842                         if (!nextbranch)
5843                             nextbranch = trie_node + trie->jump[0];
5844                         scan= trie_node + trie->jump[word];
5845                         /* We go from the jump point to the branch that follows
5846                            it. Note this means we need the vestigal unused
5847                            branches even though they arent otherwise used. */
5848                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5849                             &deltanext, (regnode *)nextbranch, &data_fake,
5850                             stopparen, recursed_depth, NULL, f,depth+1);
5851                     }
5852                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5853                         nextbranch= regnext((regnode*)nextbranch);
5854
5855                     if (min1 > (SSize_t)(minnext + trie->minlen))
5856                         min1 = minnext + trie->minlen;
5857                     if (deltanext == SSize_t_MAX) {
5858                         is_inf = is_inf_internal = 1;
5859                         max1 = SSize_t_MAX;
5860                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5861                         max1 = minnext + deltanext + trie->maxlen;
5862
5863                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5864                         pars++;
5865                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5866                         if ( stopmin > min + min1)
5867                             stopmin = min + min1;
5868                         flags &= ~SCF_DO_SUBSTR;
5869                         if (data)
5870                             data->flags |= SCF_SEEN_ACCEPT;
5871                     }
5872                     if (data) {
5873                         if (data_fake.flags & SF_HAS_EVAL)
5874                             data->flags |= SF_HAS_EVAL;
5875                         data->whilem_c = data_fake.whilem_c;
5876                     }
5877                     if (flags & SCF_DO_STCLASS)
5878                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5879                 }
5880             }
5881             if (flags & SCF_DO_SUBSTR) {
5882                 data->pos_min += min1;
5883                 data->pos_delta += max1 - min1;
5884                 if (max1 != min1 || is_inf)
5885                     data->longest = &(data->longest_float);
5886             }
5887             min += min1;
5888             if (delta != SSize_t_MAX)
5889                 delta += max1 - min1;
5890             if (flags & SCF_DO_STCLASS_OR) {
5891                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5892                 if (min1) {
5893                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5894                     flags &= ~SCF_DO_STCLASS;
5895                 }
5896             }
5897             else if (flags & SCF_DO_STCLASS_AND) {
5898                 if (min1) {
5899                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5900                     flags &= ~SCF_DO_STCLASS;
5901                 }
5902                 else {
5903                     /* Switch to OR mode: cache the old value of
5904                      * data->start_class */
5905                     INIT_AND_WITHP;
5906                     StructCopy(data->start_class, and_withp, regnode_ssc);
5907                     flags &= ~SCF_DO_STCLASS_AND;
5908                     StructCopy(&accum, data->start_class, regnode_ssc);
5909                     flags |= SCF_DO_STCLASS_OR;
5910                 }
5911             }
5912             scan= tail;
5913             continue;
5914         }
5915 #else
5916         else if (PL_regkind[OP(scan)] == TRIE) {
5917             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5918             U8*bang=NULL;
5919
5920             min += trie->minlen;
5921             delta += (trie->maxlen - trie->minlen);
5922             flags &= ~SCF_DO_STCLASS; /* xxx */
5923             if (flags & SCF_DO_SUBSTR) {
5924                 /* Cannot expect anything... */
5925                 scan_commit(pRExC_state, data, minlenp, is_inf);
5926                 data->pos_min += trie->minlen;
5927                 data->pos_delta += (trie->maxlen - trie->minlen);
5928                 if (trie->maxlen != trie->minlen)
5929                     data->longest = &(data->longest_float);
5930             }
5931             if (trie->jump) /* no more substrings -- for now /grr*/
5932                flags &= ~SCF_DO_SUBSTR;
5933         }
5934 #endif /* old or new */
5935 #endif /* TRIE_STUDY_OPT */
5936
5937         /* Else: zero-length, ignore. */
5938         scan = regnext(scan);
5939     }
5940
5941   finish:
5942     if (frame) {
5943         /* we need to unwind recursion. */
5944         depth = depth - 1;
5945
5946         DEBUG_STUDYDATA("frame-end:",data,depth);
5947         DEBUG_PEEP("fend", scan, depth);
5948
5949         /* restore previous context */
5950         last = frame->last_regnode;
5951         scan = frame->next_regnode;
5952         stopparen = frame->stopparen;
5953         recursed_depth = frame->prev_recursed_depth;
5954
5955         RExC_frame_last = frame->prev_frame;
5956         frame = frame->this_prev_frame;
5957         goto fake_study_recurse;
5958     }
5959
5960     assert(!frame);
5961     DEBUG_STUDYDATA("pre-fin:",data,depth);
5962
5963     *scanp = scan;
5964     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5965
5966     if (flags & SCF_DO_SUBSTR && is_inf)
5967         data->pos_delta = SSize_t_MAX - data->pos_min;
5968     if (is_par > (I32)U8_MAX)
5969         is_par = 0;
5970     if (is_par && pars==1 && data) {
5971         data->flags |= SF_IN_PAR;
5972         data->flags &= ~SF_HAS_PAR;
5973     }
5974     else if (pars && data) {
5975         data->flags |= SF_HAS_PAR;
5976         data->flags &= ~SF_IN_PAR;
5977     }
5978     if (flags & SCF_DO_STCLASS_OR)
5979         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5980     if (flags & SCF_TRIE_RESTUDY)
5981         data->flags |=  SCF_TRIE_RESTUDY;
5982
5983     DEBUG_STUDYDATA("post-fin:",data,depth);
5984
5985     {
5986         SSize_t final_minlen= min < stopmin ? min : stopmin;
5987
5988         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5989             if (final_minlen > SSize_t_MAX - delta)
5990                 RExC_maxlen = SSize_t_MAX;
5991             else if (RExC_maxlen < final_minlen + delta)
5992                 RExC_maxlen = final_minlen + delta;
5993         }
5994         return final_minlen;
5995     }
5996     NOT_REACHED; /* NOTREACHED */
5997 }
5998
5999 STATIC U32
6000 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6001 {
6002     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6003
6004     PERL_ARGS_ASSERT_ADD_DATA;
6005
6006     Renewc(RExC_rxi->data,
6007            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6008            char, struct reg_data);
6009     if(count)
6010         Renew(RExC_rxi->data->what, count + n, U8);
6011     else
6012         Newx(RExC_rxi->data->what, n, U8);
6013     RExC_rxi->data->count = count + n;
6014     Copy(s, RExC_rxi->data->what + count, n, U8);
6015     return count;
6016 }
6017
6018 /*XXX: todo make this not included in a non debugging perl, but appears to be
6019  * used anyway there, in 'use re' */
6020 #ifndef PERL_IN_XSUB_RE
6021 void
6022 Perl_reginitcolors(pTHX)
6023 {
6024     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6025     if (s) {
6026         char *t = savepv(s);
6027         int i = 0;
6028         PL_colors[0] = t;
6029         while (++i < 6) {
6030             t = strchr(t, '\t');
6031             if (t) {
6032                 *t = '\0';
6033                 PL_colors[i] = ++t;
6034             }
6035             else
6036                 PL_colors[i] = t = (char *)"";
6037         }
6038     } else {
6039         int i = 0;
6040         while (i < 6)
6041             PL_colors[i++] = (char *)"";
6042     }
6043     PL_colorset = 1;
6044 }
6045 #endif
6046
6047
6048 #ifdef TRIE_STUDY_OPT
6049 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6050     STMT_START {                                            \
6051         if (                                                \
6052               (data.flags & SCF_TRIE_RESTUDY)               \
6053               && ! restudied++                              \
6054         ) {                                                 \
6055             dOsomething;                                    \
6056             goto reStudy;                                   \
6057         }                                                   \
6058     } STMT_END
6059 #else
6060 #define CHECK_RESTUDY_GOTO_butfirst
6061 #endif
6062
6063 /*
6064  * pregcomp - compile a regular expression into internal code
6065  *
6066  * Decides which engine's compiler to call based on the hint currently in
6067  * scope
6068  */
6069
6070 #ifndef PERL_IN_XSUB_RE
6071
6072 /* return the currently in-scope regex engine (or the default if none)  */
6073
6074 regexp_engine const *
6075 Perl_current_re_engine(pTHX)
6076 {
6077     if (IN_PERL_COMPILETIME) {
6078         HV * const table = GvHV(PL_hintgv);
6079         SV **ptr;
6080
6081         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6082             return &PL_core_reg_engine;
6083         ptr = hv_fetchs(table, "regcomp", FALSE);
6084         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6085             return &PL_core_reg_engine;
6086         return INT2PTR(regexp_engine*,SvIV(*ptr));
6087     }
6088     else {
6089         SV *ptr;
6090         if (!PL_curcop->cop_hints_hash)
6091             return &PL_core_reg_engine;
6092         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6093         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6094             return &PL_core_reg_engine;
6095         return INT2PTR(regexp_engine*,SvIV(ptr));
6096     }
6097 }
6098
6099
6100 REGEXP *
6101 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6102 {
6103     regexp_engine const *eng = current_re_engine();
6104     GET_RE_DEBUG_FLAGS_DECL;
6105
6106     PERL_ARGS_ASSERT_PREGCOMP;
6107
6108     /* Dispatch a request to compile a regexp to correct regexp engine. */
6109     DEBUG_COMPILE_r({
6110         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6111                         PTR2UV(eng));
6112     });
6113     return CALLREGCOMP_ENG(eng, pattern, flags);
6114 }
6115 #endif
6116
6117 /* public(ish) entry point for the perl core's own regex compiling code.
6118  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6119  * pattern rather than a list of OPs, and uses the internal engine rather
6120  * than the current one */
6121
6122 REGEXP *
6123 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6124 {
6125     SV *pat = pattern; /* defeat constness! */
6126     PERL_ARGS_ASSERT_RE_COMPILE;
6127     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6128 #ifdef PERL_IN_XSUB_RE
6129                                 &my_reg_engine,
6130 #else
6131                                 &PL_core_reg_engine,
6132 #endif
6133                                 NULL, NULL, rx_flags, 0);
6134 }
6135
6136
6137 static void
6138 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6139 {
6140     int n;
6141
6142     if (--cbs->refcnt > 0)
6143         return;
6144     for (n = 0; n < cbs->count; n++) {
6145         REGEXP *rx = cbs->cb[n].src_regex;
6146         cbs->cb[n].src_regex = NULL;
6147         SvREFCNT_dec(rx);
6148     }
6149     Safefree(cbs->cb);
6150     Safefree(cbs);
6151 }
6152
6153
6154 static struct reg_code_blocks *
6155 S_alloc_code_blocks(pTHX_  int ncode)
6156 {
6157      struct reg_code_blocks *cbs;
6158     Newx(cbs, 1, struct reg_code_blocks);
6159     cbs->count = ncode;
6160     cbs->refcnt = 1;
6161     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6162     if (ncode)
6163         Newx(cbs->cb, ncode, struct reg_code_block);
6164     else
6165         cbs->cb = NULL;
6166     return cbs;
6167 }
6168
6169
6170 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6171  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6172  * point to the realloced string and length.
6173  *
6174  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6175  * stuff added */
6176
6177 static void
6178 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6179                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
6180 {
6181     U8 *const src = (U8*)*pat_p;
6182     U8 *dst, *d;
6183     int n=0;
6184     STRLEN s = 0;
6185     bool do_end = 0;
6186     GET_RE_DEBUG_FLAGS_DECL;
6187
6188     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6189         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6190
6191     Newx(dst, *plen_p * 2 + 1, U8);
6192     d = dst;
6193
6194     while (s < *plen_p) {
6195         append_utf8_from_native_byte(src[s], &d);
6196
6197         if (n < num_code_blocks) {
6198             assert(pRExC_state->code_blocks);
6199             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6200                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6201                 assert(*(d - 1) == '(');
6202                 do_end = 1;
6203             }
6204             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6205                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6206                 assert(*(d - 1) == ')');
6207                 do_end = 0;
6208                 n++;
6209             }
6210         }
6211         s++;
6212     }
6213     *d = '\0';
6214     *plen_p = d - dst;
6215     *pat_p = (char*) dst;
6216     SAVEFREEPV(*pat_p);
6217     RExC_orig_utf8 = RExC_utf8 = 1;
6218 }
6219
6220
6221
6222 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6223  * while recording any code block indices, and handling overloading,
6224  * nested qr// objects etc.  If pat is null, it will allocate a new
6225  * string, or just return the first arg, if there's only one.
6226  *
6227  * Returns the malloced/updated pat.
6228  * patternp and pat_count is the array of SVs to be concatted;
6229  * oplist is the optional list of ops that generated the SVs;
6230  * recompile_p is a pointer to a boolean that will be set if
6231  *   the regex will need to be recompiled.
6232  * delim, if non-null is an SV that will be inserted between each element
6233  */
6234
6235 static SV*
6236 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6237                 SV *pat, SV ** const patternp, int pat_count,
6238                 OP *oplist, bool *recompile_p, SV *delim)
6239 {
6240     SV **svp;
6241     int n = 0;
6242     bool use_delim = FALSE;
6243     bool alloced = FALSE;
6244
6245     /* if we know we have at least two args, create an empty string,
6246      * then concatenate args to that. For no args, return an empty string */
6247     if (!pat && pat_count != 1) {
6248         pat = newSVpvs("");
6249         SAVEFREESV(pat);
6250         alloced = TRUE;
6251     }
6252
6253     for (svp = patternp; svp < patternp + pat_count; svp++) {
6254         SV *sv;
6255         SV *rx  = NULL;
6256         STRLEN orig_patlen = 0;
6257         bool code = 0;
6258         SV *msv = use_delim ? delim : *svp;
6259         if (!msv) msv = &PL_sv_undef;
6260
6261         /* if we've got a delimiter, we go round the loop twice for each
6262          * svp slot (except the last), using the delimiter the second
6263          * time round */
6264         if (use_delim) {
6265             svp--;
6266             use_delim = FALSE;
6267         }
6268         else if (delim)
6269             use_delim = TRUE;
6270
6271         if (SvTYPE(msv) == SVt_PVAV) {
6272             /* we've encountered an interpolated array within
6273              * the pattern, e.g. /...@a..../. Expand the list of elements,
6274              * then recursively append elements.
6275              * The code in this block is based on S_pushav() */
6276
6277             AV *const av = (AV*)msv;
6278             const SSize_t maxarg = AvFILL(av) + 1;
6279             SV **array;
6280
6281             if (oplist) {
6282                 assert(oplist->op_type == OP_PADAV
6283                     || oplist->op_type == OP_RV2AV);
6284                 oplist = OpSIBLING(oplist);
6285             }
6286
6287             if (SvRMAGICAL(av)) {
6288                 SSize_t i;
6289
6290                 Newx(array, maxarg, SV*);
6291                 SAVEFREEPV(array);
6292                 for (i=0; i < maxarg; i++) {
6293                     SV ** const svp = av_fetch(av, i, FALSE);
6294                     array[i] = svp ? *svp : &PL_sv_undef;
6295                 }
6296             }
6297             else
6298                 array = AvARRAY(av);
6299
6300             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6301                                 array, maxarg, NULL, recompile_p,
6302                                 /* $" */
6303                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6304
6305             continue;
6306         }
6307
6308
6309         /* we make the assumption here that each op in the list of
6310          * op_siblings maps to one SV pushed onto the stack,
6311          * except for code blocks, with have both an OP_NULL and
6312          * and OP_CONST.
6313          * This allows us to match up the list of SVs against the
6314          * list of OPs to find the next code block.
6315          *
6316          * Note that       PUSHMARK PADSV PADSV ..
6317          * is optimised to
6318          *                 PADRANGE PADSV  PADSV  ..
6319          * so the alignment still works. */
6320
6321         if (oplist) {
6322             if (oplist->op_type == OP_NULL
6323                 && (oplist->op_flags & OPf_SPECIAL))
6324             {
6325                 assert(n < pRExC_state->code_blocks->count);
6326                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6327                 pRExC_state->code_blocks->cb[n].block = oplist;
6328                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6329                 n++;
6330                 code = 1;
6331                 oplist = OpSIBLING(oplist); /* skip CONST */
6332                 assert(oplist);
6333             }
6334             oplist = OpSIBLING(oplist);;
6335         }
6336
6337         /* apply magic and QR overloading to arg */
6338
6339         SvGETMAGIC(msv);
6340         if (SvROK(msv) && SvAMAGIC(msv)) {
6341             SV *sv = AMG_CALLunary(msv, regexp_amg);
6342             if (sv) {
6343                 if (SvROK(sv))
6344                     sv = SvRV(sv);
6345                 if (SvTYPE(sv) != SVt_REGEXP)
6346                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6347                 msv = sv;
6348             }
6349         }
6350
6351         /* try concatenation overload ... */
6352         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6353                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6354         {
6355             sv_setsv(pat, sv);
6356             /* overloading involved: all bets are off over literal
6357              * code. Pretend we haven't seen it */
6358             if (n)
6359                 pRExC_state->code_blocks->count -= n;
6360             n = 0;
6361         }
6362         else  {
6363             /* ... or failing that, try "" overload */
6364             while (SvAMAGIC(msv)
6365                     && (sv = AMG_CALLunary(msv, string_amg))
6366                     && sv != msv
6367                     &&  !(   SvROK(msv)
6368                           && SvROK(sv)
6369                           && SvRV(msv) == SvRV(sv))
6370             ) {
6371                 msv = sv;
6372                 SvGETMAGIC(msv);
6373             }
6374             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6375                 msv = SvRV(msv);
6376
6377             if (pat) {
6378                 /* this is a partially unrolled
6379                  *     sv_catsv_nomg(pat, msv);
6380                  * that allows us to adjust code block indices if
6381                  * needed */
6382                 STRLEN dlen;
6383                 char *dst = SvPV_force_nomg(pat, dlen);
6384                 orig_patlen = dlen;
6385                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6386                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6387                     sv_setpvn(pat, dst, dlen);
6388                     SvUTF8_on(pat);
6389                 }
6390                 sv_catsv_nomg(pat, msv);
6391                 rx = msv;
6392             }
6393             else {
6394                 /* We have only one SV to process, but we need to verify
6395                  * it is properly null terminated or we will fail asserts
6396                  * later. In theory we probably shouldn't get such SV's,
6397                  * but if we do we should handle it gracefully. */
6398                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) ) {
6399                     /* not a string, or a string with a trailing null */
6400                     pat = msv;
6401                 } else {
6402                     /* a string with no trailing null, we need to copy it
6403                      * so it we have a trailing null */
6404                     pat = newSVsv(msv);
6405                 }
6406             }
6407
6408             if (code)
6409                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6410         }
6411
6412         /* extract any code blocks within any embedded qr//'s */
6413         if (rx && SvTYPE(rx) == SVt_REGEXP
6414             && RX_ENGINE((REGEXP*)rx)->op_comp)
6415         {
6416
6417             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6418             if (ri->code_blocks && ri->code_blocks->count) {
6419                 int i;
6420                 /* the presence of an embedded qr// with code means
6421                  * we should always recompile: the text of the
6422                  * qr// may not have changed, but it may be a
6423                  * different closure than last time */
6424                 *recompile_p = 1;
6425                 if (pRExC_state->code_blocks) {
6426                     pRExC_state->code_blocks->count += ri->code_blocks->count;
6427                     Renew(pRExC_state->code_blocks->cb,
6428                             pRExC_state->code_blocks->count,
6429                             struct reg_code_block);
6430                 }
6431                 else
6432                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6433                                                     ri->code_blocks->count);
6434
6435                 for (i=0; i < ri->code_blocks->count; i++) {
6436                     struct reg_code_block *src, *dst;
6437                     STRLEN offset =  orig_patlen
6438                         + ReANY((REGEXP *)rx)->pre_prefix;
6439                     assert(n < pRExC_state->code_blocks->count);
6440                     src = &ri->code_blocks->cb[i];
6441                     dst = &pRExC_state->code_blocks->cb[n];
6442                     dst->start      = src->start + offset;
6443                     dst->end        = src->end   + offset;
6444                     dst->block      = src->block;
6445                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6446                                             src->src_regex
6447                                                 ? src->src_regex
6448                                                 : (REGEXP*)rx);
6449                     n++;
6450                 }
6451             }
6452         }
6453     }
6454     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6455     if (alloced)
6456         SvSETMAGIC(pat);
6457
6458     return pat;
6459 }
6460
6461
6462
6463 /* see if there are any run-time code blocks in the pattern.
6464  * False positives are allowed */
6465
6466 static bool
6467 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6468                     char *pat, STRLEN plen)
6469 {
6470     int n = 0;
6471     STRLEN s;
6472
6473     PERL_UNUSED_CONTEXT;
6474
6475     for (s = 0; s < plen; s++) {
6476         if (   pRExC_state->code_blocks
6477             && n < pRExC_state->code_blocks->count
6478             && s == pRExC_state->code_blocks->cb[n].start)
6479         {
6480             s = pRExC_state->code_blocks->cb[n].end;
6481             n++;
6482             continue;
6483         }
6484         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6485          * positives here */
6486         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6487             (pat[s+2] == '{'
6488                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6489         )
6490             return 1;
6491     }
6492     return 0;
6493 }
6494
6495 /* Handle run-time code blocks. We will already have compiled any direct
6496  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6497  * copy of it, but with any literal code blocks blanked out and
6498  * appropriate chars escaped; then feed it into
6499  *
6500  *    eval "qr'modified_pattern'"
6501  *
6502  * For example,
6503  *
6504  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6505  *
6506  * becomes
6507  *
6508  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6509  *
6510  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6511  * and merge them with any code blocks of the original regexp.
6512  *
6513  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6514  * instead, just save the qr and return FALSE; this tells our caller that
6515  * the original pattern needs upgrading to utf8.
6516  */
6517
6518 static bool
6519 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6520     char *pat, STRLEN plen)
6521 {
6522     SV *qr;
6523
6524     GET_RE_DEBUG_FLAGS_DECL;
6525
6526     if (pRExC_state->runtime_code_qr) {
6527         /* this is the second time we've been called; this should
6528          * only happen if the main pattern got upgraded to utf8
6529          * during compilation; re-use the qr we compiled first time
6530          * round (which should be utf8 too)
6531          */
6532         qr = pRExC_state->runtime_code_qr;
6533         pRExC_state->runtime_code_qr = NULL;
6534         assert(RExC_utf8 && SvUTF8(qr));
6535     }
6536     else {
6537         int n = 0;
6538         STRLEN s;
6539         char *p, *newpat;
6540         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
6541         SV *sv, *qr_ref;
6542         dSP;
6543
6544         /* determine how many extra chars we need for ' and \ escaping */
6545         for (s = 0; s < plen; s++) {
6546             if (pat[s] == '\'' || pat[s] == '\\')
6547                 newlen++;
6548         }
6549
6550         Newx(newpat, newlen, char);
6551         p = newpat;
6552         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6553
6554         for (s = 0; s < plen; s++) {
6555             if (   pRExC_state->code_blocks
6556                 && n < pRExC_state->code_blocks->count
6557                 && s == pRExC_state->code_blocks->cb[n].start)
6558             {
6559                 /* blank out literal code block */
6560                 assert(pat[s] == '(');
6561                 while (s <= pRExC_state->code_blocks->cb[n].end) {
6562                     *p++ = '_';
6563                     s++;
6564                 }
6565                 s--;
6566                 n++;
6567                 continue;
6568             }
6569             if (pat[s] == '\'' || pat[s] == '\\')
6570                 *p++ = '\\';
6571             *p++ = pat[s];
6572         }
6573         *p++ = '\'';
6574         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
6575             *p++ = 'x';
6576             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
6577                 *p++ = 'x';
6578             }
6579         }
6580         *p++ = '\0';
6581         DEBUG_COMPILE_r({
6582             Perl_re_printf( aTHX_
6583                 "%sre-parsing pattern for runtime code:%s %s\n",
6584                 PL_colors[4],PL_colors[5],newpat);
6585         });
6586
6587         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6588         Safefree(newpat);
6589
6590         ENTER;
6591         SAVETMPS;
6592         save_re_context();
6593         PUSHSTACKi(PERLSI_REQUIRE);
6594         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6595          * parsing qr''; normally only q'' does this. It also alters
6596          * hints handling */
6597         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6598         SvREFCNT_dec_NN(sv);
6599         SPAGAIN;
6600         qr_ref = POPs;
6601         PUTBACK;
6602         {
6603             SV * const errsv = ERRSV;
6604             if (SvTRUE_NN(errsv))
6605                 /* use croak_sv ? */
6606                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
6607         }
6608         assert(SvROK(qr_ref));
6609         qr = SvRV(qr_ref);
6610         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6611         /* the leaving below frees the tmp qr_ref.
6612          * Give qr a life of its own */
6613         SvREFCNT_inc(qr);
6614         POPSTACK;
6615         FREETMPS;
6616         LEAVE;
6617
6618     }
6619
6620     if (!RExC_utf8 && SvUTF8(qr)) {
6621         /* first time through; the pattern got upgraded; save the
6622          * qr for the next time through */
6623         assert(!pRExC_state->runtime_code_qr);
6624         pRExC_state->runtime_code_qr = qr;
6625         return 0;
6626     }
6627
6628
6629     /* extract any code blocks within the returned qr//  */
6630
6631
6632     /* merge the main (r1) and run-time (r2) code blocks into one */
6633     {
6634         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6635         struct reg_code_block *new_block, *dst;
6636         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6637         int i1 = 0, i2 = 0;
6638         int r1c, r2c;
6639
6640         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
6641         {
6642             SvREFCNT_dec_NN(qr);
6643             return 1;
6644         }
6645
6646         if (!r1->code_blocks)
6647             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
6648
6649         r1c = r1->code_blocks->count;
6650         r2c = r2->code_blocks->count;
6651
6652         Newx(new_block, r1c + r2c, struct reg_code_block);
6653
6654         dst = new_block;
6655
6656         while (i1 < r1c || i2 < r2c) {
6657             struct reg_code_block *src;
6658             bool is_qr = 0;
6659
6660             if (i1 == r1c) {
6661                 src = &r2->code_blocks->cb[i2++];
6662                 is_qr = 1;
6663             }
6664             else if (i2 == r2c)
6665                 src = &r1->code_blocks->cb[i1++];
6666             else if (  r1->code_blocks->cb[i1].start
6667                      < r2->code_blocks->cb[i2].start)
6668             {
6669                 src = &r1->code_blocks->cb[i1++];
6670                 assert(src->end < r2->code_blocks->cb[i2].start);
6671             }
6672             else {
6673                 assert(  r1->code_blocks->cb[i1].start
6674                        > r2->code_blocks->cb[i2].start);
6675                 src = &r2->code_blocks->cb[i2++];
6676                 is_qr = 1;
6677                 assert(src->end < r1->code_blocks->cb[i1].start);
6678             }
6679
6680             assert(pat[src->start] == '(');
6681             assert(pat[src->end]   == ')');
6682             dst->start      = src->start;
6683             dst->end        = src->end;
6684             dst->block      = src->block;
6685             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6686                                     : src->src_regex;
6687             dst++;
6688         }
6689         r1->code_blocks->count += r2c;
6690         Safefree(r1->code_blocks->cb);
6691         r1->code_blocks->cb = new_block;
6692     }
6693
6694     SvREFCNT_dec_NN(qr);
6695     return 1;
6696 }
6697
6698
6699 STATIC bool
6700 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6701                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6702                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6703                       STRLEN longest_length, bool eol, bool meol)
6704 {
6705     /* This is the common code for setting up the floating and fixed length
6706      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6707      * as to whether succeeded or not */
6708
6709     I32 t;
6710     SSize_t ml;
6711
6712     if (! (longest_length
6713            || (eol /* Can't have SEOL and MULTI */
6714                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6715           )
6716             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6717         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6718     {
6719         return FALSE;
6720     }
6721
6722     /* copy the information about the longest from the reg_scan_data
6723         over to the program. */
6724     if (SvUTF8(sv_longest)) {
6725         *rx_utf8 = sv_longest;
6726         *rx_substr = NULL;
6727     } else {
6728         *rx_substr = sv_longest;
6729         *rx_utf8 = NULL;
6730     }
6731     /* end_shift is how many chars that must be matched that
6732         follow this item. We calculate it ahead of time as once the
6733         lookbehind offset is added in we lose the ability to correctly
6734         calculate it.*/
6735     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6736     *rx_end_shift = ml - offset
6737         - longest_length
6738             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
6739              * intead? - DAPM
6740             + (SvTAIL(sv_longest) != 0)
6741             */
6742         + lookbehind;
6743
6744     t = (eol/* Can't have SEOL and MULTI */
6745          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6746     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6747
6748     return TRUE;
6749 }
6750
6751 /*
6752  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6753  * regular expression into internal code.
6754  * The pattern may be passed either as:
6755  *    a list of SVs (patternp plus pat_count)
6756  *    a list of OPs (expr)
6757  * If both are passed, the SV list is used, but the OP list indicates
6758  * which SVs are actually pre-compiled code blocks
6759  *
6760  * The SVs in the list have magic and qr overloading applied to them (and
6761  * the list may be modified in-place with replacement SVs in the latter
6762  * case).
6763  *
6764  * If the pattern hasn't changed from old_re, then old_re will be
6765  * returned.
6766  *
6767  * eng is the current engine. If that engine has an op_comp method, then
6768  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6769  * do the initial concatenation of arguments and pass on to the external
6770  * engine.
6771  *
6772  * If is_bare_re is not null, set it to a boolean indicating whether the
6773  * arg list reduced (after overloading) to a single bare regex which has
6774  * been returned (i.e. /$qr/).
6775  *
6776  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6777  *
6778  * pm_flags contains the PMf_* flags, typically based on those from the
6779  * pm_flags field of the related PMOP. Currently we're only interested in
6780  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6781  *
6782  * We can't allocate space until we know how big the compiled form will be,
6783  * but we can't compile it (and thus know how big it is) until we've got a
6784  * place to put the code.  So we cheat:  we compile it twice, once with code
6785  * generation turned off and size counting turned on, and once "for real".
6786  * This also means that we don't allocate space until we are sure that the
6787  * thing really will compile successfully, and we never have to move the
6788  * code and thus invalidate pointers into it.  (Note that it has to be in
6789  * one piece because free() must be able to free it all.) [NB: not true in perl]
6790  *
6791  * Beware that the optimization-preparation code in here knows about some
6792  * of the structure of the compiled regexp.  [I'll say.]
6793  */
6794
6795 REGEXP *
6796 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6797                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6798                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6799 {
6800     REGEXP *rx;
6801     struct regexp *r;
6802     regexp_internal *ri;
6803     STRLEN plen;
6804     char *exp;
6805     regnode *scan;
6806     I32 flags;
6807     SSize_t minlen = 0;
6808     U32 rx_flags;
6809     SV *pat;
6810     SV** new_patternp = patternp;
6811
6812     /* these are all flags - maybe they should be turned
6813      * into a single int with different bit masks */
6814     I32 sawlookahead = 0;
6815     I32 sawplus = 0;
6816     I32 sawopen = 0;
6817     I32 sawminmod = 0;
6818
6819     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6820     bool recompile = 0;
6821     bool runtime_code = 0;
6822     scan_data_t data;
6823     RExC_state_t RExC_state;
6824     RExC_state_t * const pRExC_state = &RExC_state;
6825 #ifdef TRIE_STUDY_OPT
6826     int restudied = 0;
6827     RExC_state_t copyRExC_state;
6828 #endif
6829     GET_RE_DEBUG_FLAGS_DECL;
6830
6831     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6832
6833     DEBUG_r(if (!PL_colorset) reginitcolors());
6834
6835     /* Initialize these here instead of as-needed, as is quick and avoids
6836      * having to test them each time otherwise */
6837     if (! PL_AboveLatin1) {
6838 #ifdef DEBUGGING
6839         char * dump_len_string;
6840 #endif
6841
6842         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6843         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6844         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6845         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6846         PL_HasMultiCharFold =
6847                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6848
6849         /* This is calculated here, because the Perl program that generates the
6850          * static global ones doesn't currently have access to
6851          * NUM_ANYOF_CODE_POINTS */
6852         PL_InBitmap = _new_invlist(2);
6853         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6854                                                     NUM_ANYOF_CODE_POINTS - 1);
6855 #ifdef DEBUGGING
6856         dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
6857         if (   ! dump_len_string
6858             || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
6859         {
6860             PL_dump_re_max_len = 0;
6861         }
6862 #endif
6863     }
6864
6865     pRExC_state->warn_text = NULL;
6866     pRExC_state->code_blocks = NULL;
6867
6868     if (is_bare_re)
6869         *is_bare_re = FALSE;
6870
6871     if (expr && (expr->op_type == OP_LIST ||
6872                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6873         /* allocate code_blocks if needed */
6874         OP *o;
6875         int ncode = 0;
6876
6877         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6878             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6879                 ncode++; /* count of DO blocks */
6880
6881         if (ncode)
6882             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
6883     }
6884
6885     if (!pat_count) {
6886         /* compile-time pattern with just OP_CONSTs and DO blocks */
6887
6888         int n;
6889         OP *o;
6890
6891         /* find how many CONSTs there are */
6892         assert(expr);
6893         n = 0;
6894         if (expr->op_type == OP_CONST)
6895             n = 1;
6896         else
6897             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6898                 if (o->op_type == OP_CONST)
6899                     n++;
6900             }
6901
6902         /* fake up an SV array */
6903
6904         assert(!new_patternp);
6905         Newx(new_patternp, n, SV*);
6906         SAVEFREEPV(new_patternp);
6907         pat_count = n;
6908
6909         n = 0;
6910         if (expr->op_type == OP_CONST)
6911             new_patternp[n] = cSVOPx_sv(expr);
6912         else
6913             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6914                 if (o->op_type == OP_CONST)
6915                     new_patternp[n++] = cSVOPo_sv;
6916             }
6917
6918     }
6919
6920     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6921         "Assembling pattern from %d elements%s\n", pat_count,
6922             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6923
6924     /* set expr to the first arg op */
6925
6926     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
6927          && expr->op_type != OP_CONST)
6928     {
6929             expr = cLISTOPx(expr)->op_first;
6930             assert(   expr->op_type == OP_PUSHMARK
6931                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6932                    || expr->op_type == OP_PADRANGE);
6933             expr = OpSIBLING(expr);
6934     }
6935
6936     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6937                         expr, &recompile, NULL);
6938
6939     /* handle bare (possibly after overloading) regex: foo =~ $re */
6940     {
6941         SV *re = pat;
6942         if (SvROK(re))
6943             re = SvRV(re);
6944         if (SvTYPE(re) == SVt_REGEXP) {
6945             if (is_bare_re)
6946                 *is_bare_re = TRUE;
6947             SvREFCNT_inc(re);
6948             DEBUG_PARSE_r(Perl_re_printf( aTHX_
6949                 "Precompiled pattern%s\n",
6950                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6951
6952             return (REGEXP*)re;
6953         }
6954     }
6955
6956     exp = SvPV_nomg(pat, plen);
6957
6958     if (!eng->op_comp) {
6959         if ((SvUTF8(pat) && IN_BYTES)
6960                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6961         {
6962             /* make a temporary copy; either to convert to bytes,
6963              * or to avoid repeating get-magic / overloaded stringify */
6964             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6965                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6966         }
6967         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6968     }
6969
6970     /* ignore the utf8ness if the pattern is 0 length */
6971     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6972
6973     RExC_uni_semantics = 0;
6974     RExC_seen_unfolded_sharp_s = 0;
6975     RExC_contains_locale = 0;
6976     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6977     RExC_study_started = 0;
6978     pRExC_state->runtime_code_qr = NULL;
6979     RExC_frame_head= NULL;
6980     RExC_frame_last= NULL;
6981     RExC_frame_count= 0;
6982
6983     DEBUG_r({
6984         RExC_mysv1= sv_newmortal();
6985         RExC_mysv2= sv_newmortal();
6986     });
6987     DEBUG_COMPILE_r({
6988             SV *dsv= sv_newmortal();
6989             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6990             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
6991                           PL_colors[4],PL_colors[5],s);
6992         });
6993
6994   redo_first_pass:
6995     /* we jump here if we have to recompile, e.g., from upgrading the pattern
6996      * to utf8 */
6997
6998     if ((pm_flags & PMf_USE_RE_EVAL)
6999                 /* this second condition covers the non-regex literal case,
7000                  * i.e.  $foo =~ '(?{})'. */
7001                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7002     )
7003         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7004
7005     /* return old regex if pattern hasn't changed */
7006     /* XXX: note in the below we have to check the flags as well as the
7007      * pattern.
7008      *
7009      * Things get a touch tricky as we have to compare the utf8 flag
7010      * independently from the compile flags.  */
7011
7012     if (   old_re
7013         && !recompile
7014         && !!RX_UTF8(old_re) == !!RExC_utf8
7015         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7016         && RX_PRECOMP(old_re)
7017         && RX_PRELEN(old_re) == plen
7018         && memEQ(RX_PRECOMP(old_re), exp, plen)
7019         && !runtime_code /* with runtime code, always recompile */ )
7020     {
7021         return old_re;
7022     }
7023
7024     rx_flags = orig_rx_flags;
7025
7026     if (   initial_charset == REGEX_DEPENDS_CHARSET
7027         && (RExC_utf8 ||RExC_uni_semantics))
7028     {
7029
7030         /* Set to use unicode semantics if the pattern is in utf8 and has the
7031          * 'depends' charset specified, as it means unicode when utf8  */
7032         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7033     }
7034
7035     RExC_precomp = exp;
7036     RExC_precomp_adj = 0;
7037     RExC_flags = rx_flags;
7038     RExC_pm_flags = pm_flags;
7039
7040     if (runtime_code) {
7041         assert(TAINTING_get || !TAINT_get);
7042         if (TAINT_get)
7043             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7044
7045         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7046             /* whoops, we have a non-utf8 pattern, whilst run-time code
7047              * got compiled as utf8. Try again with a utf8 pattern */
7048             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7049                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7050             goto redo_first_pass;
7051         }
7052     }
7053     assert(!pRExC_state->runtime_code_qr);
7054
7055     RExC_sawback = 0;
7056
7057     RExC_seen = 0;
7058     RExC_maxlen = 0;
7059     RExC_in_lookbehind = 0;
7060     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7061     RExC_extralen = 0;
7062 #ifdef EBCDIC
7063     RExC_recode_x_to_native = 0;
7064 #endif
7065     RExC_in_multi_char_class = 0;
7066
7067     /* First pass: determine size, legality. */
7068     RExC_parse = exp;
7069     RExC_start = RExC_adjusted_start = exp;
7070     RExC_end = exp + plen;
7071     RExC_precomp_end = RExC_end;
7072     RExC_naughty = 0;
7073     RExC_npar = 1;
7074     RExC_nestroot = 0;
7075     RExC_size = 0L;
7076     RExC_emit = (regnode *) &RExC_emit_dummy;
7077     RExC_whilem_seen = 0;
7078     RExC_open_parens = NULL;
7079     RExC_close_parens = NULL;
7080     RExC_end_op = NULL;
7081     RExC_paren_names = NULL;
7082 #ifdef DEBUGGING
7083     RExC_paren_name_list = NULL;
7084 #endif
7085     RExC_recurse = NULL;
7086     RExC_study_chunk_recursed = NULL;
7087     RExC_study_chunk_recursed_bytes= 0;
7088     RExC_recurse_count = 0;
7089     pRExC_state->code_index = 0;
7090
7091     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7092      * code makes sure the final byte is an uncounted NUL.  But should this
7093      * ever not be the case, lots of things could read beyond the end of the
7094      * buffer: loops like
7095      *      while(isFOO(*RExC_parse)) RExC_parse++;
7096      *      strchr(RExC_parse, "foo");
7097      * etc.  So it is worth noting. */
7098     assert(*RExC_end == '\0');
7099
7100     DEBUG_PARSE_r(
7101         Perl_re_printf( aTHX_  "Starting first pass (sizing)\n");
7102         RExC_lastnum=0;
7103         RExC_lastparse=NULL;
7104     );
7105
7106     if (reg(pRExC_state, 0, &flags,1) == NULL) {
7107         /* It's possible to write a regexp in ascii that represents Unicode
7108         codepoints outside of the byte range, such as via \x{100}. If we
7109         detect such a sequence we have to convert the entire pattern to utf8
7110         and then recompile, as our sizing calculation will have been based
7111         on 1 byte == 1 character, but we will need to use utf8 to encode
7112         at least some part of the pattern, and therefore must convert the whole
7113         thing.
7114         -- dmq */
7115         if (flags & RESTART_PASS1) {
7116             if (flags & NEED_UTF8) {
7117                 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7118                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7119             }
7120             else {
7121                 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7122                 "Need to redo pass 1\n"));
7123             }
7124
7125             goto redo_first_pass;
7126         }
7127         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#" UVxf, (UV) flags);
7128     }
7129
7130     DEBUG_PARSE_r({
7131         Perl_re_printf( aTHX_
7132             "Required size %" IVdf " nodes\n"
7133             "Starting second pass (creation)\n",
7134             (IV)RExC_size);
7135         RExC_lastnum=0;
7136         RExC_lastparse=NULL;
7137     });
7138
7139     /* The first pass could have found things that force Unicode semantics */
7140     if ((RExC_utf8 || RExC_uni_semantics)
7141          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
7142     {
7143         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7144     }
7145
7146     /* Small enough for pointer-storage convention?
7147        If extralen==0, this means that we will not need long jumps. */
7148     if (RExC_size >= 0x10000L && RExC_extralen)
7149         RExC_size += RExC_extralen;
7150     else
7151         RExC_extralen = 0;
7152     if (RExC_whilem_seen > 15)
7153         RExC_whilem_seen = 15;
7154
7155     /* Allocate space and zero-initialize. Note, the two step process
7156        of zeroing when in debug mode, thus anything assigned has to
7157        happen after that */
7158     rx = (REGEXP*) newSV_type(SVt_REGEXP);
7159     r = ReANY(rx);
7160     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7161          char, regexp_internal);
7162     if ( r == NULL || ri == NULL )
7163         FAIL("Regexp out of space");
7164 #ifdef DEBUGGING
7165     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
7166     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7167          char);
7168 #else
7169     /* bulk initialize base fields with 0. */
7170     Zero(ri, sizeof(regexp_internal), char);
7171 #endif
7172
7173     /* non-zero initialization begins here */
7174     RXi_SET( r, ri );
7175     r->engine= eng;
7176     r->extflags = rx_flags;
7177     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7178
7179     if (pm_flags & PMf_IS_QR) {
7180         ri->code_blocks = pRExC_state->code_blocks;
7181         if (ri->code_blocks)
7182             ri->code_blocks->refcnt++;
7183     }
7184
7185     {
7186         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7187         bool has_charset = (get_regex_charset(r->extflags)
7188                                                     != REGEX_DEPENDS_CHARSET);
7189
7190         /* The caret is output if there are any defaults: if not all the STD
7191          * flags are set, or if no character set specifier is needed */
7192         bool has_default =
7193                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7194                     || ! has_charset);
7195         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7196                                                    == REG_RUN_ON_COMMENT_SEEN);
7197         U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
7198                             >> RXf_PMf_STD_PMMOD_SHIFT);
7199         const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7200         char *p;
7201
7202         /* We output all the necessary flags; we never output a minus, as all
7203          * those are defaults, so are
7204          * covered by the caret */
7205         const STRLEN wraplen = plen + has_p + has_runon
7206             + has_default       /* If needs a caret */
7207             + PL_bitcount[reganch] /* 1 char for each set standard flag */
7208
7209                 /* If needs a character set specifier */
7210             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7211             + (sizeof("(?:)") - 1);
7212
7213         /* make sure PL_bitcount bounds not exceeded */
7214         assert(sizeof(STD_PAT_MODS) <= 8);
7215
7216         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
7217         r->xpv_len_u.xpvlenu_pv = p;
7218         if (RExC_utf8)
7219             SvFLAGS(rx) |= SVf_UTF8;
7220         *p++='('; *p++='?';
7221
7222         /* If a default, cover it using the caret */
7223         if (has_default) {
7224             *p++= DEFAULT_PAT_MOD;
7225         }
7226         if (has_charset) {
7227             STRLEN len;
7228             const char* const name = get_regex_charset_name(r->extflags, &len);
7229             Copy(name, p, len, char);
7230             p += len;
7231         }
7232         if (has_p)
7233             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7234         {
7235             char ch;
7236             while((ch = *fptr++)) {
7237                 if(reganch & 1)
7238                     *p++ = ch;
7239                 reganch >>= 1;
7240             }
7241         }
7242
7243         *p++ = ':';
7244         Copy(RExC_precomp, p, plen, char);
7245         assert ((RX_WRAPPED(rx) - p) < 16);
7246         r->pre_prefix = p - RX_WRAPPED(rx);
7247         p += plen;
7248         if (has_runon)
7249             *p++ = '\n';
7250         *p++ = ')';
7251         *p = 0;
7252         SvCUR_set(rx, p - RX_WRAPPED(rx));
7253     }
7254
7255     r->intflags = 0;
7256     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
7257
7258     /* Useful during FAIL. */
7259 #ifdef RE_TRACK_PATTERN_OFFSETS
7260     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
7261     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7262                           "%s %" UVuf " bytes for offset annotations.\n",
7263                           ri->u.offsets ? "Got" : "Couldn't get",
7264                           (UV)((2*RExC_size+1) * sizeof(U32))));
7265 #endif
7266     SetProgLen(ri,RExC_size);
7267     RExC_rx_sv = rx;
7268     RExC_rx = r;
7269     RExC_rxi = ri;
7270
7271     /* Second pass: emit code. */
7272     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7273     RExC_pm_flags = pm_flags;
7274     RExC_parse = exp;
7275     RExC_end = exp + plen;
7276     RExC_naughty = 0;
7277     RExC_emit_start = ri->program;
7278     RExC_emit = ri->program;
7279     RExC_emit_bound = ri->program + RExC_size + 1;
7280     pRExC_state->code_index = 0;
7281
7282     *((char*) RExC_emit++) = (char) REG_MAGIC;
7283     /* setup various meta data about recursion, this all requires
7284      * RExC_npar to be correctly set, and a bit later on we clear it */
7285     if (RExC_seen & REG_RECURSE_SEEN) {
7286         DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
7287             "%*s%*s Setting up open/close parens\n",
7288                   22, "|    |", (int)(0 * 2 + 1), ""));
7289
7290         /* setup RExC_open_parens, which holds the address of each
7291          * OPEN tag, and to make things simpler for the 0 index
7292          * the start of the program - this is used later for offsets */
7293         Newxz(RExC_open_parens, RExC_npar,regnode *);
7294         SAVEFREEPV(RExC_open_parens);
7295         RExC_open_parens[0] = RExC_emit;
7296
7297         /* setup RExC_close_parens, which holds the address of each
7298          * CLOSE tag, and to make things simpler for the 0 index
7299          * the end of the program - this is used later for offsets */
7300         Newxz(RExC_close_parens, RExC_npar,regnode *);
7301         SAVEFREEPV(RExC_close_parens);
7302         /* we dont know where end op starts yet, so we dont
7303          * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */
7304
7305         /* Note, RExC_npar is 1 + the number of parens in a pattern.
7306          * So its 1 if there are no parens. */
7307         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
7308                                          ((RExC_npar & 0x07) != 0);
7309         Newx(RExC_study_chunk_recursed,
7310              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7311         SAVEFREEPV(RExC_study_chunk_recursed);
7312     }
7313     RExC_npar = 1;
7314     if (reg(pRExC_state, 0, &flags,1) == NULL) {
7315         ReREFCNT_dec(rx);
7316         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#" UVxf, (UV) flags);
7317     }
7318     DEBUG_OPTIMISE_r(
7319         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7320     );
7321
7322     /* XXXX To minimize changes to RE engine we always allocate
7323        3-units-long substrs field. */
7324     Newx(r->substrs, 1, struct reg_substr_data);
7325     if (RExC_recurse_count) {
7326         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
7327         SAVEFREEPV(RExC_recurse);
7328     }
7329
7330   reStudy:
7331     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7332     DEBUG_r(
7333         RExC_study_chunk_recursed_count= 0;
7334     );
7335     Zero(r->substrs, 1, struct reg_substr_data);
7336     if (RExC_study_chunk_recursed) {
7337         Zero(RExC_study_chunk_recursed,
7338              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7339     }
7340
7341
7342 #ifdef TRIE_STUDY_OPT
7343     if (!restudied) {
7344         StructCopy(&zero_scan_data, &data, scan_data_t);
7345         copyRExC_state = RExC_state;
7346     } else {
7347         U32 seen=RExC_seen;
7348         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7349
7350         RExC_state = copyRExC_state;
7351         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7352             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7353         else
7354             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7355         StructCopy(&zero_scan_data, &data, scan_data_t);
7356     }
7357 #else
7358     StructCopy(&zero_scan_data, &data, scan_data_t);
7359 #endif
7360
7361     /* Dig out information for optimizations. */
7362     r->extflags = RExC_flags; /* was pm_op */
7363     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7364
7365     if (UTF)
7366         SvUTF8_on(rx);  /* Unicode in it? */
7367     ri->regstclass = NULL;
7368     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
7369         r->intflags |= PREGf_NAUGHTY;
7370     scan = ri->program + 1;             /* First BRANCH. */
7371
7372     /* testing for BRANCH here tells us whether there is "must appear"
7373        data in the pattern. If there is then we can use it for optimisations */
7374     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7375                                                   */
7376         SSize_t fake;
7377         STRLEN longest_float_length, longest_fixed_length;
7378         regnode_ssc ch_class; /* pointed to by data */
7379         int stclass_flag;
7380         SSize_t last_close = 0; /* pointed to by data */
7381         regnode *first= scan;
7382         regnode *first_next= regnext(first);
7383         /*
7384          * Skip introductions and multiplicators >= 1
7385          * so that we can extract the 'meat' of the pattern that must
7386          * match in the large if() sequence following.
7387          * NOTE that EXACT is NOT covered here, as it is normally
7388          * picked up by the optimiser separately.
7389          *
7390          * This is unfortunate as the optimiser isnt handling lookahead
7391          * properly currently.
7392          *
7393          */
7394         while ((OP(first) == OPEN && (sawopen = 1)) ||
7395                /* An OR of *one* alternative - should not happen now. */
7396             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7397             /* for now we can't handle lookbehind IFMATCH*/
7398             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7399             (OP(first) == PLUS) ||
7400             (OP(first) == MINMOD) ||
7401                /* An {n,m} with n>0 */
7402             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7403             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7404         {
7405                 /*
7406                  * the only op that could be a regnode is PLUS, all the rest
7407                  * will be regnode_1 or regnode_2.
7408                  *
7409                  * (yves doesn't think this is true)
7410                  */
7411                 if (OP(first) == PLUS)
7412                     sawplus = 1;
7413                 else {
7414                     if (OP(first) == MINMOD)
7415                         sawminmod = 1;
7416                     first += regarglen[OP(first)];
7417                 }
7418                 first = NEXTOPER(first);
7419                 first_next= regnext(first);
7420         }
7421
7422         /* Starting-point info. */
7423       again:
7424         DEBUG_PEEP("first:",first,0);
7425         /* Ignore EXACT as we deal with it later. */
7426         if (PL_regkind[OP(first)] == EXACT) {
7427             if (OP(first) == EXACT || OP(first) == EXACTL)
7428                 NOOP;   /* Empty, get anchored substr later. */
7429             else
7430                 ri->regstclass = first;
7431         }
7432 #ifdef TRIE_STCLASS
7433         else if (PL_regkind[OP(first)] == TRIE &&
7434                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7435         {
7436             /* this can happen only on restudy */
7437             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7438         }
7439 #endif
7440         else if (REGNODE_SIMPLE(OP(first)))
7441             ri->regstclass = first;
7442         else if (PL_regkind[OP(first)] == BOUND ||
7443                  PL_regkind[OP(first)] == NBOUND)
7444             ri->regstclass = first;
7445         else if (PL_regkind[OP(first)] == BOL) {
7446             r->intflags |= (OP(first) == MBOL
7447                            ? PREGf_ANCH_MBOL
7448                            : PREGf_ANCH_SBOL);
7449             first = NEXTOPER(first);
7450             goto again;
7451         }
7452         else if (OP(first) == GPOS) {
7453             r->intflags |= PREGf_ANCH_GPOS;
7454             first = NEXTOPER(first);
7455             goto again;
7456         }
7457         else if ((!sawopen || !RExC_sawback) &&
7458             !sawlookahead &&
7459             (OP(first) == STAR &&
7460             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7461             !(r->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
7462         {
7463             /* turn .* into ^.* with an implied $*=1 */
7464             const int type =
7465                 (OP(NEXTOPER(first)) == REG_ANY)
7466                     ? PREGf_ANCH_MBOL
7467                     : PREGf_ANCH_SBOL;
7468             r->intflags |= (type | PREGf_IMPLICIT);
7469             first = NEXTOPER(first);
7470             goto again;
7471         }
7472         if (sawplus && !sawminmod && !sawlookahead
7473             && (!sawopen || !RExC_sawback)
7474             && !pRExC_state->code_blocks) /* May examine pos and $& */
7475             /* x+ must match at the 1st pos of run of x's */
7476             r->intflags |= PREGf_SKIP;
7477
7478         /* Scan is after the zeroth branch, first is atomic matcher. */
7479 #ifdef TRIE_STUDY_OPT
7480         DEBUG_PARSE_r(
7481             if (!restudied)
7482                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
7483                               (IV)(first - scan + 1))
7484         );
7485 #else
7486         DEBUG_PARSE_r(
7487             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
7488                 (IV)(first - scan + 1))
7489         );
7490 #endif
7491
7492
7493         /*
7494         * If there's something expensive in the r.e., find the
7495         * longest literal string that must appear and make it the
7496         * regmust.  Resolve ties in favor of later strings, since
7497         * the regstart check works with the beginning of the r.e.
7498         * and avoiding duplication strengthens checking.  Not a
7499         * strong reason, but sufficient in the absence of others.
7500         * [Now we resolve ties in favor of the earlier string if
7501         * it happens that c_offset_min has been invalidated, since the
7502         * earlier string may buy us something the later one won't.]
7503         */
7504
7505         data.longest_fixed = newSVpvs("");
7506         data.longest_float = newSVpvs("");
7507         data.last_found = newSVpvs("");
7508         data.longest = &(data.longest_fixed);
7509         ENTER_with_name("study_chunk");
7510         SAVEFREESV(data.longest_fixed);
7511         SAVEFREESV(data.longest_float);
7512         SAVEFREESV(data.last_found);
7513         first = scan;
7514         if (!ri->regstclass) {
7515             ssc_init(pRExC_state, &ch_class);
7516             data.start_class = &ch_class;
7517             stclass_flag = SCF_DO_STCLASS_AND;
7518         } else                          /* XXXX Check for BOUND? */
7519             stclass_flag = 0;
7520         data.last_closep = &last_close;
7521
7522         DEBUG_RExC_seen();
7523         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7524                              scan + RExC_size, /* Up to end */
7525             &data, -1, 0, NULL,
7526             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7527                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7528             0);
7529
7530
7531         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7532
7533
7534         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7535              && data.last_start_min == 0 && data.last_end > 0
7536              && !RExC_seen_zerolen
7537              && !(RExC_seen & REG_VERBARG_SEEN)
7538              && !(RExC_seen & REG_GPOS_SEEN)
7539         ){
7540             r->extflags |= RXf_CHECK_ALL;
7541         }
7542         scan_commit(pRExC_state, &data,&minlen,0);
7543
7544         longest_float_length = CHR_SVLEN(data.longest_float);
7545
7546         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7547                    && data.offset_fixed == data.offset_float_min
7548                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7549             && S_setup_longest (aTHX_ pRExC_state,
7550                                     data.longest_float,
7551                                     &(r->float_utf8),
7552                                     &(r->float_substr),
7553                                     &(r->float_end_shift),
7554                                     data.lookbehind_float,
7555                                     data.offset_float_min,
7556                                     data.minlen_float,
7557                                     longest_float_length,
7558                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
7559                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7560         {
7561             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7562             r->float_max_offset = data.offset_float_max;
7563             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7564                 r->float_max_offset -= data.lookbehind_float;
7565             SvREFCNT_inc_simple_void_NN(data.longest_float);
7566         }
7567         else {
7568             r->float_substr = r->float_utf8 = NULL;
7569             longest_float_length = 0;
7570         }
7571
7572         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7573
7574         if (S_setup_longest (aTHX_ pRExC_state,
7575                                 data.longest_fixed,
7576                                 &(r->anchored_utf8),
7577                                 &(r->anchored_substr),
7578                                 &(r->anchored_end_shift),
7579                                 data.lookbehind_fixed,
7580                                 data.offset_fixed,
7581                                 data.minlen_fixed,
7582                                 longest_fixed_length,
7583                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7584                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7585         {
7586             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7587             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7588         }
7589         else {
7590             r->anchored_substr = r->anchored_utf8 = NULL;
7591             longest_fixed_length = 0;
7592         }
7593         LEAVE_with_name("study_chunk");
7594
7595         if (ri->regstclass
7596             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7597             ri->regstclass = NULL;
7598
7599         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7600             && stclass_flag
7601             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7602             && is_ssc_worth_it(pRExC_state, data.start_class))
7603         {
7604             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7605
7606             ssc_finalize(pRExC_state, data.start_class);
7607
7608             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7609             StructCopy(data.start_class,
7610                        (regnode_ssc*)RExC_rxi->data->data[n],
7611                        regnode_ssc);
7612             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7613             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7614             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7615                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7616                       Perl_re_printf( aTHX_
7617                                     "synthetic stclass \"%s\".\n",
7618                                     SvPVX_const(sv));});
7619             data.start_class = NULL;
7620         }
7621
7622         /* A temporary algorithm prefers floated substr to fixed one to dig
7623          * more info. */
7624         if (longest_fixed_length > longest_float_length) {
7625             r->substrs->check_ix = 0;
7626             r->check_end_shift = r->anchored_end_shift;
7627             r->check_substr = r->anchored_substr;
7628             r->check_utf8 = r->anchored_utf8;
7629             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7630             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7631                 r->intflags |= PREGf_NOSCAN;
7632         }
7633         else {
7634             r->substrs->check_ix = 1;
7635             r->check_end_shift = r->float_end_shift;
7636             r->check_substr = r->float_substr;
7637             r->check_utf8 = r->float_utf8;
7638             r->check_offset_min = r->float_min_offset;
7639             r->check_offset_max = r->float_max_offset;
7640         }
7641         if ((r->check_substr || r->check_utf8) ) {
7642             r->extflags |= RXf_USE_INTUIT;
7643             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7644                 r->extflags |= RXf_INTUIT_TAIL;
7645         }
7646         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7647
7648         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7649         if ( (STRLEN)minlen < longest_float_length )
7650             minlen= longest_float_length;
7651         if ( (STRLEN)minlen < longest_fixed_length )
7652             minlen= longest_fixed_length;
7653         */
7654     }
7655     else {
7656         /* Several toplevels. Best we can is to set minlen. */
7657         SSize_t fake;
7658         regnode_ssc ch_class;
7659         SSize_t last_close = 0;
7660
7661         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
7662
7663         scan = ri->program + 1;
7664         ssc_init(pRExC_state, &ch_class);
7665         data.start_class = &ch_class;
7666         data.last_closep = &last_close;
7667
7668         DEBUG_RExC_seen();
7669         minlen = study_chunk(pRExC_state,
7670             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7671             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7672                                                       ? SCF_TRIE_DOING_RESTUDY
7673                                                       : 0),
7674             0);
7675
7676         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7677
7678         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7679                 = r->float_substr = r->float_utf8 = NULL;
7680
7681         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7682             && is_ssc_worth_it(pRExC_state, data.start_class))
7683         {
7684             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7685
7686             ssc_finalize(pRExC_state, data.start_class);
7687
7688             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7689             StructCopy(data.start_class,
7690                        (regnode_ssc*)RExC_rxi->data->data[n],
7691                        regnode_ssc);
7692             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7693             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7694             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7695                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7696                       Perl_re_printf( aTHX_
7697                                     "synthetic stclass \"%s\".\n",
7698                                     SvPVX_const(sv));});
7699             data.start_class = NULL;
7700         }
7701     }
7702
7703     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7704         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7705         r->maxlen = REG_INFTY;
7706     }
7707     else {
7708         r->maxlen = RExC_maxlen;
7709     }
7710
7711     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7712        the "real" pattern. */
7713     DEBUG_OPTIMISE_r({
7714         Perl_re_printf( aTHX_ "minlen: %" IVdf " r->minlen:%" IVdf " maxlen:%" IVdf "\n",
7715                       (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7716     });
7717     r->minlenret = minlen;
7718     if (r->minlen < minlen)
7719         r->minlen = minlen;
7720
7721     if (RExC_seen & REG_RECURSE_SEEN ) {
7722         r->intflags |= PREGf_RECURSE_SEEN;
7723         Newxz(r->recurse_locinput, r->nparens + 1, char *);
7724     }
7725     if (RExC_seen & REG_GPOS_SEEN)
7726         r->intflags |= PREGf_GPOS_SEEN;
7727     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7728         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7729                                                 lookbehind */
7730     if (pRExC_state->code_blocks)
7731         r->extflags |= RXf_EVAL_SEEN;
7732     if (RExC_seen & REG_VERBARG_SEEN)
7733     {
7734         r->intflags |= PREGf_VERBARG_SEEN;
7735         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7736     }
7737     if (RExC_seen & REG_CUTGROUP_SEEN)
7738         r->intflags |= PREGf_CUTGROUP_SEEN;
7739     if (pm_flags & PMf_USE_RE_EVAL)
7740         r->intflags |= PREGf_USE_RE_EVAL;
7741     if (RExC_paren_names)
7742         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7743     else
7744         RXp_PAREN_NAMES(r) = NULL;
7745
7746     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7747      * so it can be used in pp.c */
7748     if (r->intflags & PREGf_ANCH)
7749         r->extflags |= RXf_IS_ANCHORED;
7750
7751
7752     {
7753         /* this is used to identify "special" patterns that might result
7754          * in Perl NOT calling the regex engine and instead doing the match "itself",
7755          * particularly special cases in split//. By having the regex compiler
7756          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7757          * we avoid weird issues with equivalent patterns resulting in different behavior,
7758          * AND we allow non Perl engines to get the same optimizations by the setting the
7759          * flags appropriately - Yves */
7760         regnode *first = ri->program + 1;
7761         U8 fop = OP(first);
7762         regnode *next = regnext(first);
7763         U8 nop = OP(next);
7764
7765         if (PL_regkind[fop] == NOTHING && nop == END)
7766             r->extflags |= RXf_NULL;
7767         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7768             /* when fop is SBOL first->flags will be true only when it was
7769              * produced by parsing /\A/, and not when parsing /^/. This is
7770              * very important for the split code as there we want to
7771              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7772              * See rt #122761 for more details. -- Yves */
7773             r->extflags |= RXf_START_ONLY;
7774         else if (fop == PLUS
7775                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7776                  && nop == END)
7777             r->extflags |= RXf_WHITE;
7778         else if ( r->extflags & RXf_SPLIT
7779                   && (fop == EXACT || fop == EXACTL)
7780                   && STR_LEN(first) == 1
7781                   && *(STRING(first)) == ' '
7782                   && nop == END )
7783             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7784
7785     }
7786
7787     if (RExC_contains_locale) {
7788         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7789     }
7790
7791 #ifdef DEBUGGING
7792     if (RExC_paren_names) {
7793         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7794         ri->data->data[ri->name_list_idx]
7795                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7796     } else
7797 #endif
7798     ri->name_list_idx = 0;
7799
7800     while ( RExC_recurse_count > 0 ) {
7801         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
7802         /*
7803          * This data structure is set up in study_chunk() and is used
7804          * to calculate the distance between a GOSUB regopcode and
7805          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
7806          * it refers to.
7807          *
7808          * If for some reason someone writes code that optimises
7809          * away a GOSUB opcode then the assert should be changed to
7810          * an if(scan) to guard the ARG2L_SET() - Yves
7811          *
7812          */
7813         assert(scan && OP(scan) == GOSUB);
7814         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan );
7815     }
7816
7817     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7818     /* assume we don't need to swap parens around before we match */
7819     DEBUG_TEST_r({
7820         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
7821             (unsigned long)RExC_study_chunk_recursed_count);
7822     });
7823     DEBUG_DUMP_r({
7824         DEBUG_RExC_seen();
7825         Perl_re_printf( aTHX_ "Final program:\n");
7826         regdump(r);
7827     });
7828 #ifdef RE_TRACK_PATTERN_OFFSETS
7829     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7830         const STRLEN len = ri->u.offsets[0];
7831         STRLEN i;
7832         GET_RE_DEBUG_FLAGS_DECL;
7833         Perl_re_printf( aTHX_
7834                       "Offsets: [%" UVuf "]\n\t", (UV)ri->u.offsets[0]);
7835         for (i = 1; i <= len; i++) {
7836             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7837                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7838                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7839             }
7840         Perl_re_printf( aTHX_  "\n");
7841     });
7842 #endif
7843
7844 #ifdef USE_ITHREADS
7845     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7846      * by setting the regexp SV to readonly-only instead. If the
7847      * pattern's been recompiled, the USEDness should remain. */
7848     if (old_re && SvREADONLY(old_re))
7849         SvREADONLY_on(rx);
7850 #endif
7851     return rx;
7852 }
7853
7854
7855 SV*
7856 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7857                     const U32 flags)
7858 {
7859     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7860
7861     PERL_UNUSED_ARG(value);
7862
7863     if (flags & RXapif_FETCH) {
7864         return reg_named_buff_fetch(rx, key, flags);
7865     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7866         Perl_croak_no_modify();
7867         return NULL;
7868     } else if (flags & RXapif_EXISTS) {
7869         return reg_named_buff_exists(rx, key, flags)
7870             ? &PL_sv_yes
7871             : &PL_sv_no;
7872     } else if (flags & RXapif_REGNAMES) {
7873         return reg_named_buff_all(rx, flags);
7874     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7875         return reg_named_buff_scalar(rx, flags);
7876     } else {
7877         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7878         return NULL;
7879     }
7880 }
7881
7882 SV*
7883 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7884                          const U32 flags)
7885 {
7886     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7887     PERL_UNUSED_ARG(lastkey);
7888
7889     if (flags & RXapif_FIRSTKEY)
7890         return reg_named_buff_firstkey(rx, flags);
7891     else if (flags & RXapif_NEXTKEY)
7892         return reg_named_buff_nextkey(rx, flags);
7893     else {
7894         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7895                                             (int)flags);
7896         return NULL;
7897     }
7898 }
7899
7900 SV*
7901 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7902                           const U32 flags)
7903 {
7904     AV *retarray = NULL;
7905     SV *ret;
7906     struct regexp *const rx = ReANY(r);
7907
7908     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7909
7910     if (flags & RXapif_ALL)
7911         retarray=newAV();
7912
7913     if (rx && RXp_PAREN_NAMES(rx)) {
7914         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7915         if (he_str) {
7916             IV i;
7917             SV* sv_dat=HeVAL(he_str);
7918             I32 *nums=(I32*)SvPVX(sv_dat);
7919             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7920                 if ((I32)(rx->nparens) >= nums[i]
7921                     && rx->offs[nums[i]].start != -1
7922                     && rx->offs[nums[i]].end != -1)
7923                 {
7924                     ret = newSVpvs("");
7925                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7926                     if (!retarray)
7927                         return ret;
7928                 } else {
7929                     if (retarray)
7930                         ret = newSVsv(&PL_sv_undef);
7931                 }
7932                 if (retarray)
7933                     av_push(retarray, ret);
7934             }
7935             if (retarray)
7936                 return newRV_noinc(MUTABLE_SV(retarray));
7937         }
7938     }
7939     return NULL;
7940 }
7941
7942 bool
7943 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7944                            const U32 flags)
7945 {
7946     struct regexp *const rx = ReANY(r);
7947
7948     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7949
7950     if (rx && RXp_PAREN_NAMES(rx)) {
7951         if (flags & RXapif_ALL) {
7952             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7953         } else {
7954             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7955             if (sv) {
7956                 SvREFCNT_dec_NN(sv);
7957                 return TRUE;
7958             } else {
7959                 return FALSE;
7960             }
7961         }
7962     } else {
7963         return FALSE;
7964     }
7965 }
7966
7967 SV*
7968 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7969 {
7970     struct regexp *const rx = ReANY(r);
7971
7972     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7973
7974     if ( rx && RXp_PAREN_NAMES(rx) ) {
7975         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7976
7977         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7978     } else {
7979         return FALSE;
7980     }
7981 }
7982
7983 SV*
7984 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7985 {
7986     struct regexp *const rx = ReANY(r);
7987     GET_RE_DEBUG_FLAGS_DECL;
7988
7989     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7990
7991     if (rx && RXp_PAREN_NAMES(rx)) {
7992         HV *hv = RXp_PAREN_NAMES(rx);
7993         HE *temphe;
7994         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7995             IV i;
7996             IV parno = 0;
7997             SV* sv_dat = HeVAL(temphe);
7998             I32 *nums = (I32*)SvPVX(sv_dat);
7999             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8000                 if ((I32)(rx->lastparen) >= nums[i] &&
8001                     rx->offs[nums[i]].start != -1 &&
8002                     rx->offs[nums[i]].end != -1)
8003                 {
8004                     parno = nums[i];
8005                     break;
8006                 }
8007             }
8008             if (parno || flags & RXapif_ALL) {
8009                 return newSVhek(HeKEY_hek(temphe));
8010             }
8011         }
8012     }
8013     return NULL;
8014 }
8015
8016 SV*
8017 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8018 {
8019     SV *ret;
8020     AV *av;
8021     SSize_t length;
8022     struct regexp *const rx = ReANY(r);
8023
8024     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8025
8026     if (rx && RXp_PAREN_NAMES(rx)) {
8027         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8028             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8029         } else if (flags & RXapif_ONE) {
8030             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8031             av = MUTABLE_AV(SvRV(ret));
8032             length = av_tindex(av);
8033             SvREFCNT_dec_NN(ret);
8034             return newSViv(length + 1);
8035         } else {
8036             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8037                                                 (int)flags);
8038             return NULL;
8039         }
8040     }
8041     return &PL_sv_undef;
8042 }
8043
8044 SV*
8045 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8046 {
8047     struct regexp *const rx = ReANY(r);
8048     AV *av = newAV();
8049
8050     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8051
8052     if (rx && RXp_PAREN_NAMES(rx)) {
8053         HV *hv= RXp_PAREN_NAMES(rx);
8054         HE *temphe;
8055         (void)hv_iterinit(hv);
8056         while ( (temphe = hv_iternext_flags(hv,0)) ) {
8057             IV i;
8058             IV parno = 0;
8059             SV* sv_dat = HeVAL(temphe);
8060             I32 *nums = (I32*)SvPVX(sv_dat);
8061             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8062                 if ((I32)(rx->lastparen) >= nums[i] &&
8063                     rx->offs[nums[i]].start != -1 &&
8064                     rx->offs[nums[i]].end != -1)
8065                 {
8066                     parno = nums[i];
8067                     break;
8068                 }
8069             }
8070             if (parno || flags & RXapif_ALL) {
8071                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8072             }
8073         }
8074     }
8075
8076     return newRV_noinc(MUTABLE_SV(av));
8077 }
8078
8079 void
8080 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8081                              SV * const sv)
8082 {
8083     struct regexp *const rx = ReANY(r);
8084     char *s = NULL;
8085     SSize_t i = 0;
8086     SSize_t s1, t1;
8087     I32 n = paren;
8088
8089     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8090
8091     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8092            || n == RX_BUFF_IDX_CARET_FULLMATCH
8093            || n == RX_BUFF_IDX_CARET_POSTMATCH
8094        )
8095     {
8096         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8097         if (!keepcopy) {
8098             /* on something like
8099              *    $r = qr/.../;
8100              *    /$qr/p;
8101              * the KEEPCOPY is set on the PMOP rather than the regex */
8102             if (PL_curpm && r == PM_GETRE(PL_curpm))
8103                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8104         }
8105         if (!keepcopy)
8106             goto ret_undef;
8107     }
8108
8109     if (!rx->subbeg)
8110         goto ret_undef;
8111
8112     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8113         /* no need to distinguish between them any more */
8114         n = RX_BUFF_IDX_FULLMATCH;
8115
8116     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8117         && rx->offs[0].start != -1)
8118     {
8119         /* $`, ${^PREMATCH} */
8120         i = rx->offs[0].start;
8121         s = rx->subbeg;
8122     }
8123     else
8124     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8125         && rx->offs[0].end != -1)
8126     {
8127         /* $', ${^POSTMATCH} */
8128         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8129         i = rx->sublen + rx->suboffset - rx->offs[0].end;
8130     }
8131     else
8132     if ( 0 <= n && n <= (I32)rx->nparens &&
8133         (s1 = rx->offs[n].start) != -1 &&
8134         (t1 = rx->offs[n].end) != -1)
8135     {
8136         /* $&, ${^MATCH},  $1 ... */
8137         i = t1 - s1;
8138         s = rx->subbeg + s1 - rx->suboffset;
8139     } else {
8140         goto ret_undef;
8141     }
8142
8143     assert(s >= rx->subbeg);
8144     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8145     if (i >= 0) {
8146 #ifdef NO_TAINT_SUPPORT
8147         sv_setpvn(sv, s, i);
8148 #else
8149         const int oldtainted = TAINT_get;
8150         TAINT_NOT;
8151         sv_setpvn(sv, s, i);
8152         TAINT_set(oldtainted);
8153 #endif
8154         if (RXp_MATCH_UTF8(rx))
8155             SvUTF8_on(sv);
8156         else
8157             SvUTF8_off(sv);
8158         if (TAINTING_get) {
8159             if (RXp_MATCH_TAINTED(rx)) {
8160                 if (SvTYPE(sv) >= SVt_PVMG) {
8161                     MAGIC* const mg = SvMAGIC(sv);
8162                     MAGIC* mgt;
8163                     TAINT;
8164                     SvMAGIC_set(sv, mg->mg_moremagic);
8165                     SvTAINT(sv);
8166                     if ((mgt = SvMAGIC(sv))) {
8167                         mg->mg_moremagic = mgt;
8168                         SvMAGIC_set(sv, mg);
8169                     }
8170                 } else {
8171                     TAINT;
8172                     SvTAINT(sv);
8173                 }
8174             } else
8175                 SvTAINTED_off(sv);
8176         }
8177     } else {
8178       ret_undef:
8179         sv_set_undef(sv);
8180         return;
8181     }
8182 }
8183
8184 void
8185 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8186                                                          SV const * const value)
8187 {
8188     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8189
8190     PERL_UNUSED_ARG(rx);
8191     PERL_UNUSED_ARG(paren);
8192     PERL_UNUSED_ARG(value);
8193
8194     if (!PL_localizing)
8195         Perl_croak_no_modify();
8196 }
8197
8198 I32
8199 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8200                               const I32 paren)
8201 {
8202     struct regexp *const rx = ReANY(r);
8203     I32 i;
8204     I32 s1, t1;
8205
8206     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8207
8208     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8209         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8210         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8211     )
8212     {
8213         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8214         if (!keepcopy) {
8215             /* on something like
8216              *    $r = qr/.../;
8217              *    /$qr/p;
8218              * the KEEPCOPY is set on the PMOP rather than the regex */
8219             if (PL_curpm && r == PM_GETRE(PL_curpm))
8220                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8221         }
8222         if (!keepcopy)
8223             goto warn_undef;
8224     }
8225
8226     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8227     switch (paren) {
8228       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8229       case RX_BUFF_IDX_PREMATCH:       /* $` */
8230         if (rx->offs[0].start != -1) {
8231                         i = rx->offs[0].start;
8232                         if (i > 0) {
8233                                 s1 = 0;
8234                                 t1 = i;
8235                                 goto getlen;
8236                         }
8237             }
8238         return 0;
8239
8240       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8241       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8242             if (rx->offs[0].end != -1) {
8243                         i = rx->sublen - rx->offs[0].end;
8244                         if (i > 0) {
8245                                 s1 = rx->offs[0].end;
8246                                 t1 = rx->sublen;
8247                                 goto getlen;
8248                         }
8249             }
8250         return 0;
8251
8252       default: /* $& / ${^MATCH}, $1, $2, ... */
8253             if (paren <= (I32)rx->nparens &&
8254             (s1 = rx->offs[paren].start) != -1 &&
8255             (t1 = rx->offs[paren].end) != -1)
8256             {
8257             i = t1 - s1;
8258             goto getlen;
8259         } else {
8260           warn_undef:
8261             if (ckWARN(WARN_UNINITIALIZED))
8262                 report_uninit((const SV *)sv);
8263             return 0;
8264         }
8265     }
8266   getlen:
8267     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8268         const char * const s = rx->subbeg - rx->suboffset + s1;
8269         const U8 *ep;
8270         STRLEN el;
8271
8272         i = t1 - s1;
8273         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8274                         i = el;
8275     }
8276     return i;
8277 }
8278
8279 SV*
8280 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8281 {
8282     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8283         PERL_UNUSED_ARG(rx);
8284         if (0)
8285             return NULL;
8286         else
8287             return newSVpvs("Regexp");
8288 }
8289
8290 /* Scans the name of a named buffer from the pattern.
8291  * If flags is REG_RSN_RETURN_NULL returns null.
8292  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8293  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8294  * to the parsed name as looked up in the RExC_paren_names hash.
8295  * If there is an error throws a vFAIL().. type exception.
8296  */
8297
8298 #define REG_RSN_RETURN_NULL    0
8299 #define REG_RSN_RETURN_NAME    1
8300 #define REG_RSN_RETURN_DATA    2
8301
8302 STATIC SV*
8303 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8304 {
8305     char *name_start = RExC_parse;
8306
8307     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8308
8309     assert (RExC_parse <= RExC_end);
8310     if (RExC_parse == RExC_end) NOOP;
8311     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8312          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8313           * using do...while */
8314         if (UTF)
8315             do {
8316                 RExC_parse += UTF8SKIP(RExC_parse);
8317             } while (   RExC_parse < RExC_end
8318                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8319         else
8320             do {
8321                 RExC_parse++;
8322             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8323     } else {
8324         RExC_parse++; /* so the <- from the vFAIL is after the offending
8325                          character */
8326         vFAIL("Group name must start with a non-digit word character");
8327     }
8328     if ( flags ) {
8329         SV* sv_name
8330             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8331                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8332         if ( flags == REG_RSN_RETURN_NAME)
8333             return sv_name;
8334         else if (flags==REG_RSN_RETURN_DATA) {
8335             HE *he_str = NULL;
8336             SV *sv_dat = NULL;
8337             if ( ! sv_name )      /* should not happen*/
8338                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8339             if (RExC_paren_names)
8340                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8341             if ( he_str )
8342                 sv_dat = HeVAL(he_str);
8343             if ( ! sv_dat )
8344                 vFAIL("Reference to nonexistent named group");
8345             return sv_dat;
8346         }
8347         else {
8348             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8349                        (unsigned long) flags);
8350         }
8351         NOT_REACHED; /* NOTREACHED */
8352     }
8353     return NULL;
8354 }
8355
8356 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8357     int num;                                                    \
8358     if (RExC_lastparse!=RExC_parse) {                           \
8359         Perl_re_printf( aTHX_  "%s",                                        \
8360             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8361                 RExC_end - RExC_parse, 16,                      \
8362                 "", "",                                         \
8363                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8364                 PERL_PV_PRETTY_ELLIPSES   |                     \
8365                 PERL_PV_PRETTY_LTGT       |                     \
8366                 PERL_PV_ESCAPE_RE         |                     \
8367                 PERL_PV_PRETTY_EXACTSIZE                        \
8368             )                                                   \
8369         );                                                      \
8370     } else                                                      \
8371         Perl_re_printf( aTHX_ "%16s","");                                   \
8372                                                                 \
8373     if (SIZE_ONLY)                                              \
8374        num = RExC_size + 1;                                     \
8375     else                                                        \
8376        num=REG_NODE_NUM(RExC_emit);                             \
8377     if (RExC_lastnum!=num)                                      \
8378        Perl_re_printf( aTHX_ "|%4d",num);                                   \
8379     else                                                        \
8380        Perl_re_printf( aTHX_ "|%4s","");                                    \
8381     Perl_re_printf( aTHX_ "|%*s%-4s",                                       \
8382         (int)((depth*2)), "",                                   \
8383         (funcname)                                              \
8384     );                                                          \
8385     RExC_lastnum=num;                                           \
8386     RExC_lastparse=RExC_parse;                                  \
8387 })
8388
8389
8390
8391 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8392     DEBUG_PARSE_MSG((funcname));                            \
8393     Perl_re_printf( aTHX_ "%4s","\n");                                  \
8394 })
8395 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
8396     DEBUG_PARSE_MSG((funcname));                            \
8397     Perl_re_printf( aTHX_ fmt "\n",args);                               \
8398 })
8399
8400 /* This section of code defines the inversion list object and its methods.  The
8401  * interfaces are highly subject to change, so as much as possible is static to
8402  * this file.  An inversion list is here implemented as a malloc'd C UV array
8403  * as an SVt_INVLIST scalar.
8404  *
8405  * An inversion list for Unicode is an array of code points, sorted by ordinal
8406  * number.  Each element gives the code point that begins a range that extends
8407  * up-to but not including the code point given by the next element.  The final
8408  * element gives the first code point of a range that extends to the platform's
8409  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
8410  * ...) give ranges whose code points are all in the inversion list.  We say
8411  * that those ranges are in the set.  The odd-numbered elements give ranges
8412  * whose code points are not in the inversion list, and hence not in the set.
8413  * Thus, element [0] is the first code point in the list.  Element [1]
8414  * is the first code point beyond that not in the list; and element [2] is the
8415  * first code point beyond that that is in the list.  In other words, the first
8416  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8417  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
8418  * all code points in that range are not in the inversion list.  The third
8419  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8420  * list, and so forth.  Thus every element whose index is divisible by two
8421  * gives the beginning of a range that is in the list, and every element whose
8422  * index is not divisible by two gives the beginning of a range not in the
8423  * list.  If the final element's index is divisible by two, the inversion list
8424  * extends to the platform's infinity; otherwise the highest code point in the
8425  * inversion list is the contents of that element minus 1.
8426  *
8427  * A range that contains just a single code point N will look like
8428  *  invlist[i]   == N
8429  *  invlist[i+1] == N+1
8430  *
8431  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8432  * impossible to represent, so element [i+1] is omitted.  The single element
8433  * inversion list
8434  *  invlist[0] == UV_MAX
8435  * contains just UV_MAX, but is interpreted as matching to infinity.
8436  *
8437  * Taking the complement (inverting) an inversion list is quite simple, if the
8438  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8439  * This implementation reserves an element at the beginning of each inversion
8440  * list to always contain 0; there is an additional flag in the header which
8441  * indicates if the list begins at the 0, or is offset to begin at the next
8442  * element.  This means that the inversion list can be inverted without any
8443  * copying; just flip the flag.
8444  *
8445  * More about inversion lists can be found in "Unicode Demystified"
8446  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8447  *
8448  * The inversion list data structure is currently implemented as an SV pointing
8449  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8450  * array of UV whose memory management is automatically handled by the existing
8451  * facilities for SV's.
8452  *
8453  * Some of the methods should always be private to the implementation, and some
8454  * should eventually be made public */
8455
8456 /* The header definitions are in F<invlist_inline.h> */
8457
8458 #ifndef PERL_IN_XSUB_RE
8459
8460 PERL_STATIC_INLINE UV*
8461 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8462 {
8463     /* Returns a pointer to the first element in the inversion list's array.
8464      * This is called upon initialization of an inversion list.  Where the
8465      * array begins depends on whether the list has the code point U+0000 in it
8466      * or not.  The other parameter tells it whether the code that follows this
8467      * call is about to put a 0 in the inversion list or not.  The first
8468      * element is either the element reserved for 0, if TRUE, or the element
8469      * after it, if FALSE */
8470
8471     bool* offset = get_invlist_offset_addr(invlist);
8472     UV* zero_addr = (UV *) SvPVX(invlist);
8473
8474     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8475
8476     /* Must be empty */
8477     assert(! _invlist_len(invlist));
8478
8479     *zero_addr = 0;
8480
8481     /* 1^1 = 0; 1^0 = 1 */
8482     *offset = 1 ^ will_have_0;
8483     return zero_addr + *offset;
8484 }
8485
8486 #endif
8487
8488 PERL_STATIC_INLINE void
8489 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8490 {
8491     /* Sets the current number of elements stored in the inversion list.
8492      * Updates SvCUR correspondingly */
8493     PERL_UNUSED_CONTEXT;
8494     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8495
8496     assert(SvTYPE(invlist) == SVt_INVLIST);
8497
8498     SvCUR_set(invlist,
8499               (len == 0)
8500                ? 0
8501                : TO_INTERNAL_SIZE(len + offset));
8502     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8503 }
8504
8505 #ifndef PERL_IN_XSUB_RE
8506
8507 STATIC void
8508 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
8509 {
8510     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
8511      * steals the list from 'src', so 'src' is made to have a NULL list.  This
8512      * is similar to what SvSetMagicSV() would do, if it were implemented on
8513      * inversion lists, though this routine avoids a copy */
8514
8515     const UV src_len          = _invlist_len(src);
8516     const bool src_offset     = *get_invlist_offset_addr(src);
8517     const STRLEN src_byte_len = SvLEN(src);
8518     char * array              = SvPVX(src);
8519
8520     const int oldtainted = TAINT_get;
8521
8522     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
8523
8524     assert(SvTYPE(src) == SVt_INVLIST);
8525     assert(SvTYPE(dest) == SVt_INVLIST);
8526     assert(! invlist_is_iterating(src));
8527     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
8528
8529     /* Make sure it ends in the right place with a NUL, as our inversion list
8530      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
8531      * asserts it */
8532     array[src_byte_len - 1] = '\0';
8533
8534     TAINT_NOT;      /* Otherwise it breaks */
8535     sv_usepvn_flags(dest,
8536                     (char *) array,
8537                     src_byte_len - 1,
8538
8539                     /* This flag is documented to cause a copy to be avoided */
8540                     SV_HAS_TRAILING_NUL);
8541     TAINT_set(oldtainted);
8542     SvPV_set(src, 0);
8543     SvLEN_set(src, 0);
8544     SvCUR_set(src, 0);
8545
8546     /* Finish up copying over the other fields in an inversion list */
8547     *get_invlist_offset_addr(dest) = src_offset;
8548     invlist_set_len(dest, src_len, src_offset);
8549     *get_invlist_previous_index_addr(dest) = 0;
8550     invlist_iterfinish(dest);
8551 }
8552
8553 PERL_STATIC_INLINE IV*
8554 S_get_invlist_previous_index_addr(SV* invlist)
8555 {
8556     /* Return the address of the IV that is reserved to hold the cached index
8557      * */
8558     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8559
8560     assert(SvTYPE(invlist) == SVt_INVLIST);
8561
8562     return &(((XINVLIST*) SvANY(invlist))->prev_index);
8563 }
8564
8565 PERL_STATIC_INLINE IV
8566 S_invlist_previous_index(SV* const invlist)
8567 {
8568     /* Returns cached index of previous search */
8569
8570     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8571
8572     return *get_invlist_previous_index_addr(invlist);
8573 }
8574
8575 PERL_STATIC_INLINE void
8576 S_invlist_set_previous_index(SV* const invlist, const IV index)
8577 {
8578     /* Caches <index> for later retrieval */
8579
8580     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8581
8582     assert(index == 0 || index < (int) _invlist_len(invlist));
8583
8584     *get_invlist_previous_index_addr(invlist) = index;
8585 }
8586
8587 PERL_STATIC_INLINE void
8588 S_invlist_trim(SV* invlist)
8589 {
8590     /* Free the not currently-being-used space in an inversion list */
8591
8592     /* But don't free up the space needed for the 0 UV that is always at the
8593      * beginning of the list, nor the trailing NUL */
8594     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
8595
8596     PERL_ARGS_ASSERT_INVLIST_TRIM;
8597
8598     assert(SvTYPE(invlist) == SVt_INVLIST);
8599
8600     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
8601 }
8602
8603 PERL_STATIC_INLINE void
8604 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
8605 {
8606     PERL_ARGS_ASSERT_INVLIST_CLEAR;
8607
8608     assert(SvTYPE(invlist) == SVt_INVLIST);
8609
8610     invlist_set_len(invlist, 0, 0);
8611     invlist_trim(invlist);
8612 }
8613
8614 #endif /* ifndef PERL_IN_XSUB_RE */
8615
8616 PERL_STATIC_INLINE bool
8617 S_invlist_is_iterating(SV* const invlist)
8618 {
8619     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8620
8621     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8622 }
8623
8624 #ifndef PERL_IN_XSUB_RE
8625
8626 PERL_STATIC_INLINE UV
8627 S_invlist_max(SV* const invlist)
8628 {
8629     /* Returns the maximum number of elements storable in the inversion list's
8630      * array, without having to realloc() */
8631
8632     PERL_ARGS_ASSERT_INVLIST_MAX;
8633
8634     assert(SvTYPE(invlist) == SVt_INVLIST);
8635
8636     /* Assumes worst case, in which the 0 element is not counted in the
8637      * inversion list, so subtracts 1 for that */
8638     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8639            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8640            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8641 }
8642 SV*
8643 Perl__new_invlist(pTHX_ IV initial_size)
8644 {
8645
8646     /* Return a pointer to a newly constructed inversion list, with enough
8647      * space to store 'initial_size' elements.  If that number is negative, a
8648      * system default is used instead */
8649
8650     SV* new_list;
8651
8652     if (initial_size < 0) {
8653         initial_size = 10;
8654     }
8655
8656     /* Allocate the initial space */
8657     new_list = newSV_type(SVt_INVLIST);
8658
8659     /* First 1 is in case the zero element isn't in the list; second 1 is for
8660      * trailing NUL */
8661     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8662     invlist_set_len(new_list, 0, 0);
8663
8664     /* Force iterinit() to be used to get iteration to work */
8665     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8666
8667     *get_invlist_previous_index_addr(new_list) = 0;
8668
8669     return new_list;
8670 }
8671
8672 SV*
8673 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8674 {
8675     /* Return a pointer to a newly constructed inversion list, initialized to
8676      * point to <list>, which has to be in the exact correct inversion list
8677      * form, including internal fields.  Thus this is a dangerous routine that
8678      * should not be used in the wrong hands.  The passed in 'list' contains
8679      * several header fields at the beginning that are not part of the
8680      * inversion list body proper */
8681
8682     const STRLEN length = (STRLEN) list[0];
8683     const UV version_id =          list[1];
8684     const bool offset   =    cBOOL(list[2]);
8685 #define HEADER_LENGTH 3
8686     /* If any of the above changes in any way, you must change HEADER_LENGTH
8687      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8688      *      perl -E 'say int(rand 2**31-1)'
8689      */
8690 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8691                                         data structure type, so that one being
8692                                         passed in can be validated to be an
8693                                         inversion list of the correct vintage.
8694                                        */
8695
8696     SV* invlist = newSV_type(SVt_INVLIST);
8697
8698     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8699
8700     if (version_id != INVLIST_VERSION_ID) {
8701         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8702     }
8703
8704     /* The generated array passed in includes header elements that aren't part
8705      * of the list proper, so start it just after them */
8706     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8707
8708     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8709                                shouldn't touch it */
8710
8711     *(get_invlist_offset_addr(invlist)) = offset;
8712
8713     /* The 'length' passed to us is the physical number of elements in the
8714      * inversion list.  But if there is an offset the logical number is one
8715      * less than that */
8716     invlist_set_len(invlist, length  - offset, offset);
8717
8718     invlist_set_previous_index(invlist, 0);
8719
8720     /* Initialize the iteration pointer. */
8721     invlist_iterfinish(invlist);
8722
8723     SvREADONLY_on(invlist);
8724
8725     return invlist;
8726 }
8727
8728 STATIC void
8729 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8730 {
8731     /* Grow the maximum size of an inversion list */
8732
8733     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8734
8735     assert(SvTYPE(invlist) == SVt_INVLIST);
8736
8737     /* Add one to account for the zero element at the beginning which may not
8738      * be counted by the calling parameters */
8739     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8740 }
8741
8742 STATIC void
8743 S__append_range_to_invlist(pTHX_ SV* const invlist,
8744                                  const UV start, const UV end)
8745 {
8746    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8747     * the end of the inversion list.  The range must be above any existing
8748     * ones. */
8749
8750     UV* array;
8751     UV max = invlist_max(invlist);
8752     UV len = _invlist_len(invlist);
8753     bool offset;
8754
8755     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8756
8757     if (len == 0) { /* Empty lists must be initialized */
8758         offset = start != 0;
8759         array = _invlist_array_init(invlist, ! offset);
8760     }
8761     else {
8762         /* Here, the existing list is non-empty. The current max entry in the
8763          * list is generally the first value not in the set, except when the
8764          * set extends to the end of permissible values, in which case it is
8765          * the first entry in that final set, and so this call is an attempt to
8766          * append out-of-order */
8767
8768         UV final_element = len - 1;
8769         array = invlist_array(invlist);
8770         if (   array[final_element] > start
8771             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8772         {
8773             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
8774                      array[final_element], start,
8775                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8776         }
8777
8778         /* Here, it is a legal append.  If the new range begins 1 above the end
8779          * of the range below it, it is extending the range below it, so the
8780          * new first value not in the set is one greater than the newly
8781          * extended range.  */
8782         offset = *get_invlist_offset_addr(invlist);
8783         if (array[final_element] == start) {
8784             if (end != UV_MAX) {
8785                 array[final_element] = end + 1;
8786             }
8787             else {
8788                 /* But if the end is the maximum representable on the machine,
8789                  * assume that infinity was actually what was meant.  Just let
8790                  * the range that this would extend to have no end */
8791                 invlist_set_len(invlist, len - 1, offset);
8792             }
8793             return;
8794         }
8795     }
8796
8797     /* Here the new range doesn't extend any existing set.  Add it */
8798
8799     len += 2;   /* Includes an element each for the start and end of range */
8800
8801     /* If wll overflow the existing space, extend, which may cause the array to
8802      * be moved */
8803     if (max < len) {
8804         invlist_extend(invlist, len);
8805
8806         /* Have to set len here to avoid assert failure in invlist_array() */
8807         invlist_set_len(invlist, len, offset);
8808
8809         array = invlist_array(invlist);
8810     }
8811     else {
8812         invlist_set_len(invlist, len, offset);
8813     }
8814
8815     /* The next item on the list starts the range, the one after that is
8816      * one past the new range.  */
8817     array[len - 2] = start;
8818     if (end != UV_MAX) {
8819         array[len - 1] = end + 1;
8820     }
8821     else {
8822         /* But if the end is the maximum representable on the machine, just let
8823          * the range have no end */
8824         invlist_set_len(invlist, len - 1, offset);
8825     }
8826 }
8827
8828 SSize_t
8829 Perl__invlist_search(SV* const invlist, const UV cp)
8830 {
8831     /* Searches the inversion list for the entry that contains the input code
8832      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8833      * return value is the index into the list's array of the range that
8834      * contains <cp>, that is, 'i' such that
8835      *  array[i] <= cp < array[i+1]
8836      */
8837
8838     IV low = 0;
8839     IV mid;
8840     IV high = _invlist_len(invlist);
8841     const IV highest_element = high - 1;
8842     const UV* array;
8843
8844     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8845
8846     /* If list is empty, return failure. */
8847     if (high == 0) {
8848         return -1;
8849     }
8850
8851     /* (We can't get the array unless we know the list is non-empty) */
8852     array = invlist_array(invlist);
8853
8854     mid = invlist_previous_index(invlist);
8855     assert(mid >=0);
8856     if (mid > highest_element) {
8857         mid = highest_element;
8858     }
8859
8860     /* <mid> contains the cache of the result of the previous call to this
8861      * function (0 the first time).  See if this call is for the same result,
8862      * or if it is for mid-1.  This is under the theory that calls to this
8863      * function will often be for related code points that are near each other.
8864      * And benchmarks show that caching gives better results.  We also test
8865      * here if the code point is within the bounds of the list.  These tests
8866      * replace others that would have had to be made anyway to make sure that
8867      * the array bounds were not exceeded, and these give us extra information
8868      * at the same time */
8869     if (cp >= array[mid]) {
8870         if (cp >= array[highest_element]) {
8871             return highest_element;
8872         }
8873
8874         /* Here, array[mid] <= cp < array[highest_element].  This means that
8875          * the final element is not the answer, so can exclude it; it also
8876          * means that <mid> is not the final element, so can refer to 'mid + 1'
8877          * safely */
8878         if (cp < array[mid + 1]) {
8879             return mid;
8880         }
8881         high--;
8882         low = mid + 1;
8883     }
8884     else { /* cp < aray[mid] */
8885         if (cp < array[0]) { /* Fail if outside the array */
8886             return -1;
8887         }
8888         high = mid;
8889         if (cp >= array[mid - 1]) {
8890             goto found_entry;
8891         }
8892     }
8893
8894     /* Binary search.  What we are looking for is <i> such that
8895      *  array[i] <= cp < array[i+1]
8896      * The loop below converges on the i+1.  Note that there may not be an
8897      * (i+1)th element in the array, and things work nonetheless */
8898     while (low < high) {
8899         mid = (low + high) / 2;
8900         assert(mid <= highest_element);
8901         if (array[mid] <= cp) { /* cp >= array[mid] */
8902             low = mid + 1;
8903
8904             /* We could do this extra test to exit the loop early.
8905             if (cp < array[low]) {
8906                 return mid;
8907             }
8908             */
8909         }
8910         else { /* cp < array[mid] */
8911             high = mid;
8912         }
8913     }
8914
8915   found_entry:
8916     high--;
8917     invlist_set_previous_index(invlist, high);
8918     return high;
8919 }
8920
8921 void
8922 Perl__invlist_populate_swatch(SV* const invlist,
8923                               const UV start, const UV end, U8* swatch)
8924 {
8925     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8926      * but is used when the swash has an inversion list.  This makes this much
8927      * faster, as it uses a binary search instead of a linear one.  This is
8928      * intimately tied to that function, and perhaps should be in utf8.c,
8929      * except it is intimately tied to inversion lists as well.  It assumes
8930      * that <swatch> is all 0's on input */
8931
8932     UV current = start;
8933     const IV len = _invlist_len(invlist);
8934     IV i;
8935     const UV * array;
8936
8937     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8938
8939     if (len == 0) { /* Empty inversion list */
8940         return;
8941     }
8942
8943     array = invlist_array(invlist);
8944
8945     /* Find which element it is */
8946     i = _invlist_search(invlist, start);
8947
8948     /* We populate from <start> to <end> */
8949     while (current < end) {
8950         UV upper;
8951
8952         /* The inversion list gives the results for every possible code point
8953          * after the first one in the list.  Only those ranges whose index is
8954          * even are ones that the inversion list matches.  For the odd ones,
8955          * and if the initial code point is not in the list, we have to skip
8956          * forward to the next element */
8957         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8958             i++;
8959             if (i >= len) { /* Finished if beyond the end of the array */
8960                 return;
8961             }
8962             current = array[i];
8963             if (current >= end) {   /* Finished if beyond the end of what we
8964                                        are populating */
8965                 if (LIKELY(end < UV_MAX)) {
8966                     return;
8967                 }
8968
8969                 /* We get here when the upper bound is the maximum
8970                  * representable on the machine, and we are looking for just
8971                  * that code point.  Have to special case it */
8972                 i = len;
8973                 goto join_end_of_list;
8974             }
8975         }
8976         assert(current >= start);
8977
8978         /* The current range ends one below the next one, except don't go past
8979          * <end> */
8980         i++;
8981         upper = (i < len && array[i] < end) ? array[i] : end;
8982
8983         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8984          * for each code point in it */
8985         for (; current < upper; current++) {
8986             const STRLEN offset = (STRLEN)(current - start);
8987             swatch[offset >> 3] |= 1 << (offset & 7);
8988         }
8989
8990       join_end_of_list:
8991
8992         /* Quit if at the end of the list */
8993         if (i >= len) {
8994
8995             /* But first, have to deal with the highest possible code point on
8996              * the platform.  The previous code assumes that <end> is one
8997              * beyond where we want to populate, but that is impossible at the
8998              * platform's infinity, so have to handle it specially */
8999             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
9000             {
9001                 const STRLEN offset = (STRLEN)(end - start);
9002                 swatch[offset >> 3] |= 1 << (offset & 7);
9003             }
9004             return;
9005         }
9006
9007         /* Advance to the next range, which will be for code points not in the
9008          * inversion list */
9009         current = array[i];
9010     }
9011
9012     return;
9013 }
9014
9015 void
9016 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9017                                          const bool complement_b, SV** output)
9018 {
9019     /* Take the union of two inversion lists and point '*output' to it.  On
9020      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9021      * even 'a' or 'b').  If to an inversion list, the contents of the original
9022      * list will be replaced by the union.  The first list, 'a', may be
9023      * NULL, in which case a copy of the second list is placed in '*output'.
9024      * If 'complement_b' is TRUE, the union is taken of the complement
9025      * (inversion) of 'b' instead of b itself.
9026      *
9027      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9028      * Richard Gillam, published by Addison-Wesley, and explained at some
9029      * length there.  The preface says to incorporate its examples into your
9030      * code at your own risk.
9031      *
9032      * The algorithm is like a merge sort. */
9033
9034     const UV* array_a;    /* a's array */
9035     const UV* array_b;
9036     UV len_a;       /* length of a's array */
9037     UV len_b;
9038
9039     SV* u;                      /* the resulting union */
9040     UV* array_u;
9041     UV len_u = 0;
9042
9043     UV i_a = 0;             /* current index into a's array */
9044     UV i_b = 0;
9045     UV i_u = 0;
9046
9047     /* running count, as explained in the algorithm source book; items are
9048      * stopped accumulating and are output when the count changes to/from 0.
9049      * The count is incremented when we start a range that's in an input's set,
9050      * and decremented when we start a range that's not in a set.  So this
9051      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9052      * and hence nothing goes into the union; 1, just one of the inputs is in
9053      * its set (and its current range gets added to the union); and 2 when both
9054      * inputs are in their sets.  */
9055     UV count = 0;
9056
9057     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9058     assert(a != b);
9059     assert(*output == NULL || SvTYPE(*output) == SVt_INVLIST);
9060
9061     len_b = _invlist_len(b);
9062     if (len_b == 0) {
9063
9064         /* Here, 'b' is empty, hence it's complement is all possible code
9065          * points.  So if the union includes the complement of 'b', it includes
9066          * everything, and we need not even look at 'a'.  It's easiest to
9067          * create a new inversion list that matches everything.  */
9068         if (complement_b) {
9069             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9070
9071             if (*output == NULL) { /* If the output didn't exist, just point it
9072                                       at the new list */
9073                 *output = everything;
9074             }
9075             else { /* Otherwise, replace its contents with the new list */
9076                 invlist_replace_list_destroys_src(*output, everything);
9077                 SvREFCNT_dec_NN(everything);
9078             }
9079
9080             return;
9081         }
9082
9083         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9084          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9085          * output will be empty */
9086
9087         if (a == NULL || _invlist_len(a) == 0) {
9088             if (*output == NULL) {
9089                 *output = _new_invlist(0);
9090             }
9091             else {
9092                 invlist_clear(*output);
9093             }
9094             return;
9095         }
9096
9097         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9098          * union.  We can just return a copy of 'a' if '*output' doesn't point
9099          * to an existing list */
9100         if (*output == NULL) {
9101             *output = invlist_clone(a);
9102             return;
9103         }
9104
9105         /* If the output is to overwrite 'a', we have a no-op, as it's
9106          * already in 'a' */
9107         if (*output == a) {
9108             return;
9109         }
9110
9111         /* Here, '*output' is to be overwritten by 'a' */
9112         u = invlist_clone(a);
9113         invlist_replace_list_destroys_src(*output, u);
9114         SvREFCNT_dec_NN(u);
9115
9116         return;
9117     }
9118
9119     /* Here 'b' is not empty.  See about 'a' */
9120
9121     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9122
9123         /* Here, 'a' is empty (and b is not).  That means the union will come
9124          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9125          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9126          * the clone */
9127
9128         SV ** dest = (*output == NULL) ? output : &u;
9129         *dest = invlist_clone(b);
9130         if (complement_b) {
9131             _invlist_invert(*dest);
9132         }
9133
9134         if (dest == &u) {
9135             invlist_replace_list_destroys_src(*output, u);
9136             SvREFCNT_dec_NN(u);
9137         }
9138
9139         return;
9140     }
9141
9142     /* Here both lists exist and are non-empty */
9143     array_a = invlist_array(a);
9144     array_b = invlist_array(b);
9145
9146     /* If are to take the union of 'a' with the complement of b, set it
9147      * up so are looking at b's complement. */
9148     if (complement_b) {
9149
9150         /* To complement, we invert: if the first element is 0, remove it.  To
9151          * do this, we just pretend the array starts one later */
9152         if (array_b[0] == 0) {
9153             array_b++;
9154             len_b--;
9155         }
9156         else {
9157
9158             /* But if the first element is not zero, we pretend the list starts
9159              * at the 0 that is always stored immediately before the array. */
9160             array_b--;
9161             len_b++;
9162         }
9163     }
9164
9165     /* Size the union for the worst case: that the sets are completely
9166      * disjoint */
9167     u = _new_invlist(len_a + len_b);
9168
9169     /* Will contain U+0000 if either component does */
9170     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9171                                       || (len_b > 0 && array_b[0] == 0));
9172
9173     /* Go through each input list item by item, stopping when have exhausted
9174      * one of them */
9175     while (i_a < len_a && i_b < len_b) {
9176         UV cp;      /* The element to potentially add to the union's array */
9177         bool cp_in_set;   /* is it in the the input list's set or not */
9178
9179         /* We need to take one or the other of the two inputs for the union.
9180          * Since we are merging two sorted lists, we take the smaller of the
9181          * next items.  In case of a tie, we take first the one that is in its
9182          * set.  If we first took the one not in its set, it would decrement
9183          * the count, possibly to 0 which would cause it to be output as ending
9184          * the range, and the next time through we would take the same number,
9185          * and output it again as beginning the next range.  By doing it the
9186          * opposite way, there is no possibility that the count will be
9187          * momentarily decremented to 0, and thus the two adjoining ranges will
9188          * be seamlessly merged.  (In a tie and both are in the set or both not
9189          * in the set, it doesn't matter which we take first.) */
9190         if (       array_a[i_a] < array_b[i_b]
9191             || (   array_a[i_a] == array_b[i_b]
9192                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9193         {
9194             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9195             cp = array_a[i_a++];
9196         }
9197         else {
9198             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9199             cp = array_b[i_b++];
9200         }
9201
9202         /* Here, have chosen which of the two inputs to look at.  Only output
9203          * if the running count changes to/from 0, which marks the
9204          * beginning/end of a range that's in the set */
9205         if (cp_in_set) {
9206             if (count == 0) {
9207                 array_u[i_u++] = cp;
9208             }
9209             count++;
9210         }
9211         else {
9212             count--;
9213             if (count == 0) {
9214                 array_u[i_u++] = cp;
9215             }
9216         }
9217     }
9218
9219
9220     /* The loop above increments the index into exactly one of the input lists
9221      * each iteration, and ends when either index gets to its list end.  That
9222      * means the other index is lower than its end, and so something is
9223      * remaining in that one.  We decrement 'count', as explained below, if
9224      * that list is in its set.  (i_a and i_b each currently index the element
9225      * beyond the one we care about.) */
9226     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9227         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9228     {
9229         count--;
9230     }
9231
9232     /* Above we decremented 'count' if the list that had unexamined elements in
9233      * it was in its set.  This has made it so that 'count' being non-zero
9234      * means there isn't anything left to output; and 'count' equal to 0 means
9235      * that what is left to output is precisely that which is left in the
9236      * non-exhausted input list.
9237      *
9238      * To see why, note first that the exhausted input obviously has nothing
9239      * left to add to the union.  If it was in its set at its end, that means
9240      * the set extends from here to the platform's infinity, and hence so does
9241      * the union and the non-exhausted set is irrelevant.  The exhausted set
9242      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9243      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9244      * 'count' remains at 1.  This is consistent with the decremented 'count'
9245      * != 0 meaning there's nothing left to add to the union.
9246      *
9247      * But if the exhausted input wasn't in its set, it contributed 0 to
9248      * 'count', and the rest of the union will be whatever the other input is.
9249      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9250      * otherwise it gets decremented to 0.  This is consistent with 'count'
9251      * == 0 meaning the remainder of the union is whatever is left in the
9252      * non-exhausted list. */
9253     if (count != 0) {
9254         len_u = i_u;
9255     }
9256     else {
9257         IV copy_count = len_a - i_a;
9258         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9259             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9260         }
9261         else { /* The non-exhausted input is b */
9262             copy_count = len_b - i_b;
9263             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9264         }
9265         len_u = i_u + copy_count;
9266     }
9267
9268     /* Set the result to the final length, which can change the pointer to
9269      * array_u, so re-find it.  (Note that it is unlikely that this will
9270      * change, as we are shrinking the space, not enlarging it) */
9271     if (len_u != _invlist_len(u)) {
9272         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9273         invlist_trim(u);
9274         array_u = invlist_array(u);
9275     }
9276
9277     if (*output == NULL) {  /* Simply return the new inversion list */
9278         *output = u;
9279     }
9280     else {
9281         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9282          * could instead free '*output', and then set it to 'u', but experience
9283          * has shown [perl #127392] that if the input is a mortal, we can get a
9284          * huge build-up of these during regex compilation before they get
9285          * freed. */
9286         invlist_replace_list_destroys_src(*output, u);
9287         SvREFCNT_dec_NN(u);
9288     }
9289
9290     return;
9291 }
9292
9293 void
9294 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9295                                                const bool complement_b, SV** i)
9296 {
9297     /* Take the intersection of two inversion lists and point '*i' to it.  On
9298      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9299      * even 'a' or 'b').  If to an inversion list, the contents of the original
9300      * list will be replaced by the intersection.  The first list, 'a', may be
9301      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9302      * TRUE, the result will be the intersection of 'a' and the complement (or
9303      * inversion) of 'b' instead of 'b' directly.
9304      *
9305      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9306      * Richard Gillam, published by Addison-Wesley, and explained at some
9307      * length there.  The preface says to incorporate its examples into your
9308      * code at your own risk.  In fact, it had bugs
9309      *
9310      * The algorithm is like a merge sort, and is essentially the same as the
9311      * union above
9312      */
9313
9314     const UV* array_a;          /* a's array */
9315     const UV* array_b;
9316     UV len_a;   /* length of a's array */
9317     UV len_b;
9318
9319     SV* r;                   /* the resulting intersection */
9320     UV* array_r;
9321     UV len_r = 0;
9322
9323     UV i_a = 0;             /* current index into a's array */
9324     UV i_b = 0;
9325     UV i_r = 0;
9326
9327     /* running count of how many of the two inputs are postitioned at ranges
9328      * that are in their sets.  As explained in the algorithm source book,
9329      * items are stopped accumulating and are output when the count changes
9330      * to/from 2.  The count is incremented when we start a range that's in an
9331      * input's set, and decremented when we start a range that's not in a set.
9332      * Only when it is 2 are we in the intersection. */
9333     UV count = 0;
9334
9335     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9336     assert(a != b);
9337     assert(*i == NULL || SvTYPE(*i) == SVt_INVLIST);
9338
9339     /* Special case if either one is empty */
9340     len_a = (a == NULL) ? 0 : _invlist_len(a);
9341     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9342         if (len_a != 0 && complement_b) {
9343
9344             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9345              * must be empty.  Here, also we are using 'b's complement, which
9346              * hence must be every possible code point.  Thus the intersection
9347              * is simply 'a'. */
9348
9349             if (*i == a) {  /* No-op */
9350                 return;
9351             }
9352
9353             if (*i == NULL) {
9354                 *i = invlist_clone(a);
9355                 return;
9356             }
9357
9358             r = invlist_clone(a);
9359             invlist_replace_list_destroys_src(*i, r);
9360             SvREFCNT_dec_NN(r);
9361             return;
9362         }
9363
9364         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9365          * intersection must be empty */
9366         if (*i == NULL) {
9367             *i = _new_invlist(0);
9368             return;
9369         }
9370
9371         invlist_clear(*i);
9372         return;
9373     }
9374
9375     /* Here both lists exist and are non-empty */
9376     array_a = invlist_array(a);
9377     array_b = invlist_array(b);
9378
9379     /* If are to take the intersection of 'a' with the complement of b, set it
9380      * up so are looking at b's complement. */
9381     if (complement_b) {
9382
9383         /* To complement, we invert: if the first element is 0, remove it.  To
9384          * do this, we just pretend the array starts one later */
9385         if (array_b[0] == 0) {
9386             array_b++;
9387             len_b--;
9388         }
9389         else {
9390
9391             /* But if the first element is not zero, we pretend the list starts
9392              * at the 0 that is always stored immediately before the array. */
9393             array_b--;
9394             len_b++;
9395         }
9396     }
9397
9398     /* Size the intersection for the worst case: that the intersection ends up
9399      * fragmenting everything to be completely disjoint */
9400     r= _new_invlist(len_a + len_b);
9401
9402     /* Will contain U+0000 iff both components do */
9403     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9404                                      && len_b > 0 && array_b[0] == 0);
9405
9406     /* Go through each list item by item, stopping when have exhausted one of
9407      * them */
9408     while (i_a < len_a && i_b < len_b) {
9409         UV cp;      /* The element to potentially add to the intersection's
9410                        array */
9411         bool cp_in_set; /* Is it in the input list's set or not */
9412
9413         /* We need to take one or the other of the two inputs for the
9414          * intersection.  Since we are merging two sorted lists, we take the
9415          * smaller of the next items.  In case of a tie, we take first the one
9416          * that is not in its set (a difference from the union algorithm).  If
9417          * we first took the one in its set, it would increment the count,
9418          * possibly to 2 which would cause it to be output as starting a range
9419          * in the intersection, and the next time through we would take that
9420          * same number, and output it again as ending the set.  By doing the
9421          * opposite of this, there is no possibility that the count will be
9422          * momentarily incremented to 2.  (In a tie and both are in the set or
9423          * both not in the set, it doesn't matter which we take first.) */
9424         if (       array_a[i_a] < array_b[i_b]
9425             || (   array_a[i_a] == array_b[i_b]
9426                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9427         {
9428             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9429             cp = array_a[i_a++];
9430         }
9431         else {
9432             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9433             cp= array_b[i_b++];
9434         }
9435
9436         /* Here, have chosen which of the two inputs to look at.  Only output
9437          * if the running count changes to/from 2, which marks the
9438          * beginning/end of a range that's in the intersection */
9439         if (cp_in_set) {
9440             count++;
9441             if (count == 2) {
9442                 array_r[i_r++] = cp;
9443             }
9444         }
9445         else {
9446             if (count == 2) {
9447                 array_r[i_r++] = cp;
9448             }
9449             count--;
9450         }
9451
9452     }
9453
9454     /* The loop above increments the index into exactly one of the input lists
9455      * each iteration, and ends when either index gets to its list end.  That
9456      * means the other index is lower than its end, and so something is
9457      * remaining in that one.  We increment 'count', as explained below, if the
9458      * exhausted list was in its set.  (i_a and i_b each currently index the
9459      * element beyond the one we care about.) */
9460     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9461         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9462     {
9463         count++;
9464     }
9465
9466     /* Above we incremented 'count' if the exhausted list was in its set.  This
9467      * has made it so that 'count' being below 2 means there is nothing left to
9468      * output; otheriwse what's left to add to the intersection is precisely
9469      * that which is left in the non-exhausted input list.
9470      *
9471      * To see why, note first that the exhausted input obviously has nothing
9472      * left to affect the intersection.  If it was in its set at its end, that
9473      * means the set extends from here to the platform's infinity, and hence
9474      * anything in the non-exhausted's list will be in the intersection, and
9475      * anything not in it won't be.  Hence, the rest of the intersection is
9476      * precisely what's in the non-exhausted list  The exhausted set also
9477      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
9478      * it means 'count' is now at least 2.  This is consistent with the
9479      * incremented 'count' being >= 2 means to add the non-exhausted list to
9480      * the intersection.
9481      *
9482      * But if the exhausted input wasn't in its set, it contributed 0 to
9483      * 'count', and the intersection can't include anything further; the
9484      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
9485      * incremented.  This is consistent with 'count' being < 2 meaning nothing
9486      * further to add to the intersection. */
9487     if (count < 2) { /* Nothing left to put in the intersection. */
9488         len_r = i_r;
9489     }
9490     else { /* copy the non-exhausted list, unchanged. */
9491         IV copy_count = len_a - i_a;
9492         if (copy_count > 0) {   /* a is the one with stuff left */
9493             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9494         }
9495         else {  /* b is the one with stuff left */
9496             copy_count = len_b - i_b;
9497             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9498         }
9499         len_r = i_r + copy_count;
9500     }
9501
9502     /* Set the result to the final length, which can change the pointer to
9503      * array_r, so re-find it.  (Note that it is unlikely that this will
9504      * change, as we are shrinking the space, not enlarging it) */
9505     if (len_r != _invlist_len(r)) {
9506         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9507         invlist_trim(r);
9508         array_r = invlist_array(r);
9509     }
9510
9511     if (*i == NULL) { /* Simply return the calculated intersection */
9512         *i = r;
9513     }
9514     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
9515               instead free '*i', and then set it to 'r', but experience has
9516               shown [perl #127392] that if the input is a mortal, we can get a
9517               huge build-up of these during regex compilation before they get
9518               freed. */
9519         if (len_r) {
9520             invlist_replace_list_destroys_src(*i, r);
9521         }
9522         else {
9523             invlist_clear(*i);
9524         }
9525         SvREFCNT_dec_NN(r);
9526     }
9527
9528     return;
9529 }
9530
9531 SV*
9532 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9533 {
9534     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9535      * set.  A pointer to the inversion list is returned.  This may actually be
9536      * a new list, in which case the passed in one has been destroyed.  The
9537      * passed-in inversion list can be NULL, in which case a new one is created
9538      * with just the one range in it.  The new list is not necessarily
9539      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
9540      * result of this function.  The gain would not be large, and in many
9541      * cases, this is called multiple times on a single inversion list, so
9542      * anything freed may almost immediately be needed again.
9543      *
9544      * This used to mostly call the 'union' routine, but that is much more
9545      * heavyweight than really needed for a single range addition */
9546
9547     UV* array;              /* The array implementing the inversion list */
9548     UV len;                 /* How many elements in 'array' */
9549     SSize_t i_s;            /* index into the invlist array where 'start'
9550                                should go */
9551     SSize_t i_e = 0;        /* And the index where 'end' should go */
9552     UV cur_highest;         /* The highest code point in the inversion list
9553                                upon entry to this function */
9554
9555     /* This range becomes the whole inversion list if none already existed */
9556     if (invlist == NULL) {
9557         invlist = _new_invlist(2);
9558         _append_range_to_invlist(invlist, start, end);
9559         return invlist;
9560     }
9561
9562     /* Likewise, if the inversion list is currently empty */
9563     len = _invlist_len(invlist);
9564     if (len == 0) {
9565         _append_range_to_invlist(invlist, start, end);
9566         return invlist;
9567     }
9568
9569     /* Starting here, we have to know the internals of the list */
9570     array = invlist_array(invlist);
9571
9572     /* If the new range ends higher than the current highest ... */
9573     cur_highest = invlist_highest(invlist);
9574     if (end > cur_highest) {
9575
9576         /* If the whole range is higher, we can just append it */
9577         if (start > cur_highest) {
9578             _append_range_to_invlist(invlist, start, end);
9579             return invlist;
9580         }
9581
9582         /* Otherwise, add the portion that is higher ... */
9583         _append_range_to_invlist(invlist, cur_highest + 1, end);
9584
9585         /* ... and continue on below to handle the rest.  As a result of the
9586          * above append, we know that the index of the end of the range is the
9587          * final even numbered one of the array.  Recall that the final element
9588          * always starts a range that extends to infinity.  If that range is in
9589          * the set (meaning the set goes from here to infinity), it will be an
9590          * even index, but if it isn't in the set, it's odd, and the final
9591          * range in the set is one less, which is even. */
9592         if (end == UV_MAX) {
9593             i_e = len;
9594         }
9595         else {
9596             i_e = len - 2;
9597         }
9598     }
9599
9600     /* We have dealt with appending, now see about prepending.  If the new
9601      * range starts lower than the current lowest ... */
9602     if (start < array[0]) {
9603
9604         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
9605          * Let the union code handle it, rather than having to know the
9606          * trickiness in two code places.  */
9607         if (UNLIKELY(start == 0)) {
9608             SV* range_invlist;
9609
9610             range_invlist = _new_invlist(2);
9611             _append_range_to_invlist(range_invlist, start, end);
9612
9613             _invlist_union(invlist, range_invlist, &invlist);
9614
9615             SvREFCNT_dec_NN(range_invlist);
9616
9617             return invlist;
9618         }
9619
9620         /* If the whole new range comes before the first entry, and doesn't
9621          * extend it, we have to insert it as an additional range */
9622         if (end < array[0] - 1) {
9623             i_s = i_e = -1;
9624             goto splice_in_new_range;
9625         }
9626
9627         /* Here the new range adjoins the existing first range, extending it
9628          * downwards. */
9629         array[0] = start;
9630
9631         /* And continue on below to handle the rest.  We know that the index of
9632          * the beginning of the range is the first one of the array */
9633         i_s = 0;
9634     }
9635     else { /* Not prepending any part of the new range to the existing list.
9636             * Find where in the list it should go.  This finds i_s, such that:
9637             *     invlist[i_s] <= start < array[i_s+1]
9638             */
9639         i_s = _invlist_search(invlist, start);
9640     }
9641
9642     /* At this point, any extending before the beginning of the inversion list
9643      * and/or after the end has been done.  This has made it so that, in the
9644      * code below, each endpoint of the new range is either in a range that is
9645      * in the set, or is in a gap between two ranges that are.  This means we
9646      * don't have to worry about exceeding the array bounds.
9647      *
9648      * Find where in the list the new range ends (but we can skip this if we
9649      * have already determined what it is, or if it will be the same as i_s,
9650      * which we already have computed) */
9651     if (i_e == 0) {
9652         i_e = (start == end)
9653               ? i_s
9654               : _invlist_search(invlist, end);
9655     }
9656
9657     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
9658      * is a range that goes to infinity there is no element at invlist[i_e+1],
9659      * so only the first relation holds. */
9660
9661     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
9662
9663         /* Here, the ranges on either side of the beginning of the new range
9664          * are in the set, and this range starts in the gap between them.
9665          *
9666          * The new range extends the range above it downwards if the new range
9667          * ends at or above that range's start */
9668         const bool extends_the_range_above = (   end == UV_MAX
9669                                               || end + 1 >= array[i_s+1]);
9670
9671         /* The new range extends the range below it upwards if it begins just
9672          * after where that range ends */
9673         if (start == array[i_s]) {
9674
9675             /* If the new range fills the entire gap between the other ranges,
9676              * they will get merged together.  Other ranges may also get
9677              * merged, depending on how many of them the new range spans.  In
9678              * the general case, we do the merge later, just once, after we
9679              * figure out how many to merge.  But in the case where the new
9680              * range exactly spans just this one gap (possibly extending into
9681              * the one above), we do the merge here, and an early exit.  This
9682              * is done here to avoid having to special case later. */
9683             if (i_e - i_s <= 1) {
9684
9685                 /* If i_e - i_s == 1, it means that the new range terminates
9686                  * within the range above, and hence 'extends_the_range_above'
9687                  * must be true.  (If the range above it extends to infinity,
9688                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
9689                  * will be 0, so no harm done.) */
9690                 if (extends_the_range_above) {
9691                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
9692                     invlist_set_len(invlist,
9693                                     len - 2,
9694                                     *(get_invlist_offset_addr(invlist)));
9695                     return invlist;
9696                 }
9697
9698                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
9699                  * to the same range, and below we are about to decrement i_s
9700                  * */
9701                 i_e--;
9702             }
9703
9704             /* Here, the new range is adjacent to the one below.  (It may also
9705              * span beyond the range above, but that will get resolved later.)
9706              * Extend the range below to include this one. */
9707             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
9708             i_s--;
9709             start = array[i_s];
9710         }
9711         else if (extends_the_range_above) {
9712
9713             /* Here the new range only extends the range above it, but not the
9714              * one below.  It merges with the one above.  Again, we keep i_e
9715              * and i_s in sync if they point to the same range */
9716             if (i_e == i_s) {
9717                 i_e++;
9718             }
9719             i_s++;
9720             array[i_s] = start;
9721         }
9722     }
9723
9724     /* Here, we've dealt with the new range start extending any adjoining
9725      * existing ranges.
9726      *
9727      * If the new range extends to infinity, it is now the final one,
9728      * regardless of what was there before */
9729     if (UNLIKELY(end == UV_MAX)) {
9730         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
9731         return invlist;
9732     }
9733
9734     /* If i_e started as == i_s, it has also been dealt with,
9735      * and been updated to the new i_s, which will fail the following if */
9736     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
9737
9738         /* Here, the ranges on either side of the end of the new range are in
9739          * the set, and this range ends in the gap between them.
9740          *
9741          * If this range is adjacent to (hence extends) the range above it, it
9742          * becomes part of that range; likewise if it extends the range below,
9743          * it becomes part of that range */
9744         if (end + 1 == array[i_e+1]) {
9745             i_e++;
9746             array[i_e] = start;
9747         }
9748         else if (start <= array[i_e]) {
9749             array[i_e] = end + 1;
9750             i_e--;
9751         }
9752     }
9753
9754     if (i_s == i_e) {
9755
9756         /* If the range fits entirely in an existing range (as possibly already
9757          * extended above), it doesn't add anything new */
9758         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
9759             return invlist;
9760         }
9761
9762         /* Here, no part of the range is in the list.  Must add it.  It will
9763          * occupy 2 more slots */
9764       splice_in_new_range:
9765
9766         invlist_extend(invlist, len + 2);
9767         array = invlist_array(invlist);
9768         /* Move the rest of the array down two slots. Don't include any
9769          * trailing NUL */
9770         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
9771
9772         /* Do the actual splice */
9773         array[i_e+1] = start;
9774         array[i_e+2] = end + 1;
9775         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
9776         return invlist;
9777     }
9778
9779     /* Here the new range crossed the boundaries of a pre-existing range.  The
9780      * code above has adjusted things so that both ends are in ranges that are
9781      * in the set.  This means everything in between must also be in the set.
9782      * Just squash things together */
9783     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
9784     invlist_set_len(invlist,
9785                     len - i_e + i_s,
9786                     *(get_invlist_offset_addr(invlist)));
9787
9788     return invlist;
9789 }
9790
9791 SV*
9792 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9793                                  UV** other_elements_ptr)
9794 {
9795     /* Create and return an inversion list whose contents are to be populated
9796      * by the caller.  The caller gives the number of elements (in 'size') and
9797      * the very first element ('element0').  This function will set
9798      * '*other_elements_ptr' to an array of UVs, where the remaining elements
9799      * are to be placed.
9800      *
9801      * Obviously there is some trust involved that the caller will properly
9802      * fill in the other elements of the array.
9803      *
9804      * (The first element needs to be passed in, as the underlying code does
9805      * things differently depending on whether it is zero or non-zero) */
9806
9807     SV* invlist = _new_invlist(size);
9808     bool offset;
9809
9810     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9811
9812     invlist = add_cp_to_invlist(invlist, element0);
9813     offset = *get_invlist_offset_addr(invlist);
9814
9815     invlist_set_len(invlist, size, offset);
9816     *other_elements_ptr = invlist_array(invlist) + 1;
9817     return invlist;
9818 }
9819
9820 #endif
9821
9822 PERL_STATIC_INLINE SV*
9823 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9824     return _add_range_to_invlist(invlist, cp, cp);
9825 }
9826
9827 #ifndef PERL_IN_XSUB_RE
9828 void
9829 Perl__invlist_invert(pTHX_ SV* const invlist)
9830 {
9831     /* Complement the input inversion list.  This adds a 0 if the list didn't
9832      * have a zero; removes it otherwise.  As described above, the data
9833      * structure is set up so that this is very efficient */
9834
9835     PERL_ARGS_ASSERT__INVLIST_INVERT;
9836
9837     assert(! invlist_is_iterating(invlist));
9838
9839     /* The inverse of matching nothing is matching everything */
9840     if (_invlist_len(invlist) == 0) {
9841         _append_range_to_invlist(invlist, 0, UV_MAX);
9842         return;
9843     }
9844
9845     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9846 }
9847
9848 #endif
9849
9850 PERL_STATIC_INLINE SV*
9851 S_invlist_clone(pTHX_ SV* const invlist)
9852 {
9853
9854     /* Return a new inversion list that is a copy of the input one, which is
9855      * unchanged.  The new list will not be mortal even if the old one was. */
9856
9857     /* Need to allocate extra space to accommodate Perl's addition of a
9858      * trailing NUL to SvPV's, since it thinks they are always strings */
9859     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9860     STRLEN physical_length = SvCUR(invlist);
9861     bool offset = *(get_invlist_offset_addr(invlist));
9862
9863     PERL_ARGS_ASSERT_INVLIST_CLONE;
9864
9865     *(get_invlist_offset_addr(new_invlist)) = offset;
9866     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9867     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9868
9869     return new_invlist;
9870 }
9871
9872 PERL_STATIC_INLINE STRLEN*
9873 S_get_invlist_iter_addr(SV* invlist)
9874 {
9875     /* Return the address of the UV that contains the current iteration
9876      * position */
9877
9878     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9879
9880     assert(SvTYPE(invlist) == SVt_INVLIST);
9881
9882     return &(((XINVLIST*) SvANY(invlist))->iterator);
9883 }
9884
9885 PERL_STATIC_INLINE void
9886 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9887 {
9888     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9889
9890     *get_invlist_iter_addr(invlist) = 0;
9891 }
9892
9893 PERL_STATIC_INLINE void
9894 S_invlist_iterfinish(SV* invlist)
9895 {
9896     /* Terminate iterator for invlist.  This is to catch development errors.
9897      * Any iteration that is interrupted before completed should call this
9898      * function.  Functions that add code points anywhere else but to the end
9899      * of an inversion list assert that they are not in the middle of an
9900      * iteration.  If they were, the addition would make the iteration
9901      * problematical: if the iteration hadn't reached the place where things
9902      * were being added, it would be ok */
9903
9904     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9905
9906     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9907 }
9908
9909 STATIC bool
9910 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9911 {
9912     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9913      * This call sets in <*start> and <*end>, the next range in <invlist>.
9914      * Returns <TRUE> if successful and the next call will return the next
9915      * range; <FALSE> if was already at the end of the list.  If the latter,
9916      * <*start> and <*end> are unchanged, and the next call to this function
9917      * will start over at the beginning of the list */
9918
9919     STRLEN* pos = get_invlist_iter_addr(invlist);
9920     UV len = _invlist_len(invlist);
9921     UV *array;
9922
9923     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9924
9925     if (*pos >= len) {
9926         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9927         return FALSE;
9928     }
9929
9930     array = invlist_array(invlist);
9931
9932     *start = array[(*pos)++];
9933
9934     if (*pos >= len) {
9935         *end = UV_MAX;
9936     }
9937     else {
9938         *end = array[(*pos)++] - 1;
9939     }
9940
9941     return TRUE;
9942 }
9943
9944 PERL_STATIC_INLINE UV
9945 S_invlist_highest(SV* const invlist)
9946 {
9947     /* Returns the highest code point that matches an inversion list.  This API
9948      * has an ambiguity, as it returns 0 under either the highest is actually
9949      * 0, or if the list is empty.  If this distinction matters to you, check
9950      * for emptiness before calling this function */
9951
9952     UV len = _invlist_len(invlist);
9953     UV *array;
9954
9955     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9956
9957     if (len == 0) {
9958         return 0;
9959     }
9960
9961     array = invlist_array(invlist);
9962
9963     /* The last element in the array in the inversion list always starts a
9964      * range that goes to infinity.  That range may be for code points that are
9965      * matched in the inversion list, or it may be for ones that aren't
9966      * matched.  In the latter case, the highest code point in the set is one
9967      * less than the beginning of this range; otherwise it is the final element
9968      * of this range: infinity */
9969     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9970            ? UV_MAX
9971            : array[len - 1] - 1;
9972 }
9973
9974 STATIC SV *
9975 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
9976 {
9977     /* Get the contents of an inversion list into a string SV so that they can
9978      * be printed out.  If 'traditional_style' is TRUE, it uses the format
9979      * traditionally done for debug tracing; otherwise it uses a format
9980      * suitable for just copying to the output, with blanks between ranges and
9981      * a dash between range components */
9982
9983     UV start, end;
9984     SV* output;
9985     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
9986     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
9987
9988     if (traditional_style) {
9989         output = newSVpvs("\n");
9990     }
9991     else {
9992         output = newSVpvs("");
9993     }
9994
9995     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
9996
9997     assert(! invlist_is_iterating(invlist));
9998
9999     invlist_iterinit(invlist);
10000     while (invlist_iternext(invlist, &start, &end)) {
10001         if (end == UV_MAX) {
10002             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFINITY%c",
10003                                           start, intra_range_delimiter,
10004                                                  inter_range_delimiter);
10005         }
10006         else if (end != start) {
10007             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10008                                           start,
10009                                                    intra_range_delimiter,
10010                                                   end, inter_range_delimiter);
10011         }
10012         else {
10013             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10014                                           start, inter_range_delimiter);
10015         }
10016     }
10017
10018     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10019         SvCUR_set(output, SvCUR(output) - 1);
10020     }
10021
10022     return output;
10023 }
10024
10025 #ifndef PERL_IN_XSUB_RE
10026 void
10027 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10028                          const char * const indent, SV* const invlist)
10029 {
10030     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10031      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10032      * the string 'indent'.  The output looks like this:
10033          [0] 0x000A .. 0x000D
10034          [2] 0x0085
10035          [4] 0x2028 .. 0x2029
10036          [6] 0x3104 .. INFINITY
10037      * This means that the first range of code points matched by the list are
10038      * 0xA through 0xD; the second range contains only the single code point
10039      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10040      * are used to define each range (except if the final range extends to
10041      * infinity, only a single element is needed).  The array index of the
10042      * first element for the corresponding range is given in brackets. */
10043
10044     UV start, end;
10045     STRLEN count = 0;
10046
10047     PERL_ARGS_ASSERT__INVLIST_DUMP;
10048
10049     if (invlist_is_iterating(invlist)) {
10050         Perl_dump_indent(aTHX_ level, file,
10051              "%sCan't dump inversion list because is in middle of iterating\n",
10052              indent);
10053         return;
10054     }
10055
10056     invlist_iterinit(invlist);
10057     while (invlist_iternext(invlist, &start, &end)) {
10058         if (end == UV_MAX) {
10059             Perl_dump_indent(aTHX_ level, file,
10060                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFINITY\n",
10061                                    indent, (UV)count, start);
10062         }
10063         else if (end != start) {
10064             Perl_dump_indent(aTHX_ level, file,
10065                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10066                                 indent, (UV)count, start,         end);
10067         }
10068         else {
10069             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10070                                             indent, (UV)count, start);
10071         }
10072         count += 2;
10073     }
10074 }
10075
10076 void
10077 Perl__load_PL_utf8_foldclosures (pTHX)
10078 {
10079     assert(! PL_utf8_foldclosures);
10080
10081     /* If the folds haven't been read in, call a fold function
10082      * to force that */
10083     if (! PL_utf8_tofold) {
10084         U8 dummy[UTF8_MAXBYTES_CASE+1];
10085         const U8 hyphen[] = HYPHEN_UTF8;
10086
10087         /* This string is just a short named one above \xff */
10088         toFOLD_utf8_safe(hyphen, hyphen + sizeof(hyphen) - 1, dummy, NULL);
10089         assert(PL_utf8_tofold); /* Verify that worked */
10090     }
10091     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10092 }
10093 #endif
10094
10095 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10096 bool
10097 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10098 {
10099     /* Return a boolean as to if the two passed in inversion lists are
10100      * identical.  The final argument, if TRUE, says to take the complement of
10101      * the second inversion list before doing the comparison */
10102
10103     const UV* array_a = invlist_array(a);
10104     const UV* array_b = invlist_array(b);
10105     UV len_a = _invlist_len(a);
10106     UV len_b = _invlist_len(b);
10107
10108     PERL_ARGS_ASSERT__INVLISTEQ;
10109
10110     /* If are to compare 'a' with the complement of b, set it
10111      * up so are looking at b's complement. */
10112     if (complement_b) {
10113
10114         /* The complement of nothing is everything, so <a> would have to have
10115          * just one element, starting at zero (ending at infinity) */
10116         if (len_b == 0) {
10117             return (len_a == 1 && array_a[0] == 0);
10118         }
10119         else if (array_b[0] == 0) {
10120
10121             /* Otherwise, to complement, we invert.  Here, the first element is
10122              * 0, just remove it.  To do this, we just pretend the array starts
10123              * one later */
10124
10125             array_b++;
10126             len_b--;
10127         }
10128         else {
10129
10130             /* But if the first element is not zero, we pretend the list starts
10131              * at the 0 that is always stored immediately before the array. */
10132             array_b--;
10133             len_b++;
10134         }
10135     }
10136
10137     return    len_a == len_b
10138            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10139
10140 }
10141 #endif
10142
10143 /*
10144  * As best we can, determine the characters that can match the start of
10145  * the given EXACTF-ish node.
10146  *
10147  * Returns the invlist as a new SV*; it is the caller's responsibility to
10148  * call SvREFCNT_dec() when done with it.
10149  */
10150 STATIC SV*
10151 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10152 {
10153     const U8 * s = (U8*)STRING(node);
10154     SSize_t bytelen = STR_LEN(node);
10155     UV uc;
10156     /* Start out big enough for 2 separate code points */
10157     SV* invlist = _new_invlist(4);
10158
10159     PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
10160
10161     if (! UTF) {
10162         uc = *s;
10163
10164         /* We punt and assume can match anything if the node begins
10165          * with a multi-character fold.  Things are complicated.  For
10166          * example, /ffi/i could match any of:
10167          *  "\N{LATIN SMALL LIGATURE FFI}"
10168          *  "\N{LATIN SMALL LIGATURE FF}I"
10169          *  "F\N{LATIN SMALL LIGATURE FI}"
10170          *  plus several other things; and making sure we have all the
10171          *  possibilities is hard. */
10172         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10173             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10174         }
10175         else {
10176             /* Any Latin1 range character can potentially match any
10177              * other depending on the locale */
10178             if (OP(node) == EXACTFL) {
10179                 _invlist_union(invlist, PL_Latin1, &invlist);
10180             }
10181             else {
10182                 /* But otherwise, it matches at least itself.  We can
10183                  * quickly tell if it has a distinct fold, and if so,
10184                  * it matches that as well */
10185                 invlist = add_cp_to_invlist(invlist, uc);
10186                 if (IS_IN_SOME_FOLD_L1(uc))
10187                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10188             }
10189
10190             /* Some characters match above-Latin1 ones under /i.  This
10191              * is true of EXACTFL ones when the locale is UTF-8 */
10192             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10193                 && (! isASCII(uc) || (OP(node) != EXACTFA
10194                                     && OP(node) != EXACTFA_NO_TRIE)))
10195             {
10196                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10197             }
10198         }
10199     }
10200     else {  /* Pattern is UTF-8 */
10201         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10202         STRLEN foldlen = UTF8SKIP(s);
10203         const U8* e = s + bytelen;
10204         SV** listp;
10205
10206         uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10207
10208         /* The only code points that aren't folded in a UTF EXACTFish
10209          * node are are the problematic ones in EXACTFL nodes */
10210         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10211             /* We need to check for the possibility that this EXACTFL
10212              * node begins with a multi-char fold.  Therefore we fold
10213              * the first few characters of it so that we can make that
10214              * check */
10215             U8 *d = folded;
10216             int i;
10217
10218             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10219                 if (isASCII(*s)) {
10220                     *(d++) = (U8) toFOLD(*s);
10221                     s++;
10222                 }
10223                 else {
10224                     STRLEN len;
10225                     toFOLD_utf8_safe(s, e, d, &len);
10226                     d += len;
10227                     s += UTF8SKIP(s);
10228                 }
10229             }
10230
10231             /* And set up so the code below that looks in this folded
10232              * buffer instead of the node's string */
10233             e = d;
10234             foldlen = UTF8SKIP(folded);
10235             s = folded;
10236         }
10237
10238         /* When we reach here 's' points to the fold of the first
10239          * character(s) of the node; and 'e' points to far enough along
10240          * the folded string to be just past any possible multi-char
10241          * fold. 'foldlen' is the length in bytes of the first
10242          * character in 's'
10243          *
10244          * Unlike the non-UTF-8 case, the macro for determining if a
10245          * string is a multi-char fold requires all the characters to
10246          * already be folded.  This is because of all the complications
10247          * if not.  Note that they are folded anyway, except in EXACTFL
10248          * nodes.  Like the non-UTF case above, we punt if the node
10249          * begins with a multi-char fold  */
10250
10251         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10252             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10253         }
10254         else {  /* Single char fold */
10255
10256             /* It matches all the things that fold to it, which are
10257              * found in PL_utf8_foldclosures (including itself) */
10258             invlist = add_cp_to_invlist(invlist, uc);
10259             if (! PL_utf8_foldclosures)
10260                 _load_PL_utf8_foldclosures();
10261             if ((listp = hv_fetch(PL_utf8_foldclosures,
10262                                 (char *) s, foldlen, FALSE)))
10263             {
10264                 AV* list = (AV*) *listp;
10265                 IV k;
10266                 for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
10267                     SV** c_p = av_fetch(list, k, FALSE);
10268                     UV c;
10269                     assert(c_p);
10270
10271                     c = SvUV(*c_p);
10272
10273                     /* /aa doesn't allow folds between ASCII and non- */
10274                     if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
10275                         && isASCII(c) != isASCII(uc))
10276                     {
10277                         continue;
10278                     }
10279
10280                     invlist = add_cp_to_invlist(invlist, c);
10281                 }
10282             }
10283         }
10284     }
10285
10286     return invlist;
10287 }
10288
10289 #undef HEADER_LENGTH
10290 #undef TO_INTERNAL_SIZE
10291 #undef FROM_INTERNAL_SIZE
10292 #undef INVLIST_VERSION_ID
10293
10294 /* End of inversion list object */
10295
10296 STATIC void
10297 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10298 {
10299     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10300      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10301      * should point to the first flag; it is updated on output to point to the
10302      * final ')' or ':'.  There needs to be at least one flag, or this will
10303      * abort */
10304
10305     /* for (?g), (?gc), and (?o) warnings; warning
10306        about (?c) will warn about (?g) -- japhy    */
10307
10308 #define WASTED_O  0x01
10309 #define WASTED_G  0x02
10310 #define WASTED_C  0x04
10311 #define WASTED_GC (WASTED_G|WASTED_C)
10312     I32 wastedflags = 0x00;
10313     U32 posflags = 0, negflags = 0;
10314     U32 *flagsp = &posflags;
10315     char has_charset_modifier = '\0';
10316     regex_charset cs;
10317     bool has_use_defaults = FALSE;
10318     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10319     int x_mod_count = 0;
10320
10321     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10322
10323     /* '^' as an initial flag sets certain defaults */
10324     if (UCHARAT(RExC_parse) == '^') {
10325         RExC_parse++;
10326         has_use_defaults = TRUE;
10327         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10328         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
10329                                         ? REGEX_UNICODE_CHARSET
10330                                         : REGEX_DEPENDS_CHARSET);
10331     }
10332
10333     cs = get_regex_charset(RExC_flags);
10334     if (cs == REGEX_DEPENDS_CHARSET
10335         && (RExC_utf8 || RExC_uni_semantics))
10336     {
10337         cs = REGEX_UNICODE_CHARSET;
10338     }
10339
10340     while (RExC_parse < RExC_end) {
10341         /* && strchr("iogcmsx", *RExC_parse) */
10342         /* (?g), (?gc) and (?o) are useless here
10343            and must be globally applied -- japhy */
10344         switch (*RExC_parse) {
10345
10346             /* Code for the imsxn flags */
10347             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10348
10349             case LOCALE_PAT_MOD:
10350                 if (has_charset_modifier) {
10351                     goto excess_modifier;
10352                 }
10353                 else if (flagsp == &negflags) {
10354                     goto neg_modifier;
10355                 }
10356                 cs = REGEX_LOCALE_CHARSET;
10357                 has_charset_modifier = LOCALE_PAT_MOD;
10358                 break;
10359             case UNICODE_PAT_MOD:
10360                 if (has_charset_modifier) {
10361                     goto excess_modifier;
10362                 }
10363                 else if (flagsp == &negflags) {
10364                     goto neg_modifier;
10365                 }
10366                 cs = REGEX_UNICODE_CHARSET;
10367                 has_charset_modifier = UNICODE_PAT_MOD;
10368                 break;
10369             case ASCII_RESTRICT_PAT_MOD:
10370                 if (flagsp == &negflags) {
10371                     goto neg_modifier;
10372                 }
10373                 if (has_charset_modifier) {
10374                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10375                         goto excess_modifier;
10376                     }
10377                     /* Doubled modifier implies more restricted */
10378                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10379                 }
10380                 else {
10381                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10382                 }
10383                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10384                 break;
10385             case DEPENDS_PAT_MOD:
10386                 if (has_use_defaults) {
10387                     goto fail_modifiers;
10388                 }
10389                 else if (flagsp == &negflags) {
10390                     goto neg_modifier;
10391                 }
10392                 else if (has_charset_modifier) {
10393                     goto excess_modifier;
10394                 }
10395
10396                 /* The dual charset means unicode semantics if the
10397                  * pattern (or target, not known until runtime) are
10398                  * utf8, or something in the pattern indicates unicode
10399                  * semantics */
10400                 cs = (RExC_utf8 || RExC_uni_semantics)
10401                      ? REGEX_UNICODE_CHARSET
10402                      : REGEX_DEPENDS_CHARSET;
10403                 has_charset_modifier = DEPENDS_PAT_MOD;
10404                 break;
10405               excess_modifier:
10406                 RExC_parse++;
10407                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10408                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10409                 }
10410                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10411                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10412                                         *(RExC_parse - 1));
10413                 }
10414                 else {
10415                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10416                 }
10417                 NOT_REACHED; /*NOTREACHED*/
10418               neg_modifier:
10419                 RExC_parse++;
10420                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10421                                     *(RExC_parse - 1));
10422                 NOT_REACHED; /*NOTREACHED*/
10423             case ONCE_PAT_MOD: /* 'o' */
10424             case GLOBAL_PAT_MOD: /* 'g' */
10425                 if (PASS2 && ckWARN(WARN_REGEXP)) {
10426                     const I32 wflagbit = *RExC_parse == 'o'
10427                                          ? WASTED_O
10428                                          : WASTED_G;
10429                     if (! (wastedflags & wflagbit) ) {
10430                         wastedflags |= wflagbit;
10431                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10432                         vWARN5(
10433                             RExC_parse + 1,
10434                             "Useless (%s%c) - %suse /%c modifier",
10435                             flagsp == &negflags ? "?-" : "?",
10436                             *RExC_parse,
10437                             flagsp == &negflags ? "don't " : "",
10438                             *RExC_parse
10439                         );
10440                     }
10441                 }
10442                 break;
10443
10444             case CONTINUE_PAT_MOD: /* 'c' */
10445                 if (PASS2 && ckWARN(WARN_REGEXP)) {
10446                     if (! (wastedflags & WASTED_C) ) {
10447                         wastedflags |= WASTED_GC;
10448                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10449                         vWARN3(
10450                             RExC_parse + 1,
10451                             "Useless (%sc) - %suse /gc modifier",
10452                             flagsp == &negflags ? "?-" : "?",
10453                             flagsp == &negflags ? "don't " : ""
10454                         );
10455                     }
10456                 }
10457                 break;
10458             case KEEPCOPY_PAT_MOD: /* 'p' */
10459                 if (flagsp == &negflags) {
10460                     if (PASS2)
10461                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10462                 } else {
10463                     *flagsp |= RXf_PMf_KEEPCOPY;
10464                 }
10465                 break;
10466             case '-':
10467                 /* A flag is a default iff it is following a minus, so
10468                  * if there is a minus, it means will be trying to
10469                  * re-specify a default which is an error */
10470                 if (has_use_defaults || flagsp == &negflags) {
10471                     goto fail_modifiers;
10472                 }
10473                 flagsp = &negflags;
10474                 wastedflags = 0;  /* reset so (?g-c) warns twice */
10475                 x_mod_count = 0;
10476                 break;
10477             case ':':
10478             case ')':
10479
10480                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10481                     negflags |= RXf_PMf_EXTENDED_MORE;
10482                 }
10483                 RExC_flags |= posflags;
10484
10485                 if (negflags & RXf_PMf_EXTENDED) {
10486                     negflags |= RXf_PMf_EXTENDED_MORE;
10487                 }
10488                 RExC_flags &= ~negflags;
10489                 set_regex_charset(&RExC_flags, cs);
10490
10491                 return;
10492             default:
10493               fail_modifiers:
10494                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10495                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10496                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10497                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10498                 NOT_REACHED; /*NOTREACHED*/
10499         }
10500
10501         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10502     }
10503
10504     vFAIL("Sequence (?... not terminated");
10505 }
10506
10507 /*
10508  - reg - regular expression, i.e. main body or parenthesized thing
10509  *
10510  * Caller must absorb opening parenthesis.
10511  *
10512  * Combining parenthesis handling with the base level of regular expression
10513  * is a trifle forced, but the need to tie the tails of the branches to what
10514  * follows makes it hard to avoid.
10515  */
10516 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10517 #ifdef DEBUGGING
10518 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10519 #else
10520 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10521 #endif
10522
10523 PERL_STATIC_INLINE regnode *
10524 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10525                              I32 *flagp,
10526                              char * parse_start,
10527                              char ch
10528                       )
10529 {
10530     regnode *ret;
10531     char* name_start = RExC_parse;
10532     U32 num = 0;
10533     SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
10534                                             ? REG_RSN_RETURN_NULL
10535                                             : REG_RSN_RETURN_DATA);
10536     GET_RE_DEBUG_FLAGS_DECL;
10537
10538     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10539
10540     if (RExC_parse == name_start || *RExC_parse != ch) {
10541         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
10542         vFAIL2("Sequence %.3s... not terminated",parse_start);
10543     }
10544
10545     if (!SIZE_ONLY) {
10546         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10547         RExC_rxi->data->data[num]=(void*)sv_dat;
10548         SvREFCNT_inc_simple_void(sv_dat);
10549     }
10550     RExC_sawback = 1;
10551     ret = reganode(pRExC_state,
10552                    ((! FOLD)
10553                      ? NREF
10554                      : (ASCII_FOLD_RESTRICTED)
10555                        ? NREFFA
10556                        : (AT_LEAST_UNI_SEMANTICS)
10557                          ? NREFFU
10558                          : (LOC)
10559                            ? NREFFL
10560                            : NREFF),
10561                     num);
10562     *flagp |= HASWIDTH;
10563
10564     Set_Node_Offset(ret, parse_start+1);
10565     Set_Node_Cur_Length(ret, parse_start);
10566
10567     nextchar(pRExC_state);
10568     return ret;
10569 }
10570
10571 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
10572    flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
10573    needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
10574    upgraded to UTF-8.  Otherwise would only return NULL if regbranch() returns
10575    NULL, which cannot happen.  */
10576 STATIC regnode *
10577 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
10578     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
10579      * 2 is like 1, but indicates that nextchar() has been called to advance
10580      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
10581      * this flag alerts us to the need to check for that */
10582 {
10583     regnode *ret;               /* Will be the head of the group. */
10584     regnode *br;
10585     regnode *lastbr;
10586     regnode *ender = NULL;
10587     I32 parno = 0;
10588     I32 flags;
10589     U32 oregflags = RExC_flags;
10590     bool have_branch = 0;
10591     bool is_open = 0;
10592     I32 freeze_paren = 0;
10593     I32 after_freeze = 0;
10594     I32 num; /* numeric backreferences */
10595
10596     char * parse_start = RExC_parse; /* MJD */
10597     char * const oregcomp_parse = RExC_parse;
10598
10599     GET_RE_DEBUG_FLAGS_DECL;
10600
10601     PERL_ARGS_ASSERT_REG;
10602     DEBUG_PARSE("reg ");
10603
10604     *flagp = 0;                         /* Tentatively. */
10605
10606     /* Having this true makes it feasible to have a lot fewer tests for the
10607      * parse pointer being in scope.  For example, we can write
10608      *      while(isFOO(*RExC_parse)) RExC_parse++;
10609      * instead of
10610      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
10611      */
10612     assert(*RExC_end == '\0');
10613
10614     /* Make an OPEN node, if parenthesized. */
10615     if (paren) {
10616
10617         /* Under /x, space and comments can be gobbled up between the '(' and
10618          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
10619          * intervening space, as the sequence is a token, and a token should be
10620          * indivisible */
10621         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
10622
10623         if (RExC_parse >= RExC_end) {
10624             vFAIL("Unmatched (");
10625         }
10626
10627         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
10628             char *start_verb = RExC_parse + 1;
10629             STRLEN verb_len;
10630             char *start_arg = NULL;
10631             unsigned char op = 0;
10632             int arg_required = 0;
10633             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
10634
10635             if (has_intervening_patws) {
10636                 RExC_parse++;   /* past the '*' */
10637                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
10638             }
10639             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
10640                 if ( *RExC_parse == ':' ) {
10641                     start_arg = RExC_parse + 1;
10642                     break;
10643                 }
10644                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10645             }
10646             verb_len = RExC_parse - start_verb;
10647             if ( start_arg ) {
10648                 if (RExC_parse >= RExC_end) {
10649                     goto unterminated_verb_pattern;
10650                 }
10651                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10652                 while ( RExC_parse < RExC_end && *RExC_parse != ')' )
10653                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10654                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10655                   unterminated_verb_pattern:
10656                     vFAIL("Unterminated verb pattern argument");
10657                 if ( RExC_parse == start_arg )
10658                     start_arg = NULL;
10659             } else {
10660                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10661                     vFAIL("Unterminated verb pattern");
10662             }
10663
10664             /* Here, we know that RExC_parse < RExC_end */
10665
10666             switch ( *start_verb ) {
10667             case 'A':  /* (*ACCEPT) */
10668                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
10669                     op = ACCEPT;
10670                     internal_argval = RExC_nestroot;
10671                 }
10672                 break;
10673             case 'C':  /* (*COMMIT) */
10674                 if ( memEQs(start_verb,verb_len,"COMMIT") )
10675                     op = COMMIT;
10676                 break;
10677             case 'F':  /* (*FAIL) */
10678                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
10679                     op = OPFAIL;
10680                 }
10681                 break;
10682             case ':':  /* (*:NAME) */
10683             case 'M':  /* (*MARK:NAME) */
10684                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
10685                     op = MARKPOINT;
10686                     arg_required = 1;
10687                 }
10688                 break;
10689             case 'P':  /* (*PRUNE) */
10690                 if ( memEQs(start_verb,verb_len,"PRUNE") )
10691                     op = PRUNE;
10692                 break;
10693             case 'S':   /* (*SKIP) */
10694                 if ( memEQs(start_verb,verb_len,"SKIP") )
10695                     op = SKIP;
10696                 break;
10697             case 'T':  /* (*THEN) */
10698                 /* [19:06] <TimToady> :: is then */
10699                 if ( memEQs(start_verb,verb_len,"THEN") ) {
10700                     op = CUTGROUP;
10701                     RExC_seen |= REG_CUTGROUP_SEEN;
10702                 }
10703                 break;
10704             }
10705             if ( ! op ) {
10706                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10707                 vFAIL2utf8f(
10708                     "Unknown verb pattern '%" UTF8f "'",
10709                     UTF8fARG(UTF, verb_len, start_verb));
10710             }
10711             if ( arg_required && !start_arg ) {
10712                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
10713                     verb_len, start_verb);
10714             }
10715             if (internal_argval == -1) {
10716                 ret = reganode(pRExC_state, op, 0);
10717             } else {
10718                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
10719             }
10720             RExC_seen |= REG_VERBARG_SEEN;
10721             if ( ! SIZE_ONLY ) {
10722                 if (start_arg) {
10723                     SV *sv = newSVpvn( start_arg,
10724                                        RExC_parse - start_arg);
10725                     ARG(ret) = add_data( pRExC_state,
10726                                          STR_WITH_LEN("S"));
10727                     RExC_rxi->data->data[ARG(ret)]=(void*)sv;
10728                     ret->flags = 1;
10729                 } else {
10730                     ret->flags = 0;
10731                 }
10732                 if ( internal_argval != -1 )
10733                     ARG2L_SET(ret, internal_argval);
10734             }
10735             nextchar(pRExC_state);
10736             return ret;
10737         }
10738         else if (*RExC_parse == '?') { /* (?...) */
10739             bool is_logical = 0;
10740             const char * const seqstart = RExC_parse;
10741             const char * endptr;
10742             if (has_intervening_patws) {
10743                 RExC_parse++;
10744                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
10745             }
10746
10747             RExC_parse++;           /* past the '?' */
10748             paren = *RExC_parse;    /* might be a trailing NUL, if not
10749                                        well-formed */
10750             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10751             if (RExC_parse > RExC_end) {
10752                 paren = '\0';
10753             }
10754             ret = NULL;                 /* For look-ahead/behind. */
10755             switch (paren) {
10756
10757             case 'P':   /* (?P...) variants for those used to PCRE/Python */
10758                 paren = *RExC_parse;
10759                 if ( paren == '<') {    /* (?P<...>) named capture */
10760                     RExC_parse++;
10761                     if (RExC_parse >= RExC_end) {
10762                         vFAIL("Sequence (?P<... not terminated");
10763                     }
10764                     goto named_capture;
10765                 }
10766                 else if (paren == '>') {   /* (?P>name) named recursion */
10767                     RExC_parse++;
10768                     if (RExC_parse >= RExC_end) {
10769                         vFAIL("Sequence (?P>... not terminated");
10770                     }
10771                     goto named_recursion;
10772                 }
10773                 else if (paren == '=') {   /* (?P=...)  named backref */
10774                     RExC_parse++;
10775                     return handle_named_backref(pRExC_state, flagp,
10776                                                 parse_start, ')');
10777                 }
10778                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10779                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10780                 vFAIL3("Sequence (%.*s...) not recognized",
10781                                 RExC_parse-seqstart, seqstart);
10782                 NOT_REACHED; /*NOTREACHED*/
10783             case '<':           /* (?<...) */
10784                 if (*RExC_parse == '!')
10785                     paren = ',';
10786                 else if (*RExC_parse != '=')
10787               named_capture:
10788                 {               /* (?<...>) */
10789                     char *name_start;
10790                     SV *svname;
10791                     paren= '>';
10792                 /* FALLTHROUGH */
10793             case '\'':          /* (?'...') */
10794                     name_start = RExC_parse;
10795                     svname = reg_scan_name(pRExC_state,
10796                         SIZE_ONLY    /* reverse test from the others */
10797                         ? REG_RSN_RETURN_NAME
10798                         : REG_RSN_RETURN_NULL);
10799                     if (   RExC_parse == name_start
10800                         || RExC_parse >= RExC_end
10801                         || *RExC_parse != paren)
10802                     {
10803                         vFAIL2("Sequence (?%c... not terminated",
10804                             paren=='>' ? '<' : paren);
10805                     }
10806                     if (SIZE_ONLY) {
10807                         HE *he_str;
10808                         SV *sv_dat = NULL;
10809                         if (!svname) /* shouldn't happen */
10810                             Perl_croak(aTHX_
10811                                 "panic: reg_scan_name returned NULL");
10812                         if (!RExC_paren_names) {
10813                             RExC_paren_names= newHV();
10814                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
10815 #ifdef DEBUGGING
10816                             RExC_paren_name_list= newAV();
10817                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10818 #endif
10819                         }
10820                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10821                         if ( he_str )
10822                             sv_dat = HeVAL(he_str);
10823                         if ( ! sv_dat ) {
10824                             /* croak baby croak */
10825                             Perl_croak(aTHX_
10826                                 "panic: paren_name hash element allocation failed");
10827                         } else if ( SvPOK(sv_dat) ) {
10828                             /* (?|...) can mean we have dupes so scan to check
10829                                its already been stored. Maybe a flag indicating
10830                                we are inside such a construct would be useful,
10831                                but the arrays are likely to be quite small, so
10832                                for now we punt -- dmq */
10833                             IV count = SvIV(sv_dat);
10834                             I32 *pv = (I32*)SvPVX(sv_dat);
10835                             IV i;
10836                             for ( i = 0 ; i < count ; i++ ) {
10837                                 if ( pv[i] == RExC_npar ) {
10838                                     count = 0;
10839                                     break;
10840                                 }
10841                             }
10842                             if ( count ) {
10843                                 pv = (I32*)SvGROW(sv_dat,
10844                                                 SvCUR(sv_dat) + sizeof(I32)+1);
10845                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10846                                 pv[count] = RExC_npar;
10847                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10848                             }
10849                         } else {
10850                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
10851                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
10852                                                                 sizeof(I32));
10853                             SvIOK_on(sv_dat);
10854                             SvIV_set(sv_dat, 1);
10855                         }
10856 #ifdef DEBUGGING
10857                         /* Yes this does cause a memory leak in debugging Perls
10858                          * */
10859                         if (!av_store(RExC_paren_name_list,
10860                                       RExC_npar, SvREFCNT_inc(svname)))
10861                             SvREFCNT_dec_NN(svname);
10862 #endif
10863
10864                         /*sv_dump(sv_dat);*/
10865                     }
10866                     nextchar(pRExC_state);
10867                     paren = 1;
10868                     goto capturing_parens;
10869                 }
10870                 RExC_seen |= REG_LOOKBEHIND_SEEN;
10871                 RExC_in_lookbehind++;
10872                 RExC_parse++;
10873                 if (RExC_parse >= RExC_end) {
10874                     vFAIL("Sequence (?... not terminated");
10875                 }
10876
10877                 /* FALLTHROUGH */
10878             case '=':           /* (?=...) */
10879                 RExC_seen_zerolen++;
10880                 break;
10881             case '!':           /* (?!...) */
10882                 RExC_seen_zerolen++;
10883                 /* check if we're really just a "FAIL" assertion */
10884                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10885                                         FALSE /* Don't force to /x */ );
10886                 if (*RExC_parse == ')') {
10887                     ret=reganode(pRExC_state, OPFAIL, 0);
10888                     nextchar(pRExC_state);
10889                     return ret;
10890                 }
10891                 break;
10892             case '|':           /* (?|...) */
10893                 /* branch reset, behave like a (?:...) except that
10894                    buffers in alternations share the same numbers */
10895                 paren = ':';
10896                 after_freeze = freeze_paren = RExC_npar;
10897                 break;
10898             case ':':           /* (?:...) */
10899             case '>':           /* (?>...) */
10900                 break;
10901             case '$':           /* (?$...) */
10902             case '@':           /* (?@...) */
10903                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10904                 break;
10905             case '0' :           /* (?0) */
10906             case 'R' :           /* (?R) */
10907                 if (RExC_parse == RExC_end || *RExC_parse != ')')
10908                     FAIL("Sequence (?R) not terminated");
10909                 num = 0;
10910                 RExC_seen |= REG_RECURSE_SEEN;
10911                 *flagp |= POSTPONED;
10912                 goto gen_recurse_regop;
10913                 /*notreached*/
10914             /* named and numeric backreferences */
10915             case '&':            /* (?&NAME) */
10916                 parse_start = RExC_parse - 1;
10917               named_recursion:
10918                 {
10919                     SV *sv_dat = reg_scan_name(pRExC_state,
10920                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10921                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10922                 }
10923                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
10924                     vFAIL("Sequence (?&... not terminated");
10925                 goto gen_recurse_regop;
10926                 /* NOTREACHED */
10927             case '+':
10928                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10929                     RExC_parse++;
10930                     vFAIL("Illegal pattern");
10931                 }
10932                 goto parse_recursion;
10933                 /* NOTREACHED*/
10934             case '-': /* (?-1) */
10935                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10936                     RExC_parse--; /* rewind to let it be handled later */
10937                     goto parse_flags;
10938                 }
10939                 /* FALLTHROUGH */
10940             case '1': case '2': case '3': case '4': /* (?1) */
10941             case '5': case '6': case '7': case '8': case '9':
10942                 RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
10943               parse_recursion:
10944                 {
10945                     bool is_neg = FALSE;
10946                     UV unum;
10947                     parse_start = RExC_parse - 1; /* MJD */
10948                     if (*RExC_parse == '-') {
10949                         RExC_parse++;
10950                         is_neg = TRUE;
10951                     }
10952                     if (grok_atoUV(RExC_parse, &unum, &endptr)
10953                         && unum <= I32_MAX
10954                     ) {
10955                         num = (I32)unum;
10956                         RExC_parse = (char*)endptr;
10957                     } else
10958                         num = I32_MAX;
10959                     if (is_neg) {
10960                         /* Some limit for num? */
10961                         num = -num;
10962                     }
10963                 }
10964                 if (*RExC_parse!=')')
10965                     vFAIL("Expecting close bracket");
10966
10967               gen_recurse_regop:
10968                 if ( paren == '-' ) {
10969                     /*
10970                     Diagram of capture buffer numbering.
10971                     Top line is the normal capture buffer numbers
10972                     Bottom line is the negative indexing as from
10973                     the X (the (?-2))
10974
10975                     +   1 2    3 4 5 X          6 7
10976                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10977                     -   5 4    3 2 1 X          x x
10978
10979                     */
10980                     num = RExC_npar + num;
10981                     if (num < 1)  {
10982                         RExC_parse++;
10983                         vFAIL("Reference to nonexistent group");
10984                     }
10985                 } else if ( paren == '+' ) {
10986                     num = RExC_npar + num - 1;
10987                 }
10988                 /* We keep track how many GOSUB items we have produced.
10989                    To start off the ARG2L() of the GOSUB holds its "id",
10990                    which is used later in conjunction with RExC_recurse
10991                    to calculate the offset we need to jump for the GOSUB,
10992                    which it will store in the final representation.
10993                    We have to defer the actual calculation until much later
10994                    as the regop may move.
10995                  */
10996
10997                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10998                 if (!SIZE_ONLY) {
10999                     if (num > (I32)RExC_rx->nparens) {
11000                         RExC_parse++;
11001                         vFAIL("Reference to nonexistent group");
11002                     }
11003                     RExC_recurse_count++;
11004                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11005                         "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11006                               22, "|    |", (int)(depth * 2 + 1), "",
11007                               (UV)ARG(ret), (IV)ARG2L(ret)));
11008                 }
11009                 RExC_seen |= REG_RECURSE_SEEN;
11010
11011                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
11012                 Set_Node_Offset(ret, parse_start); /* MJD */
11013
11014                 *flagp |= POSTPONED;
11015                 assert(*RExC_parse == ')');
11016                 nextchar(pRExC_state);
11017                 return ret;
11018
11019             /* NOTREACHED */
11020
11021             case '?':           /* (??...) */
11022                 is_logical = 1;
11023                 if (*RExC_parse != '{') {
11024                     RExC_parse += SKIP_IF_CHAR(RExC_parse);
11025                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11026                     vFAIL2utf8f(
11027                         "Sequence (%" UTF8f "...) not recognized",
11028                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11029                     NOT_REACHED; /*NOTREACHED*/
11030                 }
11031                 *flagp |= POSTPONED;
11032                 paren = '{';
11033                 RExC_parse++;
11034                 /* FALLTHROUGH */
11035             case '{':           /* (?{...}) */
11036             {
11037                 U32 n = 0;
11038                 struct reg_code_block *cb;
11039
11040                 RExC_seen_zerolen++;
11041
11042                 if (   !pRExC_state->code_blocks
11043                     || pRExC_state->code_index
11044                                         >= pRExC_state->code_blocks->count
11045                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11046                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11047                             - RExC_start)
11048                 ) {
11049                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
11050                         FAIL("panic: Sequence (?{...}): no code block found\n");
11051                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
11052                 }
11053                 /* this is a pre-compiled code block (?{...}) */
11054                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11055                 RExC_parse = RExC_start + cb->end;
11056                 if (!SIZE_ONLY) {
11057                     OP *o = cb->block;
11058                     if (cb->src_regex) {
11059                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11060                         RExC_rxi->data->data[n] =
11061                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
11062                         RExC_rxi->data->data[n+1] = (void*)o;
11063                     }
11064                     else {
11065                         n = add_data(pRExC_state,
11066                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11067                         RExC_rxi->data->data[n] = (void*)o;
11068                     }
11069                 }
11070                 pRExC_state->code_index++;
11071                 nextchar(pRExC_state);
11072
11073                 if (is_logical) {
11074                     regnode *eval;
11075                     ret = reg_node(pRExC_state, LOGICAL);
11076
11077                     eval = reg2Lanode(pRExC_state, EVAL,
11078                                        n,
11079
11080                                        /* for later propagation into (??{})
11081                                         * return value */
11082                                        RExC_flags & RXf_PMf_COMPILETIME
11083                                       );
11084                     if (!SIZE_ONLY) {
11085                         ret->flags = 2;
11086                     }
11087                     REGTAIL(pRExC_state, ret, eval);
11088                     /* deal with the length of this later - MJD */
11089                     return ret;
11090                 }
11091                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11092                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
11093                 Set_Node_Offset(ret, parse_start);
11094                 return ret;
11095             }
11096             case '(':           /* (?(?{...})...) and (?(?=...)...) */
11097             {
11098                 int is_define= 0;
11099                 const int DEFINE_len = sizeof("DEFINE") - 1;
11100                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
11101                     if (   RExC_parse < RExC_end - 1
11102                         && (   RExC_parse[1] == '='
11103                             || RExC_parse[1] == '!'
11104                             || RExC_parse[1] == '<'
11105                             || RExC_parse[1] == '{')
11106                     ) { /* Lookahead or eval. */
11107                         I32 flag;
11108                         regnode *tail;
11109
11110                         ret = reg_node(pRExC_state, LOGICAL);
11111                         if (!SIZE_ONLY)
11112                             ret->flags = 1;
11113
11114                         tail = reg(pRExC_state, 1, &flag, depth+1);
11115                         if (flag & (RESTART_PASS1|NEED_UTF8)) {
11116                             *flagp = flag & (RESTART_PASS1|NEED_UTF8);
11117                             return NULL;
11118                         }
11119                         REGTAIL(pRExC_state, ret, tail);
11120                         goto insert_if;
11121                     }
11122                     /* Fall through to ‘Unknown switch condition’ at the
11123                        end of the if/else chain. */
11124                 }
11125                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11126                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11127                 {
11128                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
11129                     char *name_start= RExC_parse++;
11130                     U32 num = 0;
11131                     SV *sv_dat=reg_scan_name(pRExC_state,
11132                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11133                     if (   RExC_parse == name_start
11134                         || RExC_parse >= RExC_end
11135                         || *RExC_parse != ch)
11136                     {
11137                         vFAIL2("Sequence (?(%c... not terminated",
11138                             (ch == '>' ? '<' : ch));
11139                     }
11140                     RExC_parse++;
11141                     if (!SIZE_ONLY) {
11142                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11143                         RExC_rxi->data->data[num]=(void*)sv_dat;
11144                         SvREFCNT_inc_simple_void(sv_dat);
11145                     }
11146                     ret = reganode(pRExC_state,NGROUPP,num);
11147                     goto insert_if_check_paren;
11148                 }
11149                 else if (RExC_end - RExC_parse >= DEFINE_len
11150                         && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
11151                 {
11152                     ret = reganode(pRExC_state,DEFINEP,0);
11153                     RExC_parse += DEFINE_len;
11154                     is_define = 1;
11155                     goto insert_if_check_paren;
11156                 }
11157                 else if (RExC_parse[0] == 'R') {
11158                     RExC_parse++;
11159                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
11160                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11161                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11162                      */
11163                     parno = 0;
11164                     if (RExC_parse[0] == '0') {
11165                         parno = 1;
11166                         RExC_parse++;
11167                     }
11168                     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11169                         UV uv;
11170                         if (grok_atoUV(RExC_parse, &uv, &endptr)
11171                             && uv <= I32_MAX
11172                         ) {
11173                             parno = (I32)uv + 1;
11174                             RExC_parse = (char*)endptr;
11175                         }
11176                         /* else "Switch condition not recognized" below */
11177                     } else if (RExC_parse[0] == '&') {
11178                         SV *sv_dat;
11179                         RExC_parse++;
11180                         sv_dat = reg_scan_name(pRExC_state,
11181                             SIZE_ONLY
11182                             ? REG_RSN_RETURN_NULL
11183                             : REG_RSN_RETURN_DATA);
11184
11185                         /* we should only have a false sv_dat when
11186                          * SIZE_ONLY is true, and we always have false
11187                          * sv_dat when SIZE_ONLY is true.
11188                          * reg_scan_name() will VFAIL() if the name is
11189                          * unknown when SIZE_ONLY is false, and otherwise
11190                          * will return something, and when SIZE_ONLY is
11191                          * true, reg_scan_name() just parses the string,
11192                          * and doesnt return anything. (in theory) */
11193                         assert(SIZE_ONLY ? !sv_dat : !!sv_dat);
11194
11195                         if (sv_dat)
11196                             parno = 1 + *((I32 *)SvPVX(sv_dat));
11197                     }
11198                     ret = reganode(pRExC_state,INSUBP,parno);
11199                     goto insert_if_check_paren;
11200                 }
11201                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11202                     /* (?(1)...) */
11203                     char c;
11204                     UV uv;
11205                     if (grok_atoUV(RExC_parse, &uv, &endptr)
11206                         && uv <= I32_MAX
11207                     ) {
11208                         parno = (I32)uv;
11209                         RExC_parse = (char*)endptr;
11210                     }
11211                     else {
11212                         vFAIL("panic: grok_atoUV returned FALSE");
11213                     }
11214                     ret = reganode(pRExC_state, GROUPP, parno);
11215
11216                  insert_if_check_paren:
11217                     if (UCHARAT(RExC_parse) != ')') {
11218                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11219                         vFAIL("Switch condition not recognized");
11220                     }
11221                     nextchar(pRExC_state);
11222                   insert_if:
11223                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
11224                     br = regbranch(pRExC_state, &flags, 1,depth+1);
11225                     if (br == NULL) {
11226                         if (flags & (RESTART_PASS1|NEED_UTF8)) {
11227                             *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11228                             return NULL;
11229                         }
11230                         FAIL2("panic: regbranch returned NULL, flags=%#" UVxf,
11231                               (UV) flags);
11232                     } else
11233                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
11234                                                           LONGJMP, 0));
11235                     c = UCHARAT(RExC_parse);
11236                     nextchar(pRExC_state);
11237                     if (flags&HASWIDTH)
11238                         *flagp |= HASWIDTH;
11239                     if (c == '|') {
11240                         if (is_define)
11241                             vFAIL("(?(DEFINE)....) does not allow branches");
11242
11243                         /* Fake one for optimizer.  */
11244                         lastbr = reganode(pRExC_state, IFTHEN, 0);
11245
11246                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
11247                             if (flags & (RESTART_PASS1|NEED_UTF8)) {
11248                                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11249                                 return NULL;
11250                             }
11251                             FAIL2("panic: regbranch returned NULL, flags=%#" UVxf,
11252                                   (UV) flags);
11253                         }
11254                         REGTAIL(pRExC_state, ret, lastbr);
11255                         if (flags&HASWIDTH)
11256                             *flagp |= HASWIDTH;
11257                         c = UCHARAT(RExC_parse);
11258                         nextchar(pRExC_state);
11259                     }
11260                     else
11261                         lastbr = NULL;
11262                     if (c != ')') {
11263                         if (RExC_parse >= RExC_end)
11264                             vFAIL("Switch (?(condition)... not terminated");
11265                         else
11266                             vFAIL("Switch (?(condition)... contains too many branches");
11267                     }
11268                     ender = reg_node(pRExC_state, TAIL);
11269                     REGTAIL(pRExC_state, br, ender);
11270                     if (lastbr) {
11271                         REGTAIL(pRExC_state, lastbr, ender);
11272                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11273                     }
11274                     else
11275                         REGTAIL(pRExC_state, ret, ender);
11276                     RExC_size++; /* XXX WHY do we need this?!!
11277                                     For large programs it seems to be required
11278                                     but I can't figure out why. -- dmq*/
11279                     return ret;
11280                 }
11281                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11282                 vFAIL("Unknown switch condition (?(...))");
11283             }
11284             case '[':           /* (?[ ... ]) */
11285                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
11286                                          oregcomp_parse);
11287             case 0: /* A NUL */
11288                 RExC_parse--; /* for vFAIL to print correctly */
11289                 vFAIL("Sequence (? incomplete");
11290                 break;
11291             default: /* e.g., (?i) */
11292                 RExC_parse = (char *) seqstart + 1;
11293               parse_flags:
11294                 parse_lparen_question_flags(pRExC_state);
11295                 if (UCHARAT(RExC_parse) != ':') {
11296                     if (RExC_parse < RExC_end)
11297                         nextchar(pRExC_state);
11298                     *flagp = TRYAGAIN;
11299                     return NULL;
11300                 }
11301                 paren = ':';
11302                 nextchar(pRExC_state);
11303                 ret = NULL;
11304                 goto parse_rest;
11305             } /* end switch */
11306         }
11307         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
11308           capturing_parens:
11309             parno = RExC_npar;
11310             RExC_npar++;
11311
11312             ret = reganode(pRExC_state, OPEN, parno);
11313             if (!SIZE_ONLY ){
11314                 if (!RExC_nestroot)
11315                     RExC_nestroot = parno;
11316                 if (RExC_open_parens && !RExC_open_parens[parno])
11317                 {
11318                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11319                         "%*s%*s Setting open paren #%" IVdf " to %d\n",
11320                         22, "|    |", (int)(depth * 2 + 1), "",
11321                         (IV)parno, REG_NODE_NUM(ret)));
11322                     RExC_open_parens[parno]= ret;
11323                 }
11324             }
11325             Set_Node_Length(ret, 1); /* MJD */
11326             Set_Node_Offset(ret, RExC_parse); /* MJD */
11327             is_open = 1;
11328         } else {
11329             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
11330             paren = ':';
11331             ret = NULL;
11332         }
11333     }
11334     else                        /* ! paren */
11335         ret = NULL;
11336
11337    parse_rest:
11338     /* Pick up the branches, linking them together. */
11339     parse_start = RExC_parse;   /* MJD */
11340     br = regbranch(pRExC_state, &flags, 1,depth+1);
11341
11342     /*     branch_len = (paren != 0); */
11343
11344     if (br == NULL) {
11345         if (flags & (RESTART_PASS1|NEED_UTF8)) {
11346             *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11347             return NULL;
11348         }
11349         FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags);
11350     }
11351     if (*RExC_parse == '|') {
11352         if (!SIZE_ONLY && RExC_extralen) {
11353             reginsert(pRExC_state, BRANCHJ, br, depth+1);
11354         }
11355         else {                  /* MJD */
11356             reginsert(pRExC_state, BRANCH, br, depth+1);
11357             Set_Node_Length(br, paren != 0);
11358             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
11359         }
11360         have_branch = 1;
11361         if (SIZE_ONLY)
11362             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
11363     }
11364     else if (paren == ':') {
11365         *flagp |= flags&SIMPLE;
11366     }
11367     if (is_open) {                              /* Starts with OPEN. */
11368         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
11369     }
11370     else if (paren != '?')              /* Not Conditional */
11371         ret = br;
11372     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11373     lastbr = br;
11374     while (*RExC_parse == '|') {
11375         if (!SIZE_ONLY && RExC_extralen) {
11376             ender = reganode(pRExC_state, LONGJMP,0);
11377
11378             /* Append to the previous. */
11379             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11380         }
11381         if (SIZE_ONLY)
11382             RExC_extralen += 2;         /* Account for LONGJMP. */
11383         nextchar(pRExC_state);
11384         if (freeze_paren) {
11385             if (RExC_npar > after_freeze)
11386                 after_freeze = RExC_npar;
11387             RExC_npar = freeze_paren;
11388         }
11389         br = regbranch(pRExC_state, &flags, 0, depth+1);
11390
11391         if (br == NULL) {
11392             if (flags & (RESTART_PASS1|NEED_UTF8)) {
11393                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11394                 return NULL;
11395             }
11396             FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags);
11397         }
11398         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
11399         lastbr = br;
11400         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11401     }
11402
11403     if (have_branch || paren != ':') {
11404         /* Make a closing node, and hook it on the end. */
11405         switch (paren) {
11406         case ':':
11407             ender = reg_node(pRExC_state, TAIL);
11408             break;
11409         case 1: case 2:
11410             ender = reganode(pRExC_state, CLOSE, parno);
11411             if ( RExC_close_parens ) {
11412                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11413                         "%*s%*s Setting close paren #%" IVdf " to %d\n",
11414                         22, "|    |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
11415                 RExC_close_parens[parno]= ender;
11416                 if (RExC_nestroot == parno)
11417                     RExC_nestroot = 0;
11418             }
11419             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
11420             Set_Node_Length(ender,1); /* MJD */
11421             break;
11422         case '<':
11423         case ',':
11424         case '=':
11425         case '!':
11426             *flagp &= ~HASWIDTH;
11427             /* FALLTHROUGH */
11428         case '>':
11429             ender = reg_node(pRExC_state, SUCCEED);
11430             break;
11431         case 0:
11432             ender = reg_node(pRExC_state, END);
11433             if (!SIZE_ONLY) {
11434                 assert(!RExC_end_op); /* there can only be one! */
11435                 RExC_end_op = ender;
11436                 if (RExC_close_parens) {
11437                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11438                         "%*s%*s Setting close paren #0 (END) to %d\n",
11439                         22, "|    |", (int)(depth * 2 + 1), "", REG_NODE_NUM(ender)));
11440
11441                     RExC_close_parens[0]= ender;
11442                 }
11443             }
11444             break;
11445         }
11446         DEBUG_PARSE_r(if (!SIZE_ONLY) {
11447             DEBUG_PARSE_MSG("lsbr");
11448             regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
11449             regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11450             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
11451                           SvPV_nolen_const(RExC_mysv1),
11452                           (IV)REG_NODE_NUM(lastbr),
11453                           SvPV_nolen_const(RExC_mysv2),
11454                           (IV)REG_NODE_NUM(ender),
11455                           (IV)(ender - lastbr)
11456             );
11457         });
11458         REGTAIL(pRExC_state, lastbr, ender);
11459
11460         if (have_branch && !SIZE_ONLY) {
11461             char is_nothing= 1;
11462             if (depth==1)
11463                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
11464
11465             /* Hook the tails of the branches to the closing node. */
11466             for (br = ret; br; br = regnext(br)) {
11467                 const U8 op = PL_regkind[OP(br)];
11468                 if (op == BRANCH) {
11469                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
11470                     if ( OP(NEXTOPER(br)) != NOTHING
11471                          || regnext(NEXTOPER(br)) != ender)
11472                         is_nothing= 0;
11473                 }
11474                 else if (op == BRANCHJ) {
11475                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
11476                     /* for now we always disable this optimisation * /
11477                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
11478                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
11479                     */
11480                         is_nothing= 0;
11481                 }
11482             }
11483             if (is_nothing) {
11484                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
11485                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
11486                     DEBUG_PARSE_MSG("NADA");
11487                     regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
11488                     regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11489                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
11490                                   SvPV_nolen_const(RExC_mysv1),
11491                                   (IV)REG_NODE_NUM(ret),
11492                                   SvPV_nolen_const(RExC_mysv2),
11493                                   (IV)REG_NODE_NUM(ender),
11494                                   (IV)(ender - ret)
11495                     );
11496                 });
11497                 OP(br)= NOTHING;
11498                 if (OP(ender) == TAIL) {
11499                     NEXT_OFF(br)= 0;
11500                     RExC_emit= br + 1;
11501                 } else {
11502                     regnode *opt;
11503                     for ( opt= br + 1; opt < ender ; opt++ )
11504                         OP(opt)= OPTIMIZED;
11505                     NEXT_OFF(br)= ender - br;
11506                 }
11507             }
11508         }
11509     }
11510
11511     {
11512         const char *p;
11513         static const char parens[] = "=!<,>";
11514
11515         if (paren && (p = strchr(parens, paren))) {
11516             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
11517             int flag = (p - parens) > 1;
11518
11519             if (paren == '>')
11520                 node = SUSPEND, flag = 0;
11521             reginsert(pRExC_state, node,ret, depth+1);
11522             Set_Node_Cur_Length(ret, parse_start);
11523             Set_Node_Offset(ret, parse_start + 1);
11524             ret->flags = flag;
11525             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
11526         }
11527     }
11528
11529     /* Check for proper termination. */
11530     if (paren) {
11531         /* restore original flags, but keep (?p) and, if we've changed from /d
11532          * rules to /u, keep the /u */
11533         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
11534         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
11535             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
11536         }
11537         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
11538             RExC_parse = oregcomp_parse;
11539             vFAIL("Unmatched (");
11540         }
11541         nextchar(pRExC_state);
11542     }
11543     else if (!paren && RExC_parse < RExC_end) {
11544         if (*RExC_parse == ')') {
11545             RExC_parse++;
11546             vFAIL("Unmatched )");
11547         }
11548         else
11549             FAIL("Junk on end of regexp");      /* "Can't happen". */
11550         NOT_REACHED; /* NOTREACHED */
11551     }
11552
11553     if (RExC_in_lookbehind) {
11554         RExC_in_lookbehind--;
11555     }
11556     if (after_freeze > RExC_npar)
11557         RExC_npar = after_freeze;
11558     return(ret);
11559 }
11560
11561 /*
11562  - regbranch - one alternative of an | operator
11563  *
11564  * Implements the concatenation operator.
11565  *
11566  * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11567  * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11568  */
11569 STATIC regnode *
11570 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
11571 {
11572     regnode *ret;
11573     regnode *chain = NULL;
11574     regnode *latest;
11575     I32 flags = 0, c = 0;
11576     GET_RE_DEBUG_FLAGS_DECL;
11577
11578     PERL_ARGS_ASSERT_REGBRANCH;
11579
11580     DEBUG_PARSE("brnc");
11581
11582     if (first)
11583         ret = NULL;
11584     else {
11585         if (!SIZE_ONLY && RExC_extralen)
11586             ret = reganode(pRExC_state, BRANCHJ,0);
11587         else {
11588             ret = reg_node(pRExC_state, BRANCH);
11589             Set_Node_Length(ret, 1);
11590         }
11591     }
11592
11593     if (!first && SIZE_ONLY)
11594         RExC_extralen += 1;                     /* BRANCHJ */
11595
11596     *flagp = WORST;                     /* Tentatively. */
11597
11598     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11599                             FALSE /* Don't force to /x */ );
11600     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
11601         flags &= ~TRYAGAIN;
11602         latest = regpiece(pRExC_state, &flags,depth+1);
11603         if (latest == NULL) {
11604             if (flags & TRYAGAIN)
11605                 continue;
11606             if (flags & (RESTART_PASS1|NEED_UTF8)) {
11607                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11608                 return NULL;
11609             }
11610             FAIL2("panic: regpiece returned NULL, flags=%#" UVxf, (UV) flags);
11611         }
11612         else if (ret == NULL)
11613             ret = latest;
11614         *flagp |= flags&(HASWIDTH|POSTPONED);
11615         if (chain == NULL)      /* First piece. */
11616             *flagp |= flags&SPSTART;
11617         else {
11618             /* FIXME adding one for every branch after the first is probably
11619              * excessive now we have TRIE support. (hv) */
11620             MARK_NAUGHTY(1);
11621             REGTAIL(pRExC_state, chain, latest);
11622         }
11623         chain = latest;
11624         c++;
11625     }
11626     if (chain == NULL) {        /* Loop ran zero times. */
11627         chain = reg_node(pRExC_state, NOTHING);
11628         if (ret == NULL)
11629             ret = chain;
11630     }
11631     if (c == 1) {
11632         *flagp |= flags&SIMPLE;
11633     }
11634
11635     return ret;
11636 }
11637
11638 /*
11639  - regpiece - something followed by possible quantifier * + ? {n,m}
11640  *
11641  * Note that the branching code sequences used for ? and the general cases
11642  * of * and + are somewhat optimized:  they use the same NOTHING node as
11643  * both the endmarker for their branch list and the body of the last branch.
11644  * It might seem that this node could be dispensed with entirely, but the
11645  * endmarker role is not redundant.
11646  *
11647  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
11648  * TRYAGAIN.
11649  * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11650  * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11651  */
11652 STATIC regnode *
11653 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11654 {
11655     regnode *ret;
11656     char op;
11657     char *next;
11658     I32 flags;
11659     const char * const origparse = RExC_parse;
11660     I32 min;
11661     I32 max = REG_INFTY;
11662 #ifdef RE_TRACK_PATTERN_OFFSETS
11663     char *parse_start;
11664 #endif
11665     const char *maxpos = NULL;
11666     UV uv;
11667
11668     /* Save the original in case we change the emitted regop to a FAIL. */
11669     regnode * const orig_emit = RExC_emit;
11670
11671     GET_RE_DEBUG_FLAGS_DECL;
11672
11673     PERL_ARGS_ASSERT_REGPIECE;
11674
11675     DEBUG_PARSE("piec");
11676
11677     ret = regatom(pRExC_state, &flags,depth+1);
11678     if (ret == NULL) {
11679         if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
11680             *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
11681         else
11682             FAIL2("panic: regatom returned NULL, flags=%#" UVxf, (UV) flags);
11683         return(NULL);
11684     }
11685
11686     op = *RExC_parse;
11687
11688     if (op == '{' && regcurly(RExC_parse)) {
11689         maxpos = NULL;
11690 #ifdef RE_TRACK_PATTERN_OFFSETS
11691         parse_start = RExC_parse; /* MJD */
11692 #endif
11693         next = RExC_parse + 1;
11694         while (isDIGIT(*next) || *next == ',') {
11695             if (*next == ',') {
11696                 if (maxpos)
11697                     break;
11698                 else
11699                     maxpos = next;
11700             }
11701             next++;
11702         }
11703         if (*next == '}') {             /* got one */
11704             const char* endptr;
11705             if (!maxpos)
11706                 maxpos = next;
11707             RExC_parse++;
11708             if (isDIGIT(*RExC_parse)) {
11709                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
11710                     vFAIL("Invalid quantifier in {,}");
11711                 if (uv >= REG_INFTY)
11712                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11713                 min = (I32)uv;
11714             } else {
11715                 min = 0;
11716             }
11717             if (*maxpos == ',')
11718                 maxpos++;
11719             else
11720                 maxpos = RExC_parse;
11721             if (isDIGIT(*maxpos)) {
11722                 if (!grok_atoUV(maxpos, &uv, &endptr))
11723                     vFAIL("Invalid quantifier in {,}");
11724                 if (uv >= REG_INFTY)
11725                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11726                 max = (I32)uv;
11727             } else {
11728                 max = REG_INFTY;                /* meaning "infinity" */
11729             }
11730             RExC_parse = next;
11731             nextchar(pRExC_state);
11732             if (max < min) {    /* If can't match, warn and optimize to fail
11733                                    unconditionally */
11734                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
11735                 if (PASS2) {
11736                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
11737                     NEXT_OFF(orig_emit)= regarglen[OPFAIL] + NODE_STEP_REGNODE;
11738                 }
11739                 return ret;
11740             }
11741             else if (min == max && *RExC_parse == '?')
11742             {
11743                 if (PASS2) {
11744                     ckWARN2reg(RExC_parse + 1,
11745                                "Useless use of greediness modifier '%c'",
11746                                *RExC_parse);
11747                 }
11748             }
11749
11750           do_curly:
11751             if ((flags&SIMPLE)) {
11752                 if (min == 0 && max == REG_INFTY) {
11753                     reginsert(pRExC_state, STAR, ret, depth+1);
11754                     ret->flags = 0;
11755                     MARK_NAUGHTY(4);
11756                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11757                     goto nest_check;
11758                 }
11759                 if (min == 1 && max == REG_INFTY) {
11760                     reginsert(pRExC_state, PLUS, ret, depth+1);
11761                     ret->flags = 0;
11762                     MARK_NAUGHTY(3);
11763                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11764                     goto nest_check;
11765                 }
11766                 MARK_NAUGHTY_EXP(2, 2);
11767                 reginsert(pRExC_state, CURLY, ret, depth+1);
11768                 Set_Node_Offset(ret, parse_start+1); /* MJD */
11769                 Set_Node_Cur_Length(ret, parse_start);
11770             }
11771             else {
11772                 regnode * const w = reg_node(pRExC_state, WHILEM);
11773
11774                 w->flags = 0;
11775                 REGTAIL(pRExC_state, ret, w);
11776                 if (!SIZE_ONLY && RExC_extralen) {
11777                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
11778                     reginsert(pRExC_state, NOTHING,ret, depth+1);
11779                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
11780                 }
11781                 reginsert(pRExC_state, CURLYX,ret, depth+1);
11782                                 /* MJD hk */
11783                 Set_Node_Offset(ret, parse_start+1);
11784                 Set_Node_Length(ret,
11785                                 op == '{' ? (RExC_parse - parse_start) : 1);
11786
11787                 if (!SIZE_ONLY && RExC_extralen)
11788                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
11789                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
11790                 if (SIZE_ONLY)
11791                     RExC_whilem_seen++, RExC_extralen += 3;
11792                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
11793             }
11794             ret->flags = 0;
11795
11796             if (min > 0)
11797                 *flagp = WORST;
11798             if (max > 0)
11799                 *flagp |= HASWIDTH;
11800             if (!SIZE_ONLY) {
11801                 ARG1_SET(ret, (U16)min);
11802                 ARG2_SET(ret, (U16)max);
11803             }
11804             if (max == REG_INFTY)
11805                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11806
11807             goto nest_check;
11808         }
11809     }
11810
11811     if (!ISMULT1(op)) {
11812         *flagp = flags;
11813         return(ret);
11814     }
11815
11816 #if 0                           /* Now runtime fix should be reliable. */
11817
11818     /* if this is reinstated, don't forget to put this back into perldiag:
11819
11820             =item Regexp *+ operand could be empty at {#} in regex m/%s/
11821
11822            (F) The part of the regexp subject to either the * or + quantifier
11823            could match an empty string. The {#} shows in the regular
11824            expression about where the problem was discovered.
11825
11826     */
11827
11828     if (!(flags&HASWIDTH) && op != '?')
11829       vFAIL("Regexp *+ operand could be empty");
11830 #endif
11831
11832 #ifdef RE_TRACK_PATTERN_OFFSETS
11833     parse_start = RExC_parse;
11834 #endif
11835     nextchar(pRExC_state);
11836
11837     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
11838
11839     if (op == '*') {
11840         min = 0;
11841         goto do_curly;
11842     }
11843     else if (op == '+') {
11844         min = 1;
11845         goto do_curly;
11846     }
11847     else if (op == '?') {
11848         min = 0; max = 1;
11849         goto do_curly;
11850     }
11851   nest_check:
11852     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
11853         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11854         ckWARN2reg(RExC_parse,
11855                    "%" UTF8f " matches null string many times",
11856                    UTF8fARG(UTF, (RExC_parse >= origparse
11857                                  ? RExC_parse - origparse
11858                                  : 0),
11859                    origparse));
11860         (void)ReREFCNT_inc(RExC_rx_sv);
11861     }
11862
11863     if (*RExC_parse == '?') {
11864         nextchar(pRExC_state);
11865         reginsert(pRExC_state, MINMOD, ret, depth+1);
11866         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11867     }
11868     else if (*RExC_parse == '+') {
11869         regnode *ender;
11870         nextchar(pRExC_state);
11871         ender = reg_node(pRExC_state, SUCCEED);
11872         REGTAIL(pRExC_state, ret, ender);
11873         reginsert(pRExC_state, SUSPEND, ret, depth+1);
11874         ret->flags = 0;
11875         ender = reg_node(pRExC_state, TAIL);
11876         REGTAIL(pRExC_state, ret, ender);
11877     }
11878
11879     if (ISMULT2(RExC_parse)) {
11880         RExC_parse++;
11881         vFAIL("Nested quantifiers");
11882     }
11883
11884     return(ret);
11885 }
11886
11887 STATIC bool
11888 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11889                 regnode ** node_p,
11890                 UV * code_point_p,
11891                 int * cp_count,
11892                 I32 * flagp,
11893                 const bool strict,
11894                 const U32 depth
11895     )
11896 {
11897  /* This routine teases apart the various meanings of \N and returns
11898   * accordingly.  The input parameters constrain which meaning(s) is/are valid
11899   * in the current context.
11900   *
11901   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11902   *
11903   * If <code_point_p> is not NULL, the context is expecting the result to be a
11904   * single code point.  If this \N instance turns out to a single code point,
11905   * the function returns TRUE and sets *code_point_p to that code point.
11906   *
11907   * If <node_p> is not NULL, the context is expecting the result to be one of
11908   * the things representable by a regnode.  If this \N instance turns out to be
11909   * one such, the function generates the regnode, returns TRUE and sets *node_p
11910   * to point to that regnode.
11911   *
11912   * If this instance of \N isn't legal in any context, this function will
11913   * generate a fatal error and not return.
11914   *
11915   * On input, RExC_parse should point to the first char following the \N at the
11916   * time of the call.  On successful return, RExC_parse will have been updated
11917   * to point to just after the sequence identified by this routine.  Also
11918   * *flagp has been updated as needed.
11919   *
11920   * When there is some problem with the current context and this \N instance,
11921   * the function returns FALSE, without advancing RExC_parse, nor setting
11922   * *node_p, nor *code_point_p, nor *flagp.
11923   *
11924   * If <cp_count> is not NULL, the caller wants to know the length (in code
11925   * points) that this \N sequence matches.  This is set even if the function
11926   * returns FALSE, as detailed below.
11927   *
11928   * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11929   *
11930   * Probably the most common case is for the \N to specify a single code point.
11931   * *cp_count will be set to 1, and *code_point_p will be set to that code
11932   * point.
11933   *
11934   * Another possibility is for the input to be an empty \N{}, which for
11935   * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
11936   * will be set to a generated NOTHING node.
11937   *
11938   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11939   * set to 0. *node_p will be set to a generated REG_ANY node.
11940   *
11941   * The fourth possibility is that \N resolves to a sequence of more than one
11942   * code points.  *cp_count will be set to the number of code points in the
11943   * sequence. *node_p * will be set to a generated node returned by this
11944   * function calling S_reg().
11945   *
11946   * The final possibility is that it is premature to be calling this function;
11947   * that pass1 needs to be restarted.  This can happen when this changes from
11948   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
11949   * latter occurs only when the fourth possibility would otherwise be in
11950   * effect, and is because one of those code points requires the pattern to be
11951   * recompiled as UTF-8.  The function returns FALSE, and sets the
11952   * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate.  When this
11953   * happens, the caller needs to desist from continuing parsing, and return
11954   * this information to its caller.  This is not set for when there is only one
11955   * code point, as this can be called as part of an ANYOF node, and they can
11956   * store above-Latin1 code points without the pattern having to be in UTF-8.
11957   *
11958   * For non-single-quoted regexes, the tokenizer has resolved character and
11959   * sequence names inside \N{...} into their Unicode values, normalizing the
11960   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11961   * hex-represented code points in the sequence.  This is done there because
11962   * the names can vary based on what charnames pragma is in scope at the time,
11963   * so we need a way to take a snapshot of what they resolve to at the time of
11964   * the original parse. [perl #56444].
11965   *
11966   * That parsing is skipped for single-quoted regexes, so we may here get
11967   * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
11968   * parser.  But if the single-quoted regex is something like '\N{U+41}', that
11969   * is legal and handled here.  The code point is Unicode, and has to be
11970   * translated into the native character set for non-ASCII platforms.
11971   */
11972
11973     char * endbrace;    /* points to '}' following the name */
11974     char *endchar;      /* Points to '.' or '}' ending cur char in the input
11975                            stream */
11976     char* p = RExC_parse; /* Temporary */
11977
11978     GET_RE_DEBUG_FLAGS_DECL;
11979
11980     PERL_ARGS_ASSERT_GROK_BSLASH_N;
11981
11982     GET_RE_DEBUG_FLAGS;
11983
11984     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
11985     assert(! (node_p && cp_count));               /* At most 1 should be set */
11986
11987     if (cp_count) {     /* Initialize return for the most common case */
11988         *cp_count = 1;
11989     }
11990
11991     /* The [^\n] meaning of \N ignores spaces and comments under the /x
11992      * modifier.  The other meanings do not, so use a temporary until we find
11993      * out which we are being called with */
11994     skip_to_be_ignored_text(pRExC_state, &p,
11995                             FALSE /* Don't force to /x */ );
11996
11997     /* Disambiguate between \N meaning a named character versus \N meaning
11998      * [^\n].  The latter is assumed when the {...} following the \N is a legal
11999      * quantifier, or there is no '{' at all */
12000     if (*p != '{' || regcurly(p)) {
12001         RExC_parse = p;
12002         if (cp_count) {
12003             *cp_count = -1;
12004         }
12005
12006         if (! node_p) {
12007             return FALSE;
12008         }
12009
12010         *node_p = reg_node(pRExC_state, REG_ANY);
12011         *flagp |= HASWIDTH|SIMPLE;
12012         MARK_NAUGHTY(1);
12013         Set_Node_Length(*node_p, 1); /* MJD */
12014         return TRUE;
12015     }
12016
12017     /* Here, we have decided it should be a named character or sequence */
12018
12019     /* The test above made sure that the next real character is a '{', but
12020      * under the /x modifier, it could be separated by space (or a comment and
12021      * \n) and this is not allowed (for consistency with \x{...} and the
12022      * tokenizer handling of \N{NAME}). */
12023     if (*RExC_parse != '{') {
12024         vFAIL("Missing braces on \\N{}");
12025     }
12026
12027     RExC_parse++;       /* Skip past the '{' */
12028
12029     if (! (endbrace = strchr(RExC_parse, '}'))) { /* no trailing brace */
12030         vFAIL2("Missing right brace on \\%c{}", 'N');
12031     }
12032     else if(!(endbrace == RExC_parse            /* nothing between the {} */
12033               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked... */
12034                   && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
12035                                                        error msg) */
12036     {
12037         RExC_parse = endbrace;  /* position msg's '<--HERE' */
12038         vFAIL("\\N{NAME} must be resolved by the lexer");
12039     }
12040
12041     REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
12042                                         semantics */
12043
12044     if (endbrace == RExC_parse) {   /* empty: \N{} */
12045         if (strict) {
12046             RExC_parse++;   /* Position after the "}" */
12047             vFAIL("Zero length \\N{}");
12048         }
12049         if (cp_count) {
12050             *cp_count = 0;
12051         }
12052         nextchar(pRExC_state);
12053         if (! node_p) {
12054             return FALSE;
12055         }
12056
12057         *node_p = reg_node(pRExC_state,NOTHING);
12058         return TRUE;
12059     }
12060
12061     RExC_parse += 2;    /* Skip past the 'U+' */
12062
12063     /* Because toke.c has generated a special construct for us guaranteed not
12064      * to have NULs, we can use a str function */
12065     endchar = RExC_parse + strcspn(RExC_parse, ".}");
12066
12067     /* Code points are separated by dots.  If none, there is only one code
12068      * point, and is terminated by the brace */
12069
12070     if (endchar >= endbrace) {
12071         STRLEN length_of_hex;
12072         I32 grok_hex_flags;
12073
12074         /* Here, exactly one code point.  If that isn't what is wanted, fail */
12075         if (! code_point_p) {
12076             RExC_parse = p;
12077             return FALSE;
12078         }
12079
12080         /* Convert code point from hex */
12081         length_of_hex = (STRLEN)(endchar - RExC_parse);
12082         grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
12083                            | PERL_SCAN_DISALLOW_PREFIX
12084
12085                              /* No errors in the first pass (See [perl
12086                               * #122671].)  We let the code below find the
12087                               * errors when there are multiple chars. */
12088                            | ((SIZE_ONLY)
12089                               ? PERL_SCAN_SILENT_ILLDIGIT
12090                               : 0);
12091
12092         /* This routine is the one place where both single- and double-quotish
12093          * \N{U+xxxx} are evaluated.  The value is a Unicode code point which
12094          * must be converted to native. */
12095         *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
12096                                          &length_of_hex,
12097                                          &grok_hex_flags,
12098                                          NULL));
12099
12100         /* The tokenizer should have guaranteed validity, but it's possible to
12101          * bypass it by using single quoting, so check.  Don't do the check
12102          * here when there are multiple chars; we do it below anyway. */
12103         if (length_of_hex == 0
12104             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
12105         {
12106             RExC_parse += length_of_hex;        /* Includes all the valid */
12107             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
12108                             ? UTF8SKIP(RExC_parse)
12109                             : 1;
12110             /* Guard against malformed utf8 */
12111             if (RExC_parse >= endchar) {
12112                 RExC_parse = endchar;
12113             }
12114             vFAIL("Invalid hexadecimal number in \\N{U+...}");
12115         }
12116
12117         RExC_parse = endbrace + 1;
12118         return TRUE;
12119     }
12120     else {  /* Is a multiple character sequence */
12121         SV * substitute_parse;
12122         STRLEN len;
12123         char *orig_end = RExC_end;
12124         char *save_start = RExC_start;
12125         I32 flags;
12126
12127         /* Count the code points, if desired, in the sequence */
12128         if (cp_count) {
12129             *cp_count = 0;
12130             while (RExC_parse < endbrace) {
12131                 /* Point to the beginning of the next character in the sequence. */
12132                 RExC_parse = endchar + 1;
12133                 endchar = RExC_parse + strcspn(RExC_parse, ".}");
12134                 (*cp_count)++;
12135             }
12136         }
12137
12138         /* Fail if caller doesn't want to handle a multi-code-point sequence.
12139          * But don't backup up the pointer if the caller want to know how many
12140          * code points there are (they can then handle things) */
12141         if (! node_p) {
12142             if (! cp_count) {
12143                 RExC_parse = p;
12144             }
12145             return FALSE;
12146         }
12147
12148         /* What is done here is to convert this to a sub-pattern of the form
12149          * \x{char1}\x{char2}...  and then call reg recursively to parse it
12150          * (enclosing in "(?: ... )" ).  That way, it retains its atomicness,
12151          * while not having to worry about special handling that some code
12152          * points may have. */
12153
12154         substitute_parse = newSVpvs("?:");
12155
12156         while (RExC_parse < endbrace) {
12157
12158             /* Convert to notation the rest of the code understands */
12159             sv_catpv(substitute_parse, "\\x{");
12160             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
12161             sv_catpv(substitute_parse, "}");
12162
12163             /* Point to the beginning of the next character in the sequence. */
12164             RExC_parse = endchar + 1;
12165             endchar = RExC_parse + strcspn(RExC_parse, ".}");
12166
12167         }
12168         sv_catpv(substitute_parse, ")");
12169
12170         RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
12171                                                              len);
12172
12173         /* Don't allow empty number */
12174         if (len < (STRLEN) 8) {
12175             RExC_parse = endbrace;
12176             vFAIL("Invalid hexadecimal number in \\N{U+...}");
12177         }
12178         RExC_end = RExC_parse + len;
12179
12180         /* The values are Unicode, and therefore not subject to recoding, but
12181          * have to be converted to native on a non-Unicode (meaning non-ASCII)
12182          * platform. */
12183 #ifdef EBCDIC
12184         RExC_recode_x_to_native = 1;
12185 #endif
12186
12187         if (node_p) {
12188             if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
12189                 if (flags & (RESTART_PASS1|NEED_UTF8)) {
12190                     *flagp = flags & (RESTART_PASS1|NEED_UTF8);
12191                     return FALSE;
12192                 }
12193                 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#" UVxf,
12194                     (UV) flags);
12195             }
12196             *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12197         }
12198
12199         /* Restore the saved values */
12200         RExC_start = RExC_adjusted_start = save_start;
12201         RExC_parse = endbrace;
12202         RExC_end = orig_end;
12203 #ifdef EBCDIC
12204         RExC_recode_x_to_native = 0;
12205 #endif
12206
12207         SvREFCNT_dec_NN(substitute_parse);
12208         nextchar(pRExC_state);
12209
12210         return TRUE;
12211     }
12212 }
12213
12214
12215 PERL_STATIC_INLINE U8
12216 S_compute_EXACTish(RExC_state_t *pRExC_state)
12217 {
12218     U8 op;
12219
12220     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
12221
12222     if (! FOLD) {
12223         return (LOC)
12224                 ? EXACTL
12225                 : EXACT;
12226     }
12227
12228     op = get_regex_charset(RExC_flags);
12229     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
12230         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
12231                  been, so there is no hole */
12232     }
12233
12234     return op + EXACTF;
12235 }
12236
12237 PERL_STATIC_INLINE void
12238 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
12239                          regnode *node, I32* flagp, STRLEN len, UV code_point,
12240                          bool downgradable)
12241 {
12242     /* This knows the details about sizing an EXACTish node, setting flags for
12243      * it (by setting <*flagp>, and potentially populating it with a single
12244      * character.
12245      *
12246      * If <len> (the length in bytes) is non-zero, this function assumes that
12247      * the node has already been populated, and just does the sizing.  In this
12248      * case <code_point> should be the final code point that has already been
12249      * placed into the node.  This value will be ignored except that under some
12250      * circumstances <*flagp> is set based on it.
12251      *
12252      * If <len> is zero, the function assumes that the node is to contain only
12253      * the single character given by <code_point> and calculates what <len>
12254      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
12255      * additionally will populate the node's STRING with <code_point> or its
12256      * fold if folding.
12257      *
12258      * In both cases <*flagp> is appropriately set
12259      *
12260      * It knows that under FOLD, the Latin Sharp S and UTF characters above
12261      * 255, must be folded (the former only when the rules indicate it can
12262      * match 'ss')
12263      *
12264      * When it does the populating, it looks at the flag 'downgradable'.  If
12265      * true with a node that folds, it checks if the single code point
12266      * participates in a fold, and if not downgrades the node to an EXACT.
12267      * This helps the optimizer */
12268
12269     bool len_passed_in = cBOOL(len != 0);
12270     U8 character[UTF8_MAXBYTES_CASE+1];
12271
12272     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
12273
12274     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
12275      * sizing difference, and is extra work that is thrown away */
12276     if (downgradable && ! PASS2) {
12277         downgradable = FALSE;
12278     }
12279
12280     if (! len_passed_in) {
12281         if (UTF) {
12282             if (UVCHR_IS_INVARIANT(code_point)) {
12283                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
12284                     *character = (U8) code_point;
12285                 }
12286                 else { /* Here is /i and not /l. (toFOLD() is defined on just
12287                           ASCII, which isn't the same thing as INVARIANT on
12288                           EBCDIC, but it works there, as the extra invariants
12289                           fold to themselves) */
12290                     *character = toFOLD((U8) code_point);
12291
12292                     /* We can downgrade to an EXACT node if this character
12293                      * isn't a folding one.  Note that this assumes that
12294                      * nothing above Latin1 folds to some other invariant than
12295                      * one of these alphabetics; otherwise we would also have
12296                      * to check:
12297                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12298                      *      || ASCII_FOLD_RESTRICTED))
12299                      */
12300                     if (downgradable && PL_fold[code_point] == code_point) {
12301                         OP(node) = EXACT;
12302                     }
12303                 }
12304                 len = 1;
12305             }
12306             else if (FOLD && (! LOC
12307                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
12308             {   /* Folding, and ok to do so now */
12309                 UV folded = _to_uni_fold_flags(
12310                                    code_point,
12311                                    character,
12312                                    &len,
12313                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12314                                                       ? FOLD_FLAGS_NOMIX_ASCII
12315                                                       : 0));
12316                 if (downgradable
12317                     && folded == code_point /* This quickly rules out many
12318                                                cases, avoiding the
12319                                                _invlist_contains_cp() overhead
12320                                                for those.  */
12321                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
12322                 {
12323                     OP(node) = (LOC)
12324                                ? EXACTL
12325                                : EXACT;
12326                 }
12327             }
12328             else if (code_point <= MAX_UTF8_TWO_BYTE) {
12329
12330                 /* Not folding this cp, and can output it directly */
12331                 *character = UTF8_TWO_BYTE_HI(code_point);
12332                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
12333                 len = 2;
12334             }
12335             else {
12336                 uvchr_to_utf8( character, code_point);
12337                 len = UTF8SKIP(character);
12338             }
12339         } /* Else pattern isn't UTF8.  */
12340         else if (! FOLD) {
12341             *character = (U8) code_point;
12342             len = 1;
12343         } /* Else is folded non-UTF8 */
12344 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
12345    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
12346                                       || UNICODE_DOT_DOT_VERSION > 0)
12347         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
12348 #else
12349         else if (1) {
12350 #endif
12351             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
12352              * comments at join_exact()); */
12353             *character = (U8) code_point;
12354             len = 1;
12355
12356             /* Can turn into an EXACT node if we know the fold at compile time,
12357              * and it folds to itself and doesn't particpate in other folds */
12358             if (downgradable
12359                 && ! LOC
12360                 && PL_fold_latin1[code_point] == code_point
12361                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12362                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
12363             {
12364                 OP(node) = EXACT;
12365             }
12366         } /* else is Sharp s.  May need to fold it */
12367         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
12368             *character = 's';
12369             *(character + 1) = 's';
12370             len = 2;
12371         }
12372         else {
12373             *character = LATIN_SMALL_LETTER_SHARP_S;
12374             len = 1;
12375         }
12376     }
12377
12378     if (SIZE_ONLY) {
12379         RExC_size += STR_SZ(len);
12380     }
12381     else {
12382         RExC_emit += STR_SZ(len);
12383         STR_LEN(node) = len;
12384         if (! len_passed_in) {
12385             Copy((char *) character, STRING(node), len, char);
12386         }
12387     }
12388
12389     *flagp |= HASWIDTH;
12390
12391     /* A single character node is SIMPLE, except for the special-cased SHARP S
12392      * under /di. */
12393     if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
12394 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
12395    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
12396                                       || UNICODE_DOT_DOT_VERSION > 0)
12397         && ( code_point != LATIN_SMALL_LETTER_SHARP_S
12398             || ! FOLD || ! DEPENDS_SEMANTICS)
12399 #endif
12400     ) {
12401         *flagp |= SIMPLE;
12402     }
12403
12404     /* The OP may not be well defined in PASS1 */
12405     if (PASS2 && OP(node) == EXACTFL) {
12406         RExC_contains_locale = 1;
12407     }
12408 }
12409
12410
12411 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
12412  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
12413
12414 static I32
12415 S_backref_value(char *p)
12416 {
12417     const char* endptr;
12418     UV val;
12419     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
12420         return (I32)val;
12421     return I32_MAX;
12422 }
12423
12424
12425 /*
12426  - regatom - the lowest level
12427
12428    Try to identify anything special at the start of the current parse position.
12429    If there is, then handle it as required. This may involve generating a
12430    single regop, such as for an assertion; or it may involve recursing, such as
12431    to handle a () structure.
12432
12433    If the string doesn't start with something special then we gobble up
12434    as much literal text as we can.  If we encounter a quantifier, we have to
12435    back off the final literal character, as that quantifier applies to just it
12436    and not to the whole string of literals.
12437
12438    Once we have been able to handle whatever type of thing started the
12439    sequence, we return.
12440
12441    Note: we have to be careful with escapes, as they can be both literal
12442    and special, and in the case of \10 and friends, context determines which.
12443
12444    A summary of the code structure is:
12445
12446    switch (first_byte) {
12447         cases for each special:
12448             handle this special;
12449             break;
12450         case '\\':
12451             switch (2nd byte) {
12452                 cases for each unambiguous special:
12453                     handle this special;
12454                     break;
12455                 cases for each ambigous special/literal:
12456                     disambiguate;
12457                     if (special)  handle here
12458                     else goto defchar;
12459                 default: // unambiguously literal:
12460                     goto defchar;
12461             }
12462         default:  // is a literal char
12463             // FALL THROUGH
12464         defchar:
12465             create EXACTish node for literal;
12466             while (more input and node isn't full) {
12467                 switch (input_byte) {
12468                    cases for each special;
12469                        make sure parse pointer is set so that the next call to
12470                            regatom will see this special first
12471                        goto loopdone; // EXACTish node terminated by prev. char
12472                    default:
12473                        append char to EXACTISH node;
12474                 }
12475                 get next input byte;
12476             }
12477         loopdone:
12478    }
12479    return the generated node;
12480
12481    Specifically there are two separate switches for handling
12482    escape sequences, with the one for handling literal escapes requiring
12483    a dummy entry for all of the special escapes that are actually handled
12484    by the other.
12485
12486    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
12487    TRYAGAIN.
12488    Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
12489    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
12490    Otherwise does not return NULL.
12491 */
12492
12493 STATIC regnode *
12494 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12495 {
12496     regnode *ret = NULL;
12497     I32 flags = 0;
12498     char *parse_start;
12499     U8 op;
12500     int invert = 0;
12501     U8 arg;
12502
12503     GET_RE_DEBUG_FLAGS_DECL;
12504
12505     *flagp = WORST;             /* Tentatively. */
12506
12507     DEBUG_PARSE("atom");
12508
12509     PERL_ARGS_ASSERT_REGATOM;
12510
12511   tryagain:
12512     parse_start = RExC_parse;
12513     assert(RExC_parse < RExC_end);
12514     switch ((U8)*RExC_parse) {
12515     case '^':
12516         RExC_seen_zerolen++;
12517         nextchar(pRExC_state);
12518         if (RExC_flags & RXf_PMf_MULTILINE)
12519             ret = reg_node(pRExC_state, MBOL);
12520         else
12521             ret = reg_node(pRExC_state, SBOL);
12522         Set_Node_Length(ret, 1); /* MJD */
12523         break;
12524     case '$':
12525         nextchar(pRExC_state);
12526         if (*RExC_parse)
12527             RExC_seen_zerolen++;
12528         if (RExC_flags & RXf_PMf_MULTILINE)
12529             ret = reg_node(pRExC_state, MEOL);
12530         else
12531             ret = reg_node(pRExC_state, SEOL);
12532         Set_Node_Length(ret, 1); /* MJD */
12533         break;
12534     case '.':
12535         nextchar(pRExC_state);
12536         if (RExC_flags & RXf_PMf_SINGLELINE)
12537             ret = reg_node(pRExC_state, SANY);
12538         else
12539             ret = reg_node(pRExC_state, REG_ANY);
12540         *flagp |= HASWIDTH|SIMPLE;
12541         MARK_NAUGHTY(1);
12542         Set_Node_Length(ret, 1); /* MJD */
12543         break;
12544     case '[':
12545     {
12546         char * const oregcomp_parse = ++RExC_parse;
12547         ret = regclass(pRExC_state, flagp,depth+1,
12548                        FALSE, /* means parse the whole char class */
12549                        TRUE, /* allow multi-char folds */
12550                        FALSE, /* don't silence non-portable warnings. */
12551                        (bool) RExC_strict,
12552                        TRUE, /* Allow an optimized regnode result */
12553                        NULL,
12554                        NULL);
12555         if (ret == NULL) {
12556             if (*flagp & (RESTART_PASS1|NEED_UTF8))
12557                 return NULL;
12558             FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf,
12559                   (UV) *flagp);
12560         }
12561         if (*RExC_parse != ']') {
12562             RExC_parse = oregcomp_parse;
12563             vFAIL("Unmatched [");
12564         }
12565         nextchar(pRExC_state);
12566         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
12567         break;
12568     }
12569     case '(':
12570         nextchar(pRExC_state);
12571         ret = reg(pRExC_state, 2, &flags,depth+1);
12572         if (ret == NULL) {
12573                 if (flags & TRYAGAIN) {
12574                     if (RExC_parse >= RExC_end) {
12575                          /* Make parent create an empty node if needed. */
12576                         *flagp |= TRYAGAIN;
12577                         return(NULL);
12578                     }
12579                     goto tryagain;
12580                 }
12581                 if (flags & (RESTART_PASS1|NEED_UTF8)) {
12582                     *flagp = flags & (RESTART_PASS1|NEED_UTF8);
12583                     return NULL;
12584                 }
12585                 FAIL2("panic: reg returned NULL to regatom, flags=%#" UVxf,
12586                                                                  (UV) flags);
12587         }
12588         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12589         break;
12590     case '|':
12591     case ')':
12592         if (flags & TRYAGAIN) {
12593             *flagp |= TRYAGAIN;
12594             return NULL;
12595         }
12596         vFAIL("Internal urp");
12597                                 /* Supposed to be caught earlier. */
12598         break;
12599     case '?':
12600     case '+':
12601     case '*':
12602         RExC_parse++;
12603         vFAIL("Quantifier follows nothing");
12604         break;
12605     case '\\':
12606         /* Special Escapes
12607
12608            This switch handles escape sequences that resolve to some kind
12609            of special regop and not to literal text. Escape sequnces that
12610            resolve to literal text are handled below in the switch marked
12611            "Literal Escapes".
12612
12613            Every entry in this switch *must* have a corresponding entry
12614            in the literal escape switch. However, the opposite is not
12615            required, as the default for this switch is to jump to the
12616            literal text handling code.
12617         */
12618         RExC_parse++;
12619         switch ((U8)*RExC_parse) {
12620         /* Special Escapes */
12621         case 'A':
12622             RExC_seen_zerolen++;
12623             ret = reg_node(pRExC_state, SBOL);
12624             /* SBOL is shared with /^/ so we set the flags so we can tell
12625              * /\A/ from /^/ in split. We check ret because first pass we
12626              * have no regop struct to set the flags on. */
12627             if (PASS2)
12628                 ret->flags = 1;
12629             *flagp |= SIMPLE;
12630             goto finish_meta_pat;
12631         case 'G':
12632             ret = reg_node(pRExC_state, GPOS);
12633             RExC_seen |= REG_GPOS_SEEN;
12634             *flagp |= SIMPLE;
12635             goto finish_meta_pat;
12636         case 'K':
12637             RExC_seen_zerolen++;
12638             ret = reg_node(pRExC_state, KEEPS);
12639             *flagp |= SIMPLE;
12640             /* XXX:dmq : disabling in-place substitution seems to
12641              * be necessary here to avoid cases of memory corruption, as
12642              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
12643              */
12644             RExC_seen |= REG_LOOKBEHIND_SEEN;
12645             goto finish_meta_pat;
12646         case 'Z':
12647             ret = reg_node(pRExC_state, SEOL);
12648             *flagp |= SIMPLE;
12649             RExC_seen_zerolen++;                /* Do not optimize RE away */
12650             goto finish_meta_pat;
12651         case 'z':
12652             ret = reg_node(pRExC_state, EOS);
12653             *flagp |= SIMPLE;
12654             RExC_seen_zerolen++;                /* Do not optimize RE away */
12655             goto finish_meta_pat;
12656         case 'C':
12657             vFAIL("\\C no longer supported");
12658         case 'X':
12659             ret = reg_node(pRExC_state, CLUMP);
12660             *flagp |= HASWIDTH;
12661             goto finish_meta_pat;
12662
12663         case 'W':
12664             invert = 1;
12665             /* FALLTHROUGH */
12666         case 'w':
12667             arg = ANYOF_WORDCHAR;
12668             goto join_posix;
12669
12670         case 'B':
12671             invert = 1;
12672             /* FALLTHROUGH */
12673         case 'b':
12674           {
12675             regex_charset charset = get_regex_charset(RExC_flags);
12676
12677             RExC_seen_zerolen++;
12678             RExC_seen |= REG_LOOKBEHIND_SEEN;
12679             op = BOUND + charset;
12680
12681             if (op == BOUNDL) {
12682                 RExC_contains_locale = 1;
12683             }
12684
12685             ret = reg_node(pRExC_state, op);
12686             *flagp |= SIMPLE;
12687             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
12688                 FLAGS(ret) = TRADITIONAL_BOUND;
12689                 if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
12690                     OP(ret) = BOUNDA;
12691                 }
12692             }
12693             else {
12694                 STRLEN length;
12695                 char name = *RExC_parse;
12696                 char * endbrace;
12697                 RExC_parse += 2;
12698                 endbrace = strchr(RExC_parse, '}');
12699
12700                 if (! endbrace) {
12701                     vFAIL2("Missing right brace on \\%c{}", name);
12702                 }
12703                 /* XXX Need to decide whether to take spaces or not.  Should be
12704                  * consistent with \p{}, but that currently is SPACE, which
12705                  * means vertical too, which seems wrong
12706                  * while (isBLANK(*RExC_parse)) {
12707                     RExC_parse++;
12708                 }*/
12709                 if (endbrace == RExC_parse) {
12710                     RExC_parse++;  /* After the '}' */
12711                     vFAIL2("Empty \\%c{}", name);
12712                 }
12713                 length = endbrace - RExC_parse;
12714                 /*while (isBLANK(*(RExC_parse + length - 1))) {
12715                     length--;
12716                 }*/
12717                 switch (*RExC_parse) {
12718                     case 'g':
12719                         if (length != 1
12720                             && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
12721                         {
12722                             goto bad_bound_type;
12723                         }
12724                         FLAGS(ret) = GCB_BOUND;
12725                         break;
12726                     case 'l':
12727                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12728                             goto bad_bound_type;
12729                         }
12730                         FLAGS(ret) = LB_BOUND;
12731                         break;
12732                     case 's':
12733                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12734                             goto bad_bound_type;
12735                         }
12736                         FLAGS(ret) = SB_BOUND;
12737                         break;
12738                     case 'w':
12739                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12740                             goto bad_bound_type;
12741                         }
12742                         FLAGS(ret) = WB_BOUND;
12743                         break;
12744                     default:
12745                       bad_bound_type:
12746                         RExC_parse = endbrace;
12747                         vFAIL2utf8f(
12748                             "'%" UTF8f "' is an unknown bound type",
12749                             UTF8fARG(UTF, length, endbrace - length));
12750                         NOT_REACHED; /*NOTREACHED*/
12751                 }
12752                 RExC_parse = endbrace;
12753                 REQUIRE_UNI_RULES(flagp, NULL);
12754
12755                 if (PASS2 && op >= BOUNDA) {  /* /aa is same as /a */
12756                     OP(ret) = BOUNDU;
12757                     length += 4;
12758
12759                     /* Don't have to worry about UTF-8, in this message because
12760                      * to get here the contents of the \b must be ASCII */
12761                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
12762                               "Using /u for '%.*s' instead of /%s",
12763                               (unsigned) length,
12764                               endbrace - length + 1,
12765                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
12766                               ? ASCII_RESTRICT_PAT_MODS
12767                               : ASCII_MORE_RESTRICT_PAT_MODS);
12768                 }
12769             }
12770
12771             if (PASS2 && invert) {
12772                 OP(ret) += NBOUND - BOUND;
12773             }
12774             goto finish_meta_pat;
12775           }
12776
12777         case 'D':
12778             invert = 1;
12779             /* FALLTHROUGH */
12780         case 'd':
12781             arg = ANYOF_DIGIT;
12782             if (! DEPENDS_SEMANTICS) {
12783                 goto join_posix;
12784             }
12785
12786             /* \d doesn't have any matches in the upper Latin1 range, hence /d
12787              * is equivalent to /u.  Changing to /u saves some branches at
12788              * runtime */
12789             op = POSIXU;
12790             goto join_posix_op_known;
12791
12792         case 'R':
12793             ret = reg_node(pRExC_state, LNBREAK);
12794             *flagp |= HASWIDTH|SIMPLE;
12795             goto finish_meta_pat;
12796
12797         case 'H':
12798             invert = 1;
12799             /* FALLTHROUGH */
12800         case 'h':
12801             arg = ANYOF_BLANK;
12802             op = POSIXU;
12803             goto join_posix_op_known;
12804
12805         case 'V':
12806             invert = 1;
12807             /* FALLTHROUGH */
12808         case 'v':
12809             arg = ANYOF_VERTWS;
12810             op = POSIXU;
12811             goto join_posix_op_known;
12812
12813         case 'S':
12814             invert = 1;
12815             /* FALLTHROUGH */
12816         case 's':
12817             arg = ANYOF_SPACE;
12818
12819           join_posix:
12820
12821             op = POSIXD + get_regex_charset(RExC_flags);
12822             if (op > POSIXA) {  /* /aa is same as /a */
12823                 op = POSIXA;
12824             }
12825             else if (op == POSIXL) {
12826                 RExC_contains_locale = 1;
12827             }
12828
12829           join_posix_op_known:
12830
12831             if (invert) {
12832                 op += NPOSIXD - POSIXD;
12833             }
12834
12835             ret = reg_node(pRExC_state, op);
12836             if (! SIZE_ONLY) {
12837                 FLAGS(ret) = namedclass_to_classnum(arg);
12838             }
12839
12840             *flagp |= HASWIDTH|SIMPLE;
12841             /* FALLTHROUGH */
12842
12843           finish_meta_pat:
12844             nextchar(pRExC_state);
12845             Set_Node_Length(ret, 2); /* MJD */
12846             break;
12847         case 'p':
12848         case 'P':
12849             RExC_parse--;
12850
12851             ret = regclass(pRExC_state, flagp,depth+1,
12852                            TRUE, /* means just parse this element */
12853                            FALSE, /* don't allow multi-char folds */
12854                            FALSE, /* don't silence non-portable warnings.  It
12855                                      would be a bug if these returned
12856                                      non-portables */
12857                            (bool) RExC_strict,
12858                            TRUE, /* Allow an optimized regnode result */
12859                            NULL,
12860                            NULL);
12861             if (*flagp & RESTART_PASS1)
12862                 return NULL;
12863             /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
12864              * multi-char folds are allowed.  */
12865             if (!ret)
12866                 FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf,
12867                       (UV) *flagp);
12868
12869             RExC_parse--;
12870
12871             Set_Node_Offset(ret, parse_start);
12872             Set_Node_Cur_Length(ret, parse_start - 2);
12873             nextchar(pRExC_state);
12874             break;
12875         case 'N':
12876             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12877              * \N{...} evaluates to a sequence of more than one code points).
12878              * The function call below returns a regnode, which is our result.
12879              * The parameters cause it to fail if the \N{} evaluates to a
12880              * single code point; we handle those like any other literal.  The
12881              * reason that the multicharacter case is handled here and not as
12882              * part of the EXACtish code is because of quantifiers.  In
12883              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12884              * this way makes that Just Happen. dmq.
12885              * join_exact() will join this up with adjacent EXACTish nodes
12886              * later on, if appropriate. */
12887             ++RExC_parse;
12888             if (grok_bslash_N(pRExC_state,
12889                               &ret,     /* Want a regnode returned */
12890                               NULL,     /* Fail if evaluates to a single code
12891                                            point */
12892                               NULL,     /* Don't need a count of how many code
12893                                            points */
12894                               flagp,
12895                               RExC_strict,
12896                               depth)
12897             ) {
12898                 break;
12899             }
12900
12901             if (*flagp & RESTART_PASS1)
12902                 return NULL;
12903
12904             /* Here, evaluates to a single code point.  Go get that */
12905             RExC_parse = parse_start;
12906             goto defchar;
12907
12908         case 'k':    /* Handle \k<NAME> and \k'NAME' */
12909       parse_named_seq:
12910         {
12911             char ch;
12912             if (   RExC_parse >= RExC_end - 1
12913                 || ((   ch = RExC_parse[1]) != '<'
12914                                       && ch != '\''
12915                                       && ch != '{'))
12916             {
12917                 RExC_parse++;
12918                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12919                 vFAIL2("Sequence %.2s... not terminated",parse_start);
12920             } else {
12921                 RExC_parse += 2;
12922                 ret = handle_named_backref(pRExC_state,
12923                                            flagp,
12924                                            parse_start,
12925                                            (ch == '<')
12926                                            ? '>'
12927                                            : (ch == '{')
12928                                              ? '}'
12929                                              : '\'');
12930             }
12931             break;
12932         }
12933         case 'g':
12934         case '1': case '2': case '3': case '4':
12935         case '5': case '6': case '7': case '8': case '9':
12936             {
12937                 I32 num;
12938                 bool hasbrace = 0;
12939
12940                 if (*RExC_parse == 'g') {
12941                     bool isrel = 0;
12942
12943                     RExC_parse++;
12944                     if (*RExC_parse == '{') {
12945                         RExC_parse++;
12946                         hasbrace = 1;
12947                     }
12948                     if (*RExC_parse == '-') {
12949                         RExC_parse++;
12950                         isrel = 1;
12951                     }
12952                     if (hasbrace && !isDIGIT(*RExC_parse)) {
12953                         if (isrel) RExC_parse--;
12954                         RExC_parse -= 2;
12955                         goto parse_named_seq;
12956                     }
12957
12958                     if (RExC_parse >= RExC_end) {
12959                         goto unterminated_g;
12960                     }
12961                     num = S_backref_value(RExC_parse);
12962                     if (num == 0)
12963                         vFAIL("Reference to invalid group 0");
12964                     else if (num == I32_MAX) {
12965                          if (isDIGIT(*RExC_parse))
12966                             vFAIL("Reference to nonexistent group");
12967                         else
12968                           unterminated_g:
12969                             vFAIL("Unterminated \\g... pattern");
12970                     }
12971
12972                     if (isrel) {
12973                         num = RExC_npar - num;
12974                         if (num < 1)
12975                             vFAIL("Reference to nonexistent or unclosed group");
12976                     }
12977                 }
12978                 else {
12979                     num = S_backref_value(RExC_parse);
12980                     /* bare \NNN might be backref or octal - if it is larger
12981                      * than or equal RExC_npar then it is assumed to be an
12982                      * octal escape. Note RExC_npar is +1 from the actual
12983                      * number of parens. */
12984                     /* Note we do NOT check if num == I32_MAX here, as that is
12985                      * handled by the RExC_npar check */
12986
12987                     if (
12988                         /* any numeric escape < 10 is always a backref */
12989                         num > 9
12990                         /* any numeric escape < RExC_npar is a backref */
12991                         && num >= RExC_npar
12992                         /* cannot be an octal escape if it starts with 8 */
12993                         && *RExC_parse != '8'
12994                         /* cannot be an octal escape it it starts with 9 */
12995                         && *RExC_parse != '9'
12996                     )
12997                     {
12998                         /* Probably not a backref, instead likely to be an
12999                          * octal character escape, e.g. \35 or \777.
13000                          * The above logic should make it obvious why using
13001                          * octal escapes in patterns is problematic. - Yves */
13002                         RExC_parse = parse_start;
13003                         goto defchar;
13004                     }
13005                 }
13006
13007                 /* At this point RExC_parse points at a numeric escape like
13008                  * \12 or \88 or something similar, which we should NOT treat
13009                  * as an octal escape. It may or may not be a valid backref
13010                  * escape. For instance \88888888 is unlikely to be a valid
13011                  * backref. */
13012                 while (isDIGIT(*RExC_parse))
13013                     RExC_parse++;
13014                 if (hasbrace) {
13015                     if (*RExC_parse != '}')
13016                         vFAIL("Unterminated \\g{...} pattern");
13017                     RExC_parse++;
13018                 }
13019                 if (!SIZE_ONLY) {
13020                     if (num > (I32)RExC_rx->nparens)
13021                         vFAIL("Reference to nonexistent group");
13022                 }
13023                 RExC_sawback = 1;
13024                 ret = reganode(pRExC_state,
13025                                ((! FOLD)
13026                                  ? REF
13027                                  : (ASCII_FOLD_RESTRICTED)
13028                                    ? REFFA
13029                                    : (AT_LEAST_UNI_SEMANTICS)
13030                                      ? REFFU
13031                                      : (LOC)
13032                                        ? REFFL
13033                                        : REFF),
13034                                 num);
13035                 *flagp |= HASWIDTH;
13036
13037                 /* override incorrect value set in reganode MJD */
13038                 Set_Node_Offset(ret, parse_start);
13039                 Set_Node_Cur_Length(ret, parse_start-1);
13040                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13041                                         FALSE /* Don't force to /x */ );
13042             }
13043             break;
13044         case '\0':
13045             if (RExC_parse >= RExC_end)
13046                 FAIL("Trailing \\");
13047             /* FALLTHROUGH */
13048         default:
13049             /* Do not generate "unrecognized" warnings here, we fall
13050                back into the quick-grab loop below */
13051             RExC_parse = parse_start;
13052             goto defchar;
13053         } /* end of switch on a \foo sequence */
13054         break;
13055
13056     case '#':
13057
13058         /* '#' comments should have been spaced over before this function was
13059          * called */
13060         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13061         /*
13062         if (RExC_flags & RXf_PMf_EXTENDED) {
13063             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13064             if (RExC_parse < RExC_end)
13065                 goto tryagain;
13066         }
13067         */
13068
13069         /* FALLTHROUGH */
13070
13071     default:
13072           defchar: {
13073
13074             /* Here, we have determined that the next thing is probably a
13075              * literal character.  RExC_parse points to the first byte of its
13076              * definition.  (It still may be an escape sequence that evaluates
13077              * to a single character) */
13078
13079             STRLEN len = 0;
13080             UV ender = 0;
13081             char *p;
13082             char *s;
13083 #define MAX_NODE_STRING_SIZE 127
13084             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
13085             char *s0;
13086             U8 upper_parse = MAX_NODE_STRING_SIZE;
13087             U8 node_type = compute_EXACTish(pRExC_state);
13088             bool next_is_quantifier;
13089             char * oldp = NULL;
13090
13091             /* We can convert EXACTF nodes to EXACTFU if they contain only
13092              * characters that match identically regardless of the target
13093              * string's UTF8ness.  The reason to do this is that EXACTF is not
13094              * trie-able, EXACTFU is.
13095              *
13096              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13097              * contain only above-Latin1 characters (hence must be in UTF8),
13098              * which don't participate in folds with Latin1-range characters,
13099              * as the latter's folds aren't known until runtime.  (We don't
13100              * need to figure this out until pass 2) */
13101             bool maybe_exactfu = PASS2
13102                                && (node_type == EXACTF || node_type == EXACTFL);
13103
13104             /* If a folding node contains only code points that don't
13105              * participate in folds, it can be changed into an EXACT node,
13106              * which allows the optimizer more things to look for */
13107             bool maybe_exact;
13108
13109             ret = reg_node(pRExC_state, node_type);
13110
13111             /* In pass1, folded, we use a temporary buffer instead of the
13112              * actual node, as the node doesn't exist yet */
13113             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
13114
13115             s0 = s;
13116
13117           reparse:
13118
13119             /* We look for the EXACTFish to EXACT node optimizaton only if
13120              * folding.  (And we don't need to figure this out until pass 2).
13121              * XXX It might actually make sense to split the node into portions
13122              * that are exact and ones that aren't, so that we could later use
13123              * the exact ones to find the longest fixed and floating strings.
13124              * One would want to join them back into a larger node.  One could
13125              * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
13126             maybe_exact = FOLD && PASS2;
13127
13128             /* XXX The node can hold up to 255 bytes, yet this only goes to
13129              * 127.  I (khw) do not know why.  Keeping it somewhat less than
13130              * 255 allows us to not have to worry about overflow due to
13131              * converting to utf8 and fold expansion, but that value is
13132              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
13133              * split up by this limit into a single one using the real max of
13134              * 255.  Even at 127, this breaks under rare circumstances.  If
13135              * folding, we do not want to split a node at a character that is a
13136              * non-final in a multi-char fold, as an input string could just
13137              * happen to want to match across the node boundary.  The join
13138              * would solve that problem if the join actually happens.  But a
13139              * series of more than two nodes in a row each of 127 would cause
13140              * the first join to succeed to get to 254, but then there wouldn't
13141              * be room for the next one, which could at be one of those split
13142              * multi-char folds.  I don't know of any fool-proof solution.  One
13143              * could back off to end with only a code point that isn't such a
13144              * non-final, but it is possible for there not to be any in the
13145              * entire node. */
13146
13147             assert(   ! UTF     /* Is at the beginning of a character */
13148                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13149                    || UTF8_IS_START(UCHARAT(RExC_parse)));
13150
13151             /* Here, we have a literal character.  Find the maximal string of
13152              * them in the input that we can fit into a single EXACTish node.
13153              * We quit at the first non-literal or when the node gets full */
13154             for (p = RExC_parse;
13155                  len < upper_parse && p < RExC_end;
13156                  len++)
13157             {
13158                 oldp = p;
13159
13160                 /* White space has already been ignored */
13161                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
13162                        || ! is_PATWS_safe((p), RExC_end, UTF));
13163
13164                 switch ((U8)*p) {
13165                 case '^':
13166                 case '$':
13167                 case '.':
13168                 case '[':
13169                 case '(':
13170                 case ')':
13171                 case '|':
13172                     goto loopdone;
13173                 case '\\':
13174                     /* Literal Escapes Switch
13175
13176                        This switch is meant to handle escape sequences that
13177                        resolve to a literal character.
13178
13179                        Every escape sequence that represents something
13180                        else, like an assertion or a char class, is handled
13181                        in the switch marked 'Special Escapes' above in this
13182                        routine, but also has an entry here as anything that
13183                        isn't explicitly mentioned here will be treated as
13184                        an unescaped equivalent literal.
13185                     */
13186
13187                     switch ((U8)*++p) {
13188                     /* These are all the special escapes. */
13189                     case 'A':             /* Start assertion */
13190                     case 'b': case 'B':   /* Word-boundary assertion*/
13191                     case 'C':             /* Single char !DANGEROUS! */
13192                     case 'd': case 'D':   /* digit class */
13193                     case 'g': case 'G':   /* generic-backref, pos assertion */
13194                     case 'h': case 'H':   /* HORIZWS */
13195                     case 'k': case 'K':   /* named backref, keep marker */
13196                     case 'p': case 'P':   /* Unicode property */
13197                               case 'R':   /* LNBREAK */
13198                     case 's': case 'S':   /* space class */
13199                     case 'v': case 'V':   /* VERTWS */
13200                     case 'w': case 'W':   /* word class */
13201                     case 'X':             /* eXtended Unicode "combining
13202                                              character sequence" */
13203                     case 'z': case 'Z':   /* End of line/string assertion */
13204                         --p;
13205                         goto loopdone;
13206
13207                     /* Anything after here is an escape that resolves to a
13208                        literal. (Except digits, which may or may not)
13209                      */
13210                     case 'n':
13211                         ender = '\n';
13212                         p++;
13213                         break;
13214                     case 'N': /* Handle a single-code point named character. */
13215                         RExC_parse = p + 1;
13216                         if (! grok_bslash_N(pRExC_state,
13217                                             NULL,   /* Fail if evaluates to
13218                                                        anything other than a
13219                                                        single code point */
13220                                             &ender, /* The returned single code
13221                                                        point */
13222                                             NULL,   /* Don't need a count of
13223                                                        how many code points */
13224                                             flagp,
13225                                             RExC_strict,
13226                                             depth)
13227                         ) {
13228                             if (*flagp & NEED_UTF8)
13229                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
13230                             if (*flagp & RESTART_PASS1)
13231                                 return NULL;
13232
13233                             /* Here, it wasn't a single code point.  Go close
13234                              * up this EXACTish node.  The switch() prior to
13235                              * this switch handles the other cases */
13236                             RExC_parse = p = oldp;
13237                             goto loopdone;
13238                         }
13239                         p = RExC_parse;
13240                         if (ender > 0xff) {
13241                             REQUIRE_UTF8(flagp);
13242                         }
13243                         break;
13244                     case 'r':
13245                         ender = '\r';
13246                         p++;
13247                         break;
13248                     case 't':
13249                         ender = '\t';
13250                         p++;
13251                         break;
13252                     case 'f':
13253                         ender = '\f';
13254                         p++;
13255                         break;
13256                     case 'e':
13257                         ender = ESC_NATIVE;
13258                         p++;
13259                         break;
13260                     case 'a':
13261                         ender = '\a';
13262                         p++;
13263                         break;
13264                     case 'o':
13265                         {
13266                             UV result;
13267                             const char* error_msg;
13268
13269                             bool valid = grok_bslash_o(&p,
13270                                                        &result,
13271                                                        &error_msg,
13272                                                        PASS2, /* out warnings */
13273                                                        (bool) RExC_strict,
13274                                                        TRUE, /* Output warnings
13275                                                                 for non-
13276                                                                 portables */
13277                                                        UTF);
13278                             if (! valid) {
13279                                 RExC_parse = p; /* going to die anyway; point
13280                                                    to exact spot of failure */
13281                                 vFAIL(error_msg);
13282                             }
13283                             ender = result;
13284                             if (ender > 0xff) {
13285                                 REQUIRE_UTF8(flagp);
13286                             }
13287                             break;
13288                         }
13289                     case 'x':
13290                         {
13291                             UV result = UV_MAX; /* initialize to erroneous
13292                                                    value */
13293                             const char* error_msg;
13294
13295                             bool valid = grok_bslash_x(&p,
13296                                                        &result,
13297                                                        &error_msg,
13298                                                        PASS2, /* out warnings */
13299                                                        (bool) RExC_strict,
13300                                                        TRUE, /* Silence warnings
13301                                                                 for non-
13302                                                                 portables */
13303                                                        UTF);
13304                             if (! valid) {
13305                                 RExC_parse = p; /* going to die anyway; point
13306                                                    to exact spot of failure */
13307                                 vFAIL(error_msg);
13308                             }
13309                             ender = result;
13310
13311                             if (ender < 0x100) {
13312 #ifdef EBCDIC
13313                                 if (RExC_recode_x_to_native) {
13314                                     ender = LATIN1_TO_NATIVE(ender);
13315                                 }
13316 #endif
13317                             }
13318                             else {
13319                                 REQUIRE_UTF8(flagp);
13320                             }
13321                             break;
13322                         }
13323                     case 'c':
13324                         p++;
13325                         ender = grok_bslash_c(*p++, PASS2);
13326                         break;
13327                     case '8': case '9': /* must be a backreference */
13328                         --p;
13329                         /* we have an escape like \8 which cannot be an octal escape
13330                          * so we exit the loop, and let the outer loop handle this
13331                          * escape which may or may not be a legitimate backref. */
13332                         goto loopdone;
13333                     case '1': case '2': case '3':case '4':
13334                     case '5': case '6': case '7':
13335                         /* When we parse backslash escapes there is ambiguity
13336                          * between backreferences and octal escapes. Any escape
13337                          * from \1 - \9 is a backreference, any multi-digit
13338                          * escape which does not start with 0 and which when
13339                          * evaluated as decimal could refer to an already
13340                          * parsed capture buffer is a back reference. Anything
13341                          * else is octal.
13342                          *
13343                          * Note this implies that \118 could be interpreted as
13344                          * 118 OR as "\11" . "8" depending on whether there
13345                          * were 118 capture buffers defined already in the
13346                          * pattern.  */
13347
13348                         /* NOTE, RExC_npar is 1 more than the actual number of
13349                          * parens we have seen so far, hence the < RExC_npar below. */
13350
13351                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
13352                         {  /* Not to be treated as an octal constant, go
13353                                    find backref */
13354                             --p;
13355                             goto loopdone;
13356                         }
13357                         /* FALLTHROUGH */
13358                     case '0':
13359                         {
13360                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13361                             STRLEN numlen = 3;
13362                             ender = grok_oct(p, &numlen, &flags, NULL);
13363                             if (ender > 0xff) {
13364                                 REQUIRE_UTF8(flagp);
13365                             }
13366                             p += numlen;
13367                             if (PASS2   /* like \08, \178 */
13368                                 && numlen < 3
13369                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
13370                             {
13371                                 reg_warn_non_literal_string(
13372                                          p + 1,
13373                                          form_short_octal_warning(p, numlen));
13374                             }
13375                         }
13376                         break;
13377                     case '\0':
13378                         if (p >= RExC_end)
13379                             FAIL("Trailing \\");
13380                         /* FALLTHROUGH */
13381                     default:
13382                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
13383                             /* Include any left brace following the alpha to emphasize
13384                              * that it could be part of an escape at some point
13385                              * in the future */
13386                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
13387                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
13388                         }
13389                         goto normal_default;
13390                     } /* End of switch on '\' */
13391                     break;
13392                 case '{':
13393                     /* Currently we don't care if the lbrace is at the start
13394                      * of a construct.  This catches it in the middle of a
13395                      * literal string, or when it's the first thing after
13396                      * something like "\b" */
13397                     if (len || (p > RExC_start && isALPHA_A(*(p -1)))) {
13398                         RExC_parse = p + 1;
13399                         vFAIL("Unescaped left brace in regex is illegal here");
13400                     }
13401                     goto normal_default;
13402                 case '}':
13403                 case ']':
13404                     if (PASS2 && p > RExC_parse && RExC_strict) {
13405                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
13406                     }
13407                     /*FALLTHROUGH*/
13408                 default:    /* A literal character */
13409                   normal_default:
13410                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
13411                         STRLEN numlen;
13412                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
13413                                                &numlen, UTF8_ALLOW_DEFAULT);
13414                         p += numlen;
13415                     }
13416                     else
13417                         ender = (U8) *p++;
13418                     break;
13419                 } /* End of switch on the literal */
13420
13421                 /* Here, have looked at the literal character and <ender>
13422                  * contains its ordinal, <p> points to the character after it.
13423                  * We need to check if the next non-ignored thing is a
13424                  * quantifier.  Move <p> to after anything that should be
13425                  * ignored, which, as a side effect, positions <p> for the next
13426                  * loop iteration */
13427                 skip_to_be_ignored_text(pRExC_state, &p,
13428                                         FALSE /* Don't force to /x */ );
13429
13430                 /* If the next thing is a quantifier, it applies to this
13431                  * character only, which means that this character has to be in
13432                  * its own node and can't just be appended to the string in an
13433                  * existing node, so if there are already other characters in
13434                  * the node, close the node with just them, and set up to do
13435                  * this character again next time through, when it will be the
13436                  * only thing in its new node */
13437
13438                 if ((next_is_quantifier = (   LIKELY(p < RExC_end)
13439                                            && UNLIKELY(ISMULT2(p))))
13440                     && LIKELY(len))
13441                 {
13442                     p = oldp;
13443                     goto loopdone;
13444                 }
13445
13446                 /* Ready to add 'ender' to the node */
13447
13448                 if (! FOLD) {  /* The simple case, just append the literal */
13449
13450                     /* In the sizing pass, we need only the size of the
13451                      * character we are appending, hence we can delay getting
13452                      * its representation until PASS2. */
13453                     if (SIZE_ONLY) {
13454                         if (UTF) {
13455                             const STRLEN unilen = UVCHR_SKIP(ender);
13456                             s += unilen;
13457
13458                             /* We have to subtract 1 just below (and again in
13459                              * the corresponding PASS2 code) because the loop
13460                              * increments <len> each time, as all but this path
13461                              * (and one other) through it add a single byte to
13462                              * the EXACTish node.  But these paths would change
13463                              * len to be the correct final value, so cancel out
13464                              * the increment that follows */
13465                             len += unilen - 1;
13466                         }
13467                         else {
13468                             s++;
13469                         }
13470                     } else { /* PASS2 */
13471                       not_fold_common:
13472                         if (UTF) {
13473                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
13474                             len += (char *) new_s - s - 1;
13475                             s = (char *) new_s;
13476                         }
13477                         else {
13478                             *(s++) = (char) ender;
13479                         }
13480                     }
13481                 }
13482                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
13483
13484                     /* Here are folding under /l, and the code point is
13485                      * problematic.  First, we know we can't simplify things */
13486                     maybe_exact = FALSE;
13487                     maybe_exactfu = FALSE;
13488
13489                     /* A problematic code point in this context means that its
13490                      * fold isn't known until runtime, so we can't fold it now.
13491                      * (The non-problematic code points are the above-Latin1
13492                      * ones that fold to also all above-Latin1.  Their folds
13493                      * don't vary no matter what the locale is.) But here we
13494                      * have characters whose fold depends on the locale.
13495                      * Unlike the non-folding case above, we have to keep track
13496                      * of these in the sizing pass, so that we can make sure we
13497                      * don't split too-long nodes in the middle of a potential
13498                      * multi-char fold.  And unlike the regular fold case
13499                      * handled in the else clauses below, we don't actually
13500                      * fold and don't have special cases to consider.  What we
13501                      * do for both passes is the PASS2 code for non-folding */
13502                     goto not_fold_common;
13503                 }
13504                 else /* A regular FOLD code point */
13505                     if (! (   UTF
13506 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
13507    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
13508                                       || UNICODE_DOT_DOT_VERSION > 0)
13509                             /* See comments for join_exact() as to why we fold
13510                              * this non-UTF at compile time */
13511                             || (   node_type == EXACTFU
13512                                 && ender == LATIN_SMALL_LETTER_SHARP_S)
13513 #endif
13514                 )) {
13515                     /* Here, are folding and are not UTF-8 encoded; therefore
13516                      * the character must be in the range 0-255, and is not /l
13517                      * (Not /l because we already handled these under /l in
13518                      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
13519                     if (IS_IN_SOME_FOLD_L1(ender)) {
13520                         maybe_exact = FALSE;
13521
13522                         /* See if the character's fold differs between /d and
13523                          * /u.  This includes the multi-char fold SHARP S to
13524                          * 'ss' */
13525                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
13526                             RExC_seen_unfolded_sharp_s = 1;
13527                             maybe_exactfu = FALSE;
13528                         }
13529                         else if (maybe_exactfu
13530                             && (PL_fold[ender] != PL_fold_latin1[ender]
13531 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
13532    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
13533                                       || UNICODE_DOT_DOT_VERSION > 0)
13534                                 || (   len > 0
13535                                     && isALPHA_FOLD_EQ(ender, 's')
13536                                     && isALPHA_FOLD_EQ(*(s-1), 's'))
13537 #endif
13538                         )) {
13539                             maybe_exactfu = FALSE;
13540                         }
13541                     }
13542
13543                     /* Even when folding, we store just the input character, as
13544                      * we have an array that finds its fold quickly */
13545                     *(s++) = (char) ender;
13546                 }
13547                 else {  /* FOLD, and UTF (or sharp s) */
13548                     /* Unlike the non-fold case, we do actually have to
13549                      * calculate the results here in pass 1.  This is for two
13550                      * reasons, the folded length may be longer than the
13551                      * unfolded, and we have to calculate how many EXACTish
13552                      * nodes it will take; and we may run out of room in a node
13553                      * in the middle of a potential multi-char fold, and have
13554                      * to back off accordingly.  */
13555
13556                     UV folded;
13557                     if (isASCII_uni(ender)) {
13558                         folded = toFOLD(ender);
13559                         *(s)++ = (U8) folded;
13560                     }
13561                     else {
13562                         STRLEN foldlen;
13563
13564                         folded = _to_uni_fold_flags(
13565                                      ender,
13566                                      (U8 *) s,
13567                                      &foldlen,
13568                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
13569                                                         ? FOLD_FLAGS_NOMIX_ASCII
13570                                                         : 0));
13571                         s += foldlen;
13572
13573                         /* The loop increments <len> each time, as all but this
13574                          * path (and one other) through it add a single byte to
13575                          * the EXACTish node.  But this one has changed len to
13576                          * be the correct final value, so subtract one to
13577                          * cancel out the increment that follows */
13578                         len += foldlen - 1;
13579                     }
13580                     /* If this node only contains non-folding code points so
13581                      * far, see if this new one is also non-folding */
13582                     if (maybe_exact) {
13583                         if (folded != ender) {
13584                             maybe_exact = FALSE;
13585                         }
13586                         else {
13587                             /* Here the fold is the original; we have to check
13588                              * further to see if anything folds to it */
13589                             if (_invlist_contains_cp(PL_utf8_foldable,
13590                                                         ender))
13591                             {
13592                                 maybe_exact = FALSE;
13593                             }
13594                         }
13595                     }
13596                     ender = folded;
13597                 }
13598
13599                 if (next_is_quantifier) {
13600
13601                     /* Here, the next input is a quantifier, and to get here,
13602                      * the current character is the only one in the node.
13603                      * Also, here <len> doesn't include the final byte for this
13604                      * character */
13605                     len++;
13606                     goto loopdone;
13607                 }
13608
13609             } /* End of loop through literal characters */
13610
13611             /* Here we have either exhausted the input or ran out of room in
13612              * the node.  (If we encountered a character that can't be in the
13613              * node, transfer is made directly to <loopdone>, and so we
13614              * wouldn't have fallen off the end of the loop.)  In the latter
13615              * case, we artificially have to split the node into two, because
13616              * we just don't have enough space to hold everything.  This
13617              * creates a problem if the final character participates in a
13618              * multi-character fold in the non-final position, as a match that
13619              * should have occurred won't, due to the way nodes are matched,
13620              * and our artificial boundary.  So back off until we find a non-
13621              * problematic character -- one that isn't at the beginning or
13622              * middle of such a fold.  (Either it doesn't participate in any
13623              * folds, or appears only in the final position of all the folds it
13624              * does participate in.)  A better solution with far fewer false
13625              * positives, and that would fill the nodes more completely, would
13626              * be to actually have available all the multi-character folds to
13627              * test against, and to back-off only far enough to be sure that
13628              * this node isn't ending with a partial one.  <upper_parse> is set
13629              * further below (if we need to reparse the node) to include just
13630              * up through that final non-problematic character that this code
13631              * identifies, so when it is set to less than the full node, we can
13632              * skip the rest of this */
13633             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
13634
13635                 const STRLEN full_len = len;
13636
13637                 assert(len >= MAX_NODE_STRING_SIZE);
13638
13639                 /* Here, <s> points to the final byte of the final character.
13640                  * Look backwards through the string until find a non-
13641                  * problematic character */
13642
13643                 if (! UTF) {
13644
13645                     /* This has no multi-char folds to non-UTF characters */
13646                     if (ASCII_FOLD_RESTRICTED) {
13647                         goto loopdone;
13648                     }
13649
13650                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
13651                     len = s - s0 + 1;
13652                 }
13653                 else {
13654                     if (!  PL_NonL1NonFinalFold) {
13655                         PL_NonL1NonFinalFold = _new_invlist_C_array(
13656                                         NonL1_Perl_Non_Final_Folds_invlist);
13657                     }
13658
13659                     /* Point to the first byte of the final character */
13660                     s = (char *) utf8_hop((U8 *) s, -1);
13661
13662                     while (s >= s0) {   /* Search backwards until find
13663                                            non-problematic char */
13664                         if (UTF8_IS_INVARIANT(*s)) {
13665
13666                             /* There are no ascii characters that participate
13667                              * in multi-char folds under /aa.  In EBCDIC, the
13668                              * non-ascii invariants are all control characters,
13669                              * so don't ever participate in any folds. */
13670                             if (ASCII_FOLD_RESTRICTED
13671                                 || ! IS_NON_FINAL_FOLD(*s))
13672                             {
13673                                 break;
13674                             }
13675                         }
13676                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
13677                             if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
13678                                                                   *s, *(s+1))))
13679                             {
13680                                 break;
13681                             }
13682                         }
13683                         else if (! _invlist_contains_cp(
13684                                         PL_NonL1NonFinalFold,
13685                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
13686                         {
13687                             break;
13688                         }
13689
13690                         /* Here, the current character is problematic in that
13691                          * it does occur in the non-final position of some
13692                          * fold, so try the character before it, but have to
13693                          * special case the very first byte in the string, so
13694                          * we don't read outside the string */
13695                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
13696                     } /* End of loop backwards through the string */
13697
13698                     /* If there were only problematic characters in the string,
13699                      * <s> will point to before s0, in which case the length
13700                      * should be 0, otherwise include the length of the
13701                      * non-problematic character just found */
13702                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
13703                 }
13704
13705                 /* Here, have found the final character, if any, that is
13706                  * non-problematic as far as ending the node without splitting
13707                  * it across a potential multi-char fold.  <len> contains the
13708                  * number of bytes in the node up-to and including that
13709                  * character, or is 0 if there is no such character, meaning
13710                  * the whole node contains only problematic characters.  In
13711                  * this case, give up and just take the node as-is.  We can't
13712                  * do any better */
13713                 if (len == 0) {
13714                     len = full_len;
13715
13716                     /* If the node ends in an 's' we make sure it stays EXACTF,
13717                      * as if it turns into an EXACTFU, it could later get
13718                      * joined with another 's' that would then wrongly match
13719                      * the sharp s */
13720                     if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
13721                     {
13722                         maybe_exactfu = FALSE;
13723                     }
13724                 } else {
13725
13726                     /* Here, the node does contain some characters that aren't
13727                      * problematic.  If one such is the final character in the
13728                      * node, we are done */
13729                     if (len == full_len) {
13730                         goto loopdone;
13731                     }
13732                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
13733
13734                         /* If the final character is problematic, but the
13735                          * penultimate is not, back-off that last character to
13736                          * later start a new node with it */
13737                         p = oldp;
13738                         goto loopdone;
13739                     }
13740
13741                     /* Here, the final non-problematic character is earlier
13742                      * in the input than the penultimate character.  What we do
13743                      * is reparse from the beginning, going up only as far as
13744                      * this final ok one, thus guaranteeing that the node ends
13745                      * in an acceptable character.  The reason we reparse is
13746                      * that we know how far in the character is, but we don't
13747                      * know how to correlate its position with the input parse.
13748                      * An alternate implementation would be to build that
13749                      * correlation as we go along during the original parse,
13750                      * but that would entail extra work for every node, whereas
13751                      * this code gets executed only when the string is too
13752                      * large for the node, and the final two characters are
13753                      * problematic, an infrequent occurrence.  Yet another
13754                      * possible strategy would be to save the tail of the
13755                      * string, and the next time regatom is called, initialize
13756                      * with that.  The problem with this is that unless you
13757                      * back off one more character, you won't be guaranteed
13758                      * regatom will get called again, unless regbranch,
13759                      * regpiece ... are also changed.  If you do back off that
13760                      * extra character, so that there is input guaranteed to
13761                      * force calling regatom, you can't handle the case where
13762                      * just the first character in the node is acceptable.  I
13763                      * (khw) decided to try this method which doesn't have that
13764                      * pitfall; if performance issues are found, we can do a
13765                      * combination of the current approach plus that one */
13766                     upper_parse = len;
13767                     len = 0;
13768                     s = s0;
13769                     goto reparse;
13770                 }
13771             }   /* End of verifying node ends with an appropriate char */
13772
13773           loopdone:   /* Jumped to when encounters something that shouldn't be
13774                          in the node */
13775
13776             /* I (khw) don't know if you can get here with zero length, but the
13777              * old code handled this situation by creating a zero-length EXACT
13778              * node.  Might as well be NOTHING instead */
13779             if (len == 0) {
13780                 OP(ret) = NOTHING;
13781             }
13782             else {
13783                 if (FOLD) {
13784                     /* If 'maybe_exact' is still set here, means there are no
13785                      * code points in the node that participate in folds;
13786                      * similarly for 'maybe_exactfu' and code points that match
13787                      * differently depending on UTF8ness of the target string
13788                      * (for /u), or depending on locale for /l */
13789                     if (maybe_exact) {
13790                         OP(ret) = (LOC)
13791                                   ? EXACTL
13792                                   : EXACT;
13793                     }
13794                     else if (maybe_exactfu) {
13795                         OP(ret) = (LOC)
13796                                   ? EXACTFLU8
13797                                   : EXACTFU;
13798                     }
13799                 }
13800                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
13801                                            FALSE /* Don't look to see if could
13802                                                     be turned into an EXACT
13803                                                     node, as we have already
13804                                                     computed that */
13805                                           );
13806             }
13807
13808             RExC_parse = p - 1;
13809             Set_Node_Cur_Length(ret, parse_start);
13810             RExC_parse = p;
13811             {
13812                 /* len is STRLEN which is unsigned, need to copy to signed */
13813                 IV iv = len;
13814                 if (iv < 0)
13815                     vFAIL("Internal disaster");
13816             }
13817
13818         } /* End of label 'defchar:' */
13819         break;
13820     } /* End of giant switch on input character */
13821
13822     /* Position parse to next real character */
13823     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13824                                             FALSE /* Don't force to /x */ );
13825     if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) {
13826         ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here (and will be fatal in Perl 5.30), passed through");
13827     }
13828
13829     return(ret);
13830 }
13831
13832
13833 STATIC void
13834 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
13835 {
13836     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
13837      * sets up the bitmap and any flags, removing those code points from the
13838      * inversion list, setting it to NULL should it become completely empty */
13839
13840     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
13841     assert(PL_regkind[OP(node)] == ANYOF);
13842
13843     ANYOF_BITMAP_ZERO(node);
13844     if (*invlist_ptr) {
13845
13846         /* This gets set if we actually need to modify things */
13847         bool change_invlist = FALSE;
13848
13849         UV start, end;
13850
13851         /* Start looking through *invlist_ptr */
13852         invlist_iterinit(*invlist_ptr);
13853         while (invlist_iternext(*invlist_ptr, &start, &end)) {
13854             UV high;
13855             int i;
13856
13857             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13858                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13859             }
13860
13861             /* Quit if are above what we should change */
13862             if (start >= NUM_ANYOF_CODE_POINTS) {
13863                 break;
13864             }
13865
13866             change_invlist = TRUE;
13867
13868             /* Set all the bits in the range, up to the max that we are doing */
13869             high = (end < NUM_ANYOF_CODE_POINTS - 1)
13870                    ? end
13871                    : NUM_ANYOF_CODE_POINTS - 1;
13872             for (i = start; i <= (int) high; i++) {
13873                 if (! ANYOF_BITMAP_TEST(node, i)) {
13874                     ANYOF_BITMAP_SET(node, i);
13875                 }
13876             }
13877         }
13878         invlist_iterfinish(*invlist_ptr);
13879
13880         /* Done with loop; remove any code points that are in the bitmap from
13881          * *invlist_ptr; similarly for code points above the bitmap if we have
13882          * a flag to match all of them anyways */
13883         if (change_invlist) {
13884             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13885         }
13886         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13887             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13888         }
13889
13890         /* If have completely emptied it, remove it completely */
13891         if (_invlist_len(*invlist_ptr) == 0) {
13892             SvREFCNT_dec_NN(*invlist_ptr);
13893             *invlist_ptr = NULL;
13894         }
13895     }
13896 }
13897
13898 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13899    Character classes ([:foo:]) can also be negated ([:^foo:]).
13900    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13901    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13902    but trigger failures because they are currently unimplemented. */
13903
13904 #define POSIXCC_DONE(c)   ((c) == ':')
13905 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13906 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13907 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
13908
13909 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
13910 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
13911 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
13912
13913 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
13914
13915 /* 'posix_warnings' and 'warn_text' are names of variables in the following
13916  * routine. q.v. */
13917 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
13918         if (posix_warnings) {                                               \
13919             if (! RExC_warn_text ) RExC_warn_text = (AV *) sv_2mortal((SV *) newAV()); \
13920             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                          \
13921                                              WARNING_PREFIX                 \
13922                                              text                           \
13923                                              REPORT_LOCATION,               \
13924                                              REPORT_LOCATION_ARGS(p)));     \
13925         }                                                                   \
13926     } STMT_END
13927
13928 STATIC int
13929 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
13930
13931     const char * const s,      /* Where the putative posix class begins.
13932                                   Normally, this is one past the '['.  This
13933                                   parameter exists so it can be somewhere
13934                                   besides RExC_parse. */
13935     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
13936                                   NULL */
13937     AV ** posix_warnings,      /* Where to place any generated warnings, or
13938                                   NULL */
13939     const bool check_only      /* Don't die if error */
13940 )
13941 {
13942     /* This parses what the caller thinks may be one of the three POSIX
13943      * constructs:
13944      *  1) a character class, like [:blank:]
13945      *  2) a collating symbol, like [. .]
13946      *  3) an equivalence class, like [= =]
13947      * In the latter two cases, it croaks if it finds a syntactically legal
13948      * one, as these are not handled by Perl.
13949      *
13950      * The main purpose is to look for a POSIX character class.  It returns:
13951      *  a) the class number
13952      *      if it is a completely syntactically and semantically legal class.
13953      *      'updated_parse_ptr', if not NULL, is set to point to just after the
13954      *      closing ']' of the class
13955      *  b) OOB_NAMEDCLASS
13956      *      if it appears that one of the three POSIX constructs was meant, but
13957      *      its specification was somehow defective.  'updated_parse_ptr', if
13958      *      not NULL, is set to point to the character just after the end
13959      *      character of the class.  See below for handling of warnings.
13960      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
13961      *      if it  doesn't appear that a POSIX construct was intended.
13962      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
13963      *      raised.
13964      *
13965      * In b) there may be errors or warnings generated.  If 'check_only' is
13966      * TRUE, then any errors are discarded.  Warnings are returned to the
13967      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
13968      * instead it is NULL, warnings are suppressed.  This is done in all
13969      * passes.  The reason for this is that the rest of the parsing is heavily
13970      * dependent on whether this routine found a valid posix class or not.  If
13971      * it did, the closing ']' is absorbed as part of the class.  If no class,
13972      * or an invalid one is found, any ']' will be considered the terminator of
13973      * the outer bracketed character class, leading to very different results.
13974      * In particular, a '(?[ ])' construct will likely have a syntax error if
13975      * the class is parsed other than intended, and this will happen in pass1,
13976      * before the warnings would normally be output.  This mechanism allows the
13977      * caller to output those warnings in pass1 just before dieing, giving a
13978      * much better clue as to what is wrong.
13979      *
13980      * The reason for this function, and its complexity is that a bracketed
13981      * character class can contain just about anything.  But it's easy to
13982      * mistype the very specific posix class syntax but yielding a valid
13983      * regular bracketed class, so it silently gets compiled into something
13984      * quite unintended.
13985      *
13986      * The solution adopted here maintains backward compatibility except that
13987      * it adds a warning if it looks like a posix class was intended but
13988      * improperly specified.  The warning is not raised unless what is input
13989      * very closely resembles one of the 14 legal posix classes.  To do this,
13990      * it uses fuzzy parsing.  It calculates how many single-character edits it
13991      * would take to transform what was input into a legal posix class.  Only
13992      * if that number is quite small does it think that the intention was a
13993      * posix class.  Obviously these are heuristics, and there will be cases
13994      * where it errs on one side or another, and they can be tweaked as
13995      * experience informs.
13996      *
13997      * The syntax for a legal posix class is:
13998      *
13999      * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/
14000      *
14001      * What this routine considers syntactically to be an intended posix class
14002      * is this (the comments indicate some restrictions that the pattern
14003      * doesn't show):
14004      *
14005      *  qr/(?x: \[?                         # The left bracket, possibly
14006      *                                      # omitted
14007      *          \h*                         # possibly followed by blanks
14008      *          (?: \^ \h* )?               # possibly a misplaced caret
14009      *          [:;]?                       # The opening class character,
14010      *                                      # possibly omitted.  A typo
14011      *                                      # semi-colon can also be used.
14012      *          \h*
14013      *          \^?                         # possibly a correctly placed
14014      *                                      # caret, but not if there was also
14015      *                                      # a misplaced one
14016      *          \h*
14017      *          .{3,15}                     # The class name.  If there are
14018      *                                      # deviations from the legal syntax,
14019      *                                      # its edit distance must be close
14020      *                                      # to a real class name in order
14021      *                                      # for it to be considered to be
14022      *                                      # an intended posix class.
14023      *          \h*
14024      *          [:punct:]?                  # The closing class character,
14025      *                                      # possibly omitted.  If not a colon
14026      *                                      # nor semi colon, the class name
14027      *                                      # must be even closer to a valid
14028      *                                      # one
14029      *          \h*
14030      *          \]?                         # The right bracket, possibly
14031      *                                      # omitted.
14032      *     )/
14033      *
14034      * In the above, \h must be ASCII-only.
14035      *
14036      * These are heuristics, and can be tweaked as field experience dictates.
14037      * There will be cases when someone didn't intend to specify a posix class
14038      * that this warns as being so.  The goal is to minimize these, while
14039      * maximizing the catching of things intended to be a posix class that
14040      * aren't parsed as such.
14041      */
14042
14043     const char* p             = s;
14044     const char * const e      = RExC_end;
14045     unsigned complement       = 0;      /* If to complement the class */
14046     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
14047     bool has_opening_bracket  = FALSE;
14048     bool has_opening_colon    = FALSE;
14049     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
14050                                                    valid class */
14051     const char * possible_end = NULL;   /* used for a 2nd parse pass */
14052     const char* name_start;             /* ptr to class name first char */
14053
14054     /* If the number of single-character typos the input name is away from a
14055      * legal name is no more than this number, it is considered to have meant
14056      * the legal name */
14057     int max_distance          = 2;
14058
14059     /* to store the name.  The size determines the maximum length before we
14060      * decide that no posix class was intended.  Should be at least
14061      * sizeof("alphanumeric") */
14062     UV input_text[15];
14063
14064     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
14065
14066     if (posix_warnings && RExC_warn_text)
14067         av_clear(RExC_warn_text);
14068
14069     if (p >= e) {
14070         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14071     }
14072
14073     if (*(p - 1) != '[') {
14074         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
14075         found_problem = TRUE;
14076     }
14077     else {
14078         has_opening_bracket = TRUE;
14079     }
14080
14081     /* They could be confused and think you can put spaces between the
14082      * components */
14083     if (isBLANK(*p)) {
14084         found_problem = TRUE;
14085
14086         do {
14087             p++;
14088         } while (p < e && isBLANK(*p));
14089
14090         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14091     }
14092
14093     /* For [. .] and [= =].  These are quite different internally from [: :],
14094      * so they are handled separately.  */
14095     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
14096                                             and 1 for at least one char in it
14097                                           */
14098     {
14099         const char open_char  = *p;
14100         const char * temp_ptr = p + 1;
14101
14102         /* These two constructs are not handled by perl, and if we find a
14103          * syntactically valid one, we croak.  khw, who wrote this code, finds
14104          * this explanation of them very unclear:
14105          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
14106          * And searching the rest of the internet wasn't very helpful either.
14107          * It looks like just about any byte can be in these constructs,
14108          * depending on the locale.  But unless the pattern is being compiled
14109          * under /l, which is very rare, Perl runs under the C or POSIX locale.
14110          * In that case, it looks like [= =] isn't allowed at all, and that
14111          * [. .] could be any single code point, but for longer strings the
14112          * constituent characters would have to be the ASCII alphabetics plus
14113          * the minus-hyphen.  Any sensible locale definition would limit itself
14114          * to these.  And any portable one definitely should.  Trying to parse
14115          * the general case is a nightmare (see [perl #127604]).  So, this code
14116          * looks only for interiors of these constructs that match:
14117          *      qr/.|[-\w]{2,}/
14118          * Using \w relaxes the apparent rules a little, without adding much
14119          * danger of mistaking something else for one of these constructs.
14120          *
14121          * [. .] in some implementations described on the internet is usable to
14122          * escape a character that otherwise is special in bracketed character
14123          * classes.  For example [.].] means a literal right bracket instead of
14124          * the ending of the class
14125          *
14126          * [= =] can legitimately contain a [. .] construct, but we don't
14127          * handle this case, as that [. .] construct will later get parsed
14128          * itself and croak then.  And [= =] is checked for even when not under
14129          * /l, as Perl has long done so.
14130          *
14131          * The code below relies on there being a trailing NUL, so it doesn't
14132          * have to keep checking if the parse ptr < e.
14133          */
14134         if (temp_ptr[1] == open_char) {
14135             temp_ptr++;
14136         }
14137         else while (    temp_ptr < e
14138                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
14139         {
14140             temp_ptr++;
14141         }
14142
14143         if (*temp_ptr == open_char) {
14144             temp_ptr++;
14145             if (*temp_ptr == ']') {
14146                 temp_ptr++;
14147                 if (! found_problem && ! check_only) {
14148                     RExC_parse = (char *) temp_ptr;
14149                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
14150                             "extensions", open_char, open_char);
14151                 }
14152
14153                 /* Here, the syntax wasn't completely valid, or else the call
14154                  * is to check-only */
14155                 if (updated_parse_ptr) {
14156                     *updated_parse_ptr = (char *) temp_ptr;
14157                 }
14158
14159                 return OOB_NAMEDCLASS;
14160             }
14161         }
14162
14163         /* If we find something that started out to look like one of these
14164          * constructs, but isn't, we continue below so that it can be checked
14165          * for being a class name with a typo of '.' or '=' instead of a colon.
14166          * */
14167     }
14168
14169     /* Here, we think there is a possibility that a [: :] class was meant, and
14170      * we have the first real character.  It could be they think the '^' comes
14171      * first */
14172     if (*p == '^') {
14173         found_problem = TRUE;
14174         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
14175         complement = 1;
14176         p++;
14177
14178         if (isBLANK(*p)) {
14179             found_problem = TRUE;
14180
14181             do {
14182                 p++;
14183             } while (p < e && isBLANK(*p));
14184
14185             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14186         }
14187     }
14188
14189     /* But the first character should be a colon, which they could have easily
14190      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
14191      * distinguish from a colon, so treat that as a colon).  */
14192     if (*p == ':') {
14193         p++;
14194         has_opening_colon = TRUE;
14195     }
14196     else if (*p == ';') {
14197         found_problem = TRUE;
14198         p++;
14199         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14200         has_opening_colon = TRUE;
14201     }
14202     else {
14203         found_problem = TRUE;
14204         ADD_POSIX_WARNING(p, "there must be a starting ':'");
14205
14206         /* Consider an initial punctuation (not one of the recognized ones) to
14207          * be a left terminator */
14208         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
14209             p++;
14210         }
14211     }
14212
14213     /* They may think that you can put spaces between the components */
14214     if (isBLANK(*p)) {
14215         found_problem = TRUE;
14216
14217         do {
14218             p++;
14219         } while (p < e && isBLANK(*p));
14220
14221         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14222     }
14223
14224     if (*p == '^') {
14225
14226         /* We consider something like [^:^alnum:]] to not have been intended to
14227          * be a posix class, but XXX maybe we should */
14228         if (complement) {
14229             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14230         }
14231
14232         complement = 1;
14233         p++;
14234     }
14235
14236     /* Again, they may think that you can put spaces between the components */
14237     if (isBLANK(*p)) {
14238         found_problem = TRUE;
14239
14240         do {
14241             p++;
14242         } while (p < e && isBLANK(*p));
14243
14244         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14245     }
14246
14247     if (*p == ']') {
14248
14249         /* XXX This ']' may be a typo, and something else was meant.  But
14250          * treating it as such creates enough complications, that that
14251          * possibility isn't currently considered here.  So we assume that the
14252          * ']' is what is intended, and if we've already found an initial '[',
14253          * this leaves this construct looking like [:] or [:^], which almost
14254          * certainly weren't intended to be posix classes */
14255         if (has_opening_bracket) {
14256             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14257         }
14258
14259         /* But this function can be called when we parse the colon for
14260          * something like qr/[alpha:]]/, so we back up to look for the
14261          * beginning */
14262         p--;
14263
14264         if (*p == ';') {
14265             found_problem = TRUE;
14266             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14267         }
14268         else if (*p != ':') {
14269
14270             /* XXX We are currently very restrictive here, so this code doesn't
14271              * consider the possibility that, say, /[alpha.]]/ was intended to
14272              * be a posix class. */
14273             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14274         }
14275
14276         /* Here we have something like 'foo:]'.  There was no initial colon,
14277          * and we back up over 'foo.  XXX Unlike the going forward case, we
14278          * don't handle typos of non-word chars in the middle */
14279         has_opening_colon = FALSE;
14280         p--;
14281
14282         while (p > RExC_start && isWORDCHAR(*p)) {
14283             p--;
14284         }
14285         p++;
14286
14287         /* Here, we have positioned ourselves to where we think the first
14288          * character in the potential class is */
14289     }
14290
14291     /* Now the interior really starts.  There are certain key characters that
14292      * can end the interior, or these could just be typos.  To catch both
14293      * cases, we may have to do two passes.  In the first pass, we keep on
14294      * going unless we come to a sequence that matches
14295      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
14296      * This means it takes a sequence to end the pass, so two typos in a row if
14297      * that wasn't what was intended.  If the class is perfectly formed, just
14298      * this one pass is needed.  We also stop if there are too many characters
14299      * being accumulated, but this number is deliberately set higher than any
14300      * real class.  It is set high enough so that someone who thinks that
14301      * 'alphanumeric' is a correct name would get warned that it wasn't.
14302      * While doing the pass, we keep track of where the key characters were in
14303      * it.  If we don't find an end to the class, and one of the key characters
14304      * was found, we redo the pass, but stop when we get to that character.
14305      * Thus the key character was considered a typo in the first pass, but a
14306      * terminator in the second.  If two key characters are found, we stop at
14307      * the second one in the first pass.  Again this can miss two typos, but
14308      * catches a single one
14309      *
14310      * In the first pass, 'possible_end' starts as NULL, and then gets set to
14311      * point to the first key character.  For the second pass, it starts as -1.
14312      * */
14313
14314     name_start = p;
14315   parse_name:
14316     {
14317         bool has_blank               = FALSE;
14318         bool has_upper               = FALSE;
14319         bool has_terminating_colon   = FALSE;
14320         bool has_terminating_bracket = FALSE;
14321         bool has_semi_colon          = FALSE;
14322         unsigned int name_len        = 0;
14323         int punct_count              = 0;
14324
14325         while (p < e) {
14326
14327             /* Squeeze out blanks when looking up the class name below */
14328             if (isBLANK(*p) ) {
14329                 has_blank = TRUE;
14330                 found_problem = TRUE;
14331                 p++;
14332                 continue;
14333             }
14334
14335             /* The name will end with a punctuation */
14336             if (isPUNCT(*p)) {
14337                 const char * peek = p + 1;
14338
14339                 /* Treat any non-']' punctuation followed by a ']' (possibly
14340                  * with intervening blanks) as trying to terminate the class.
14341                  * ']]' is very likely to mean a class was intended (but
14342                  * missing the colon), but the warning message that gets
14343                  * generated shows the error position better if we exit the
14344                  * loop at the bottom (eventually), so skip it here. */
14345                 if (*p != ']') {
14346                     if (peek < e && isBLANK(*peek)) {
14347                         has_blank = TRUE;
14348                         found_problem = TRUE;
14349                         do {
14350                             peek++;
14351                         } while (peek < e && isBLANK(*peek));
14352                     }
14353
14354                     if (peek < e && *peek == ']') {
14355                         has_terminating_bracket = TRUE;
14356                         if (*p == ':') {
14357                             has_terminating_colon = TRUE;
14358                         }
14359                         else if (*p == ';') {
14360                             has_semi_colon = TRUE;
14361                             has_terminating_colon = TRUE;
14362                         }
14363                         else {
14364                             found_problem = TRUE;
14365                         }
14366                         p = peek + 1;
14367                         goto try_posix;
14368                     }
14369                 }
14370
14371                 /* Here we have punctuation we thought didn't end the class.
14372                  * Keep track of the position of the key characters that are
14373                  * more likely to have been class-enders */
14374                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
14375
14376                     /* Allow just one such possible class-ender not actually
14377                      * ending the class. */
14378                     if (possible_end) {
14379                         break;
14380                     }
14381                     possible_end = p;
14382                 }
14383
14384                 /* If we have too many punctuation characters, no use in
14385                  * keeping going */
14386                 if (++punct_count > max_distance) {
14387                     break;
14388                 }
14389
14390                 /* Treat the punctuation as a typo. */
14391                 input_text[name_len++] = *p;
14392                 p++;
14393             }
14394             else if (isUPPER(*p)) { /* Use lowercase for lookup */
14395                 input_text[name_len++] = toLOWER(*p);
14396                 has_upper = TRUE;
14397                 found_problem = TRUE;
14398                 p++;
14399             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
14400                 input_text[name_len++] = *p;
14401                 p++;
14402             }
14403             else {
14404                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
14405                 p+= UTF8SKIP(p);
14406             }
14407
14408             /* The declaration of 'input_text' is how long we allow a potential
14409              * class name to be, before saying they didn't mean a class name at
14410              * all */
14411             if (name_len >= C_ARRAY_LENGTH(input_text)) {
14412                 break;
14413             }
14414         }
14415
14416         /* We get to here when the possible class name hasn't been properly
14417          * terminated before:
14418          *   1) we ran off the end of the pattern; or
14419          *   2) found two characters, each of which might have been intended to
14420          *      be the name's terminator
14421          *   3) found so many punctuation characters in the purported name,
14422          *      that the edit distance to a valid one is exceeded
14423          *   4) we decided it was more characters than anyone could have
14424          *      intended to be one. */
14425
14426         found_problem = TRUE;
14427
14428         /* In the final two cases, we know that looking up what we've
14429          * accumulated won't lead to a match, even a fuzzy one. */
14430         if (   name_len >= C_ARRAY_LENGTH(input_text)
14431             || punct_count > max_distance)
14432         {
14433             /* If there was an intermediate key character that could have been
14434              * an intended end, redo the parse, but stop there */
14435             if (possible_end && possible_end != (char *) -1) {
14436                 possible_end = (char *) -1; /* Special signal value to say
14437                                                we've done a first pass */
14438                 p = name_start;
14439                 goto parse_name;
14440             }
14441
14442             /* Otherwise, it can't have meant to have been a class */
14443             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14444         }
14445
14446         /* If we ran off the end, and the final character was a punctuation
14447          * one, back up one, to look at that final one just below.  Later, we
14448          * will restore the parse pointer if appropriate */
14449         if (name_len && p == e && isPUNCT(*(p-1))) {
14450             p--;
14451             name_len--;
14452         }
14453
14454         if (p < e && isPUNCT(*p)) {
14455             if (*p == ']') {
14456                 has_terminating_bracket = TRUE;
14457
14458                 /* If this is a 2nd ']', and the first one is just below this
14459                  * one, consider that to be the real terminator.  This gives a
14460                  * uniform and better positioning for the warning message  */
14461                 if (   possible_end
14462                     && possible_end != (char *) -1
14463                     && *possible_end == ']'
14464                     && name_len && input_text[name_len - 1] == ']')
14465                 {
14466                     name_len--;
14467                     p = possible_end;
14468
14469                     /* And this is actually equivalent to having done the 2nd
14470                      * pass now, so set it to not try again */
14471                     possible_end = (char *) -1;
14472                 }
14473             }
14474             else {
14475                 if (*p == ':') {
14476                     has_terminating_colon = TRUE;
14477                 }
14478                 else if (*p == ';') {
14479                     has_semi_colon = TRUE;
14480                     has_terminating_colon = TRUE;
14481                 }
14482                 p++;
14483             }
14484         }
14485
14486     try_posix:
14487
14488         /* Here, we have a class name to look up.  We can short circuit the
14489          * stuff below for short names that can't possibly be meant to be a
14490          * class name.  (We can do this on the first pass, as any second pass
14491          * will yield an even shorter name) */
14492         if (name_len < 3) {
14493             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14494         }
14495
14496         /* Find which class it is.  Initially switch on the length of the name.
14497          * */
14498         switch (name_len) {
14499             case 4:
14500                 if (memEQ(name_start, "word", 4)) {
14501                     /* this is not POSIX, this is the Perl \w */
14502                     class_number = ANYOF_WORDCHAR;
14503                 }
14504                 break;
14505             case 5:
14506                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
14507                  *                        graph lower print punct space upper
14508                  * Offset 4 gives the best switch position.  */
14509                 switch (name_start[4]) {
14510                     case 'a':
14511                         if (memEQ(name_start, "alph", 4)) /* alpha */
14512                             class_number = ANYOF_ALPHA;
14513                         break;
14514                     case 'e':
14515                         if (memEQ(name_start, "spac", 4)) /* space */
14516                             class_number = ANYOF_SPACE;
14517                         break;
14518                     case 'h':
14519                         if (memEQ(name_start, "grap", 4)) /* graph */
14520                             class_number = ANYOF_GRAPH;
14521                         break;
14522                     case 'i':
14523                         if (memEQ(name_start, "asci", 4)) /* ascii */
14524                             class_number = ANYOF_ASCII;
14525                         break;
14526                     case 'k':
14527                         if (memEQ(name_start, "blan", 4)) /* blank */
14528                             class_number = ANYOF_BLANK;
14529                         break;
14530                     case 'l':
14531                         if (memEQ(name_start, "cntr", 4)) /* cntrl */
14532                             class_number = ANYOF_CNTRL;
14533                         break;
14534                     case 'm':
14535                         if (memEQ(name_start, "alnu", 4)) /* alnum */
14536                             class_number = ANYOF_ALPHANUMERIC;
14537                         break;
14538                     case 'r':
14539                         if (memEQ(name_start, "lowe", 4)) /* lower */
14540                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
14541                         else if (memEQ(name_start, "uppe", 4)) /* upper */
14542                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
14543                         break;
14544                     case 't':
14545                         if (memEQ(name_start, "digi", 4)) /* digit */
14546                             class_number = ANYOF_DIGIT;
14547                         else if (memEQ(name_start, "prin", 4)) /* print */
14548                             class_number = ANYOF_PRINT;
14549                         else if (memEQ(name_start, "punc", 4)) /* punct */
14550                             class_number = ANYOF_PUNCT;
14551                         break;
14552                 }
14553                 break;
14554             case 6:
14555                 if (memEQ(name_start, "xdigit", 6))
14556                     class_number = ANYOF_XDIGIT;
14557                 break;
14558         }
14559
14560         /* If the name exactly matches a posix class name the class number will
14561          * here be set to it, and the input almost certainly was meant to be a
14562          * posix class, so we can skip further checking.  If instead the syntax
14563          * is exactly correct, but the name isn't one of the legal ones, we
14564          * will return that as an error below.  But if neither of these apply,
14565          * it could be that no posix class was intended at all, or that one
14566          * was, but there was a typo.  We tease these apart by doing fuzzy
14567          * matching on the name */
14568         if (class_number == OOB_NAMEDCLASS && found_problem) {
14569             const UV posix_names[][6] = {
14570                                                 { 'a', 'l', 'n', 'u', 'm' },
14571                                                 { 'a', 'l', 'p', 'h', 'a' },
14572                                                 { 'a', 's', 'c', 'i', 'i' },
14573                                                 { 'b', 'l', 'a', 'n', 'k' },
14574                                                 { 'c', 'n', 't', 'r', 'l' },
14575                                                 { 'd', 'i', 'g', 'i', 't' },
14576                                                 { 'g', 'r', 'a', 'p', 'h' },
14577                                                 { 'l', 'o', 'w', 'e', 'r' },
14578                                                 { 'p', 'r', 'i', 'n', 't' },
14579                                                 { 'p', 'u', 'n', 'c', 't' },
14580                                                 { 's', 'p', 'a', 'c', 'e' },
14581                                                 { 'u', 'p', 'p', 'e', 'r' },
14582                                                 { 'w', 'o', 'r', 'd' },
14583                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
14584                                             };
14585             /* The names of the above all have added NULs to make them the same
14586              * size, so we need to also have the real lengths */
14587             const UV posix_name_lengths[] = {
14588                                                 sizeof("alnum") - 1,
14589                                                 sizeof("alpha") - 1,
14590                                                 sizeof("ascii") - 1,
14591                                                 sizeof("blank") - 1,
14592                                                 sizeof("cntrl") - 1,
14593                                                 sizeof("digit") - 1,
14594                                                 sizeof("graph") - 1,
14595                                                 sizeof("lower") - 1,
14596                                                 sizeof("print") - 1,
14597                                                 sizeof("punct") - 1,
14598                                                 sizeof("space") - 1,
14599                                                 sizeof("upper") - 1,
14600                                                 sizeof("word")  - 1,
14601                                                 sizeof("xdigit")- 1
14602                                             };
14603             unsigned int i;
14604             int temp_max = max_distance;    /* Use a temporary, so if we
14605                                                reparse, we haven't changed the
14606                                                outer one */
14607
14608             /* Use a smaller max edit distance if we are missing one of the
14609              * delimiters */
14610             if (   has_opening_bracket + has_opening_colon < 2
14611                 || has_terminating_bracket + has_terminating_colon < 2)
14612             {
14613                 temp_max--;
14614             }
14615
14616             /* See if the input name is close to a legal one */
14617             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
14618
14619                 /* Short circuit call if the lengths are too far apart to be
14620                  * able to match */
14621                 if (abs( (int) (name_len - posix_name_lengths[i]))
14622                     > temp_max)
14623                 {
14624                     continue;
14625                 }
14626
14627                 if (edit_distance(input_text,
14628                                   posix_names[i],
14629                                   name_len,
14630                                   posix_name_lengths[i],
14631                                   temp_max
14632                                  )
14633                     > -1)
14634                 { /* If it is close, it probably was intended to be a class */
14635                     goto probably_meant_to_be;
14636                 }
14637             }
14638
14639             /* Here the input name is not close enough to a valid class name
14640              * for us to consider it to be intended to be a posix class.  If
14641              * we haven't already done so, and the parse found a character that
14642              * could have been terminators for the name, but which we absorbed
14643              * as typos during the first pass, repeat the parse, signalling it
14644              * to stop at that character */
14645             if (possible_end && possible_end != (char *) -1) {
14646                 possible_end = (char *) -1;
14647                 p = name_start;
14648                 goto parse_name;
14649             }
14650
14651             /* Here neither pass found a close-enough class name */
14652             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14653         }
14654
14655     probably_meant_to_be:
14656
14657         /* Here we think that a posix specification was intended.  Update any
14658          * parse pointer */
14659         if (updated_parse_ptr) {
14660             *updated_parse_ptr = (char *) p;
14661         }
14662
14663         /* If a posix class name was intended but incorrectly specified, we
14664          * output or return the warnings */
14665         if (found_problem) {
14666
14667             /* We set flags for these issues in the parse loop above instead of
14668              * adding them to the list of warnings, because we can parse it
14669              * twice, and we only want one warning instance */
14670             if (has_upper) {
14671                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
14672             }
14673             if (has_blank) {
14674                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14675             }
14676             if (has_semi_colon) {
14677                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14678             }
14679             else if (! has_terminating_colon) {
14680                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
14681             }
14682             if (! has_terminating_bracket) {
14683                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
14684             }
14685
14686             if (posix_warnings && RExC_warn_text && av_top_index(RExC_warn_text) > -1) {
14687                 *posix_warnings = RExC_warn_text;
14688             }
14689         }
14690         else if (class_number != OOB_NAMEDCLASS) {
14691             /* If it is a known class, return the class.  The class number
14692              * #defines are structured so each complement is +1 to the normal
14693              * one */
14694             return class_number + complement;
14695         }
14696         else if (! check_only) {
14697
14698             /* Here, it is an unrecognized class.  This is an error (unless the
14699             * call is to check only, which we've already handled above) */
14700             const char * const complement_string = (complement)
14701                                                    ? "^"
14702                                                    : "";
14703             RExC_parse = (char *) p;
14704             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
14705                         complement_string,
14706                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
14707         }
14708     }
14709
14710     return OOB_NAMEDCLASS;
14711 }
14712 #undef ADD_POSIX_WARNING
14713
14714 STATIC unsigned  int
14715 S_regex_set_precedence(const U8 my_operator) {
14716
14717     /* Returns the precedence in the (?[...]) construct of the input operator,
14718      * specified by its character representation.  The precedence follows
14719      * general Perl rules, but it extends this so that ')' and ']' have (low)
14720      * precedence even though they aren't really operators */
14721
14722     switch (my_operator) {
14723         case '!':
14724             return 5;
14725         case '&':
14726             return 4;
14727         case '^':
14728         case '|':
14729         case '+':
14730         case '-':
14731             return 3;
14732         case ')':
14733             return 2;
14734         case ']':
14735             return 1;
14736     }
14737
14738     NOT_REACHED; /* NOTREACHED */
14739     return 0;   /* Silence compiler warning */
14740 }
14741
14742 STATIC regnode *
14743 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
14744                     I32 *flagp, U32 depth,
14745                     char * const oregcomp_parse)
14746 {
14747     /* Handle the (?[...]) construct to do set operations */
14748
14749     U8 curchar;                     /* Current character being parsed */
14750     UV start, end;                  /* End points of code point ranges */
14751     SV* final = NULL;               /* The end result inversion list */
14752     SV* result_string;              /* 'final' stringified */
14753     AV* stack;                      /* stack of operators and operands not yet
14754                                        resolved */
14755     AV* fence_stack = NULL;         /* A stack containing the positions in
14756                                        'stack' of where the undealt-with left
14757                                        parens would be if they were actually
14758                                        put there */
14759     /* The 'VOL' (expanding to 'volatile') is a workaround for an optimiser bug
14760      * in Solaris Studio 12.3. See RT #127455 */
14761     VOL IV fence = 0;               /* Position of where most recent undealt-
14762                                        with left paren in stack is; -1 if none.
14763                                      */
14764     STRLEN len;                     /* Temporary */
14765     regnode* node;                  /* Temporary, and final regnode returned by
14766                                        this function */
14767     const bool save_fold = FOLD;    /* Temporary */
14768     char *save_end, *save_parse;    /* Temporaries */
14769     const bool in_locale = LOC;     /* we turn off /l during processing */
14770     AV* posix_warnings = NULL;
14771
14772     GET_RE_DEBUG_FLAGS_DECL;
14773
14774     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
14775
14776     if (in_locale) {
14777         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
14778     }
14779
14780     REQUIRE_UNI_RULES(flagp, NULL);   /* The use of this operator implies /u.
14781                                          This is required so that the compile
14782                                          time values are valid in all runtime
14783                                          cases */
14784
14785     /* This will return only an ANYOF regnode, or (unlikely) something smaller
14786      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
14787      * call regclass to handle '[]' so as to not have to reinvent its parsing
14788      * rules here (throwing away the size it computes each time).  And, we exit
14789      * upon an unescaped ']' that isn't one ending a regclass.  To do both
14790      * these things, we need to realize that something preceded by a backslash
14791      * is escaped, so we have to keep track of backslashes */
14792     if (SIZE_ONLY) {
14793         UV depth = 0; /* how many nested (?[...]) constructs */
14794
14795         while (RExC_parse < RExC_end) {
14796             SV* current = NULL;
14797
14798             skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14799                                     TRUE /* Force /x */ );
14800
14801             switch (*RExC_parse) {
14802                 case '?':
14803                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
14804                     /* FALLTHROUGH */
14805                 default:
14806                     break;
14807                 case '\\':
14808                     /* Skip past this, so the next character gets skipped, after
14809                      * the switch */
14810                     RExC_parse++;
14811                     if (*RExC_parse == 'c') {
14812                             /* Skip the \cX notation for control characters */
14813                             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
14814                     }
14815                     break;
14816
14817                 case '[':
14818                 {
14819                     /* See if this is a [:posix:] class. */
14820                     bool is_posix_class = (OOB_NAMEDCLASS
14821                             < handle_possible_posix(pRExC_state,
14822                                                 RExC_parse + 1,
14823                                                 NULL,
14824                                                 NULL,
14825                                                 TRUE /* checking only */));
14826                     /* If it is a posix class, leave the parse pointer at the
14827                      * '[' to fool regclass() into thinking it is part of a
14828                      * '[[:posix:]]'. */
14829                     if (! is_posix_class) {
14830                         RExC_parse++;
14831                     }
14832
14833                     /* regclass() can only return RESTART_PASS1 and NEED_UTF8
14834                      * if multi-char folds are allowed.  */
14835                     if (!regclass(pRExC_state, flagp,depth+1,
14836                                   is_posix_class, /* parse the whole char
14837                                                      class only if not a
14838                                                      posix class */
14839                                   FALSE, /* don't allow multi-char folds */
14840                                   TRUE, /* silence non-portable warnings. */
14841                                   TRUE, /* strict */
14842                                   FALSE, /* Require return to be an ANYOF */
14843                                   &current,
14844                                   &posix_warnings
14845                                  ))
14846                         FAIL2("panic: regclass returned NULL to handle_sets, "
14847                               "flags=%#" UVxf, (UV) *flagp);
14848
14849                     /* function call leaves parse pointing to the ']', except
14850                      * if we faked it */
14851                     if (is_posix_class) {
14852                         RExC_parse--;
14853                     }
14854
14855                     SvREFCNT_dec(current);   /* In case it returned something */
14856                     break;
14857                 }
14858
14859                 case ']':
14860                     if (depth--) break;
14861                     RExC_parse++;
14862                     if (*RExC_parse == ')') {
14863                         node = reganode(pRExC_state, ANYOF, 0);
14864                         RExC_size += ANYOF_SKIP;
14865                         nextchar(pRExC_state);
14866                         Set_Node_Length(node,
14867                                 RExC_parse - oregcomp_parse + 1); /* MJD */
14868                         if (in_locale) {
14869                             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
14870                         }
14871
14872                         return node;
14873                     }
14874                     goto no_close;
14875             }
14876
14877             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
14878         }
14879
14880       no_close:
14881         /* We output the messages even if warnings are off, because we'll fail
14882          * the very next thing, and these give a likely diagnosis for that */
14883         if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
14884             output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
14885         }
14886
14887         FAIL("Syntax error in (?[...])");
14888     }
14889
14890     /* Pass 2 only after this. */
14891     Perl_ck_warner_d(aTHX_
14892         packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
14893         "The regex_sets feature is experimental" REPORT_LOCATION,
14894         REPORT_LOCATION_ARGS(RExC_parse));
14895
14896     /* Everything in this construct is a metacharacter.  Operands begin with
14897      * either a '\' (for an escape sequence), or a '[' for a bracketed
14898      * character class.  Any other character should be an operator, or
14899      * parenthesis for grouping.  Both types of operands are handled by calling
14900      * regclass() to parse them.  It is called with a parameter to indicate to
14901      * return the computed inversion list.  The parsing here is implemented via
14902      * a stack.  Each entry on the stack is a single character representing one
14903      * of the operators; or else a pointer to an operand inversion list. */
14904
14905 #define IS_OPERATOR(a) SvIOK(a)
14906 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
14907
14908     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
14909      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
14910      * with pronouncing it called it Reverse Polish instead, but now that YOU
14911      * know how to pronounce it you can use the correct term, thus giving due
14912      * credit to the person who invented it, and impressing your geek friends.
14913      * Wikipedia says that the pronounciation of "Ł" has been changing so that
14914      * it is now more like an English initial W (as in wonk) than an L.)
14915      *
14916      * This means that, for example, 'a | b & c' is stored on the stack as
14917      *
14918      * c  [4]
14919      * b  [3]
14920      * &  [2]
14921      * a  [1]
14922      * |  [0]
14923      *
14924      * where the numbers in brackets give the stack [array] element number.
14925      * In this implementation, parentheses are not stored on the stack.
14926      * Instead a '(' creates a "fence" so that the part of the stack below the
14927      * fence is invisible except to the corresponding ')' (this allows us to
14928      * replace testing for parens, by using instead subtraction of the fence
14929      * position).  As new operands are processed they are pushed onto the stack
14930      * (except as noted in the next paragraph).  New operators of higher
14931      * precedence than the current final one are inserted on the stack before
14932      * the lhs operand (so that when the rhs is pushed next, everything will be
14933      * in the correct positions shown above.  When an operator of equal or
14934      * lower precedence is encountered in parsing, all the stacked operations
14935      * of equal or higher precedence are evaluated, leaving the result as the
14936      * top entry on the stack.  This makes higher precedence operations
14937      * evaluate before lower precedence ones, and causes operations of equal
14938      * precedence to left associate.
14939      *
14940      * The only unary operator '!' is immediately pushed onto the stack when
14941      * encountered.  When an operand is encountered, if the top of the stack is
14942      * a '!", the complement is immediately performed, and the '!' popped.  The
14943      * resulting value is treated as a new operand, and the logic in the
14944      * previous paragraph is executed.  Thus in the expression
14945      *      [a] + ! [b]
14946      * the stack looks like
14947      *
14948      * !
14949      * a
14950      * +
14951      *
14952      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
14953      * becomes
14954      *
14955      * !b
14956      * a
14957      * +
14958      *
14959      * A ')' is treated as an operator with lower precedence than all the
14960      * aforementioned ones, which causes all operations on the stack above the
14961      * corresponding '(' to be evaluated down to a single resultant operand.
14962      * Then the fence for the '(' is removed, and the operand goes through the
14963      * algorithm above, without the fence.
14964      *
14965      * A separate stack is kept of the fence positions, so that the position of
14966      * the latest so-far unbalanced '(' is at the top of it.
14967      *
14968      * The ']' ending the construct is treated as the lowest operator of all,
14969      * so that everything gets evaluated down to a single operand, which is the
14970      * result */
14971
14972     sv_2mortal((SV *)(stack = newAV()));
14973     sv_2mortal((SV *)(fence_stack = newAV()));
14974
14975     while (RExC_parse < RExC_end) {
14976         I32 top_index;              /* Index of top-most element in 'stack' */
14977         SV** top_ptr;               /* Pointer to top 'stack' element */
14978         SV* current = NULL;         /* To contain the current inversion list
14979                                        operand */
14980         SV* only_to_avoid_leaks;
14981
14982         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14983                                 TRUE /* Force /x */ );
14984         if (RExC_parse >= RExC_end) {
14985             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
14986         }
14987
14988         curchar = UCHARAT(RExC_parse);
14989
14990 redo_curchar:
14991
14992 #ifdef ENABLE_REGEX_SETS_DEBUGGING
14993                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
14994         DEBUG_U(dump_regex_sets_structures(pRExC_state,
14995                                            stack, fence, fence_stack));
14996 #endif
14997
14998         top_index = av_tindex_skip_len_mg(stack);
14999
15000         switch (curchar) {
15001             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
15002             char stacked_operator;  /* The topmost operator on the 'stack'. */
15003             SV* lhs;                /* Operand to the left of the operator */
15004             SV* rhs;                /* Operand to the right of the operator */
15005             SV* fence_ptr;          /* Pointer to top element of the fence
15006                                        stack */
15007
15008             case '(':
15009
15010                 if (   RExC_parse < RExC_end - 1
15011                     && (UCHARAT(RExC_parse + 1) == '?'))
15012                 {
15013                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
15014                      * This happens when we have some thing like
15015                      *
15016                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
15017                      *   ...
15018                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
15019                      *
15020                      * Here we would be handling the interpolated
15021                      * '$thai_or_lao'.  We handle this by a recursive call to
15022                      * ourselves which returns the inversion list the
15023                      * interpolated expression evaluates to.  We use the flags
15024                      * from the interpolated pattern. */
15025                     U32 save_flags = RExC_flags;
15026                     const char * save_parse;
15027
15028                     RExC_parse += 2;        /* Skip past the '(?' */
15029                     save_parse = RExC_parse;
15030
15031                     /* Parse any flags for the '(?' */
15032                     parse_lparen_question_flags(pRExC_state);
15033
15034                     if (RExC_parse == save_parse  /* Makes sure there was at
15035                                                      least one flag (or else
15036                                                      this embedding wasn't
15037                                                      compiled) */
15038                         || RExC_parse >= RExC_end - 4
15039                         || UCHARAT(RExC_parse) != ':'
15040                         || UCHARAT(++RExC_parse) != '('
15041                         || UCHARAT(++RExC_parse) != '?'
15042                         || UCHARAT(++RExC_parse) != '[')
15043                     {
15044
15045                         /* In combination with the above, this moves the
15046                          * pointer to the point just after the first erroneous
15047                          * character (or if there are no flags, to where they
15048                          * should have been) */
15049                         if (RExC_parse >= RExC_end - 4) {
15050                             RExC_parse = RExC_end;
15051                         }
15052                         else if (RExC_parse != save_parse) {
15053                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15054                         }
15055                         vFAIL("Expecting '(?flags:(?[...'");
15056                     }
15057
15058                     /* Recurse, with the meat of the embedded expression */
15059                     RExC_parse++;
15060                     (void) handle_regex_sets(pRExC_state, &current, flagp,
15061                                                     depth+1, oregcomp_parse);
15062
15063                     /* Here, 'current' contains the embedded expression's
15064                      * inversion list, and RExC_parse points to the trailing
15065                      * ']'; the next character should be the ')' */
15066                     RExC_parse++;
15067                     assert(UCHARAT(RExC_parse) == ')');
15068
15069                     /* Then the ')' matching the original '(' handled by this
15070                      * case: statement */
15071                     RExC_parse++;
15072                     assert(UCHARAT(RExC_parse) == ')');
15073
15074                     RExC_parse++;
15075                     RExC_flags = save_flags;
15076                     goto handle_operand;
15077                 }
15078
15079                 /* A regular '('.  Look behind for illegal syntax */
15080                 if (top_index - fence >= 0) {
15081                     /* If the top entry on the stack is an operator, it had
15082                      * better be a '!', otherwise the entry below the top
15083                      * operand should be an operator */
15084                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
15085                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
15086                         || (   IS_OPERAND(*top_ptr)
15087                             && (   top_index - fence < 1
15088                                 || ! (stacked_ptr = av_fetch(stack,
15089                                                              top_index - 1,
15090                                                              FALSE))
15091                                 || ! IS_OPERATOR(*stacked_ptr))))
15092                     {
15093                         RExC_parse++;
15094                         vFAIL("Unexpected '(' with no preceding operator");
15095                     }
15096                 }
15097
15098                 /* Stack the position of this undealt-with left paren */
15099                 av_push(fence_stack, newSViv(fence));
15100                 fence = top_index + 1;
15101                 break;
15102
15103             case '\\':
15104                 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
15105                  * multi-char folds are allowed.  */
15106                 if (!regclass(pRExC_state, flagp,depth+1,
15107                               TRUE, /* means parse just the next thing */
15108                               FALSE, /* don't allow multi-char folds */
15109                               FALSE, /* don't silence non-portable warnings.  */
15110                               TRUE,  /* strict */
15111                               FALSE, /* Require return to be an ANYOF */
15112                               &current,
15113                               NULL))
15114                 {
15115                     FAIL2("panic: regclass returned NULL to handle_sets, "
15116                           "flags=%#" UVxf, (UV) *flagp);
15117                 }
15118
15119                 /* regclass() will return with parsing just the \ sequence,
15120                  * leaving the parse pointer at the next thing to parse */
15121                 RExC_parse--;
15122                 goto handle_operand;
15123
15124             case '[':   /* Is a bracketed character class */
15125             {
15126                 /* See if this is a [:posix:] class. */
15127                 bool is_posix_class = (OOB_NAMEDCLASS
15128                             < handle_possible_posix(pRExC_state,
15129                                                 RExC_parse + 1,
15130                                                 NULL,
15131                                                 NULL,
15132                                                 TRUE /* checking only */));
15133                 /* If it is a posix class, leave the parse pointer at the '['
15134                  * to fool regclass() into thinking it is part of a
15135                  * '[[:posix:]]'. */
15136                 if (! is_posix_class) {
15137                     RExC_parse++;
15138                 }
15139
15140                 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
15141                  * multi-char folds are allowed.  */
15142                 if (!regclass(pRExC_state, flagp,depth+1,
15143                                 is_posix_class, /* parse the whole char
15144                                                     class only if not a
15145                                                     posix class */
15146                                 FALSE, /* don't allow multi-char folds */
15147                                 TRUE, /* silence non-portable warnings. */
15148                                 TRUE, /* strict */
15149                                 FALSE, /* Require return to be an ANYOF */
15150                                 &current,
15151                                 NULL
15152                                 ))
15153                 {
15154                     FAIL2("panic: regclass returned NULL to handle_sets, "
15155                           "flags=%#" UVxf, (UV) *flagp);
15156                 }
15157
15158                 /* function call leaves parse pointing to the ']', except if we
15159                  * faked it */
15160                 if (is_posix_class) {
15161                     RExC_parse--;
15162                 }
15163
15164                 goto handle_operand;
15165             }
15166
15167             case ']':
15168                 if (top_index >= 1) {
15169                     goto join_operators;
15170                 }
15171
15172                 /* Only a single operand on the stack: are done */
15173                 goto done;
15174
15175             case ')':
15176                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
15177                     RExC_parse++;
15178                     vFAIL("Unexpected ')'");
15179                 }
15180
15181                 /* If nothing after the fence, is missing an operand */
15182                 if (top_index - fence < 0) {
15183                     RExC_parse++;
15184                     goto bad_syntax;
15185                 }
15186                 /* If at least two things on the stack, treat this as an
15187                   * operator */
15188                 if (top_index - fence >= 1) {
15189                     goto join_operators;
15190                 }
15191
15192                 /* Here only a single thing on the fenced stack, and there is a
15193                  * fence.  Get rid of it */
15194                 fence_ptr = av_pop(fence_stack);
15195                 assert(fence_ptr);
15196                 fence = SvIV(fence_ptr) - 1;
15197                 SvREFCNT_dec_NN(fence_ptr);
15198                 fence_ptr = NULL;
15199
15200                 if (fence < 0) {
15201                     fence = 0;
15202                 }
15203
15204                 /* Having gotten rid of the fence, we pop the operand at the
15205                  * stack top and process it as a newly encountered operand */
15206                 current = av_pop(stack);
15207                 if (IS_OPERAND(current)) {
15208                     goto handle_operand;
15209                 }
15210
15211                 RExC_parse++;
15212                 goto bad_syntax;
15213
15214             case '&':
15215             case '|':
15216             case '+':
15217             case '-':
15218             case '^':
15219
15220                 /* These binary operators should have a left operand already
15221                  * parsed */
15222                 if (   top_index - fence < 0
15223                     || top_index - fence == 1
15224                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
15225                     || ! IS_OPERAND(*top_ptr))
15226                 {
15227                     goto unexpected_binary;
15228                 }
15229
15230                 /* If only the one operand is on the part of the stack visible
15231                  * to us, we just place this operator in the proper position */
15232                 if (top_index - fence < 2) {
15233
15234                     /* Place the operator before the operand */
15235
15236                     SV* lhs = av_pop(stack);
15237                     av_push(stack, newSVuv(curchar));
15238                     av_push(stack, lhs);
15239                     break;
15240                 }
15241
15242                 /* But if there is something else on the stack, we need to
15243                  * process it before this new operator if and only if the
15244                  * stacked operation has equal or higher precedence than the
15245                  * new one */
15246
15247              join_operators:
15248
15249                 /* The operator on the stack is supposed to be below both its
15250                  * operands */
15251                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
15252                     || IS_OPERAND(*stacked_ptr))
15253                 {
15254                     /* But if not, it's legal and indicates we are completely
15255                      * done if and only if we're currently processing a ']',
15256                      * which should be the final thing in the expression */
15257                     if (curchar == ']') {
15258                         goto done;
15259                     }
15260
15261                   unexpected_binary:
15262                     RExC_parse++;
15263                     vFAIL2("Unexpected binary operator '%c' with no "
15264                            "preceding operand", curchar);
15265                 }
15266                 stacked_operator = (char) SvUV(*stacked_ptr);
15267
15268                 if (regex_set_precedence(curchar)
15269                     > regex_set_precedence(stacked_operator))
15270                 {
15271                     /* Here, the new operator has higher precedence than the
15272                      * stacked one.  This means we need to add the new one to
15273                      * the stack to await its rhs operand (and maybe more
15274                      * stuff).  We put it before the lhs operand, leaving
15275                      * untouched the stacked operator and everything below it
15276                      * */
15277                     lhs = av_pop(stack);
15278                     assert(IS_OPERAND(lhs));
15279
15280                     av_push(stack, newSVuv(curchar));
15281                     av_push(stack, lhs);
15282                     break;
15283                 }
15284
15285                 /* Here, the new operator has equal or lower precedence than
15286                  * what's already there.  This means the operation already
15287                  * there should be performed now, before the new one. */
15288
15289                 rhs = av_pop(stack);
15290                 if (! IS_OPERAND(rhs)) {
15291
15292                     /* This can happen when a ! is not followed by an operand,
15293                      * like in /(?[\t &!])/ */
15294                     goto bad_syntax;
15295                 }
15296
15297                 lhs = av_pop(stack);
15298
15299                 if (! IS_OPERAND(lhs)) {
15300
15301                     /* This can happen when there is an empty (), like in
15302                      * /(?[[0]+()+])/ */
15303                     goto bad_syntax;
15304                 }
15305
15306                 switch (stacked_operator) {
15307                     case '&':
15308                         _invlist_intersection(lhs, rhs, &rhs);
15309                         break;
15310
15311                     case '|':
15312                     case '+':
15313                         _invlist_union(lhs, rhs, &rhs);
15314                         break;
15315
15316                     case '-':
15317                         _invlist_subtract(lhs, rhs, &rhs);
15318                         break;
15319
15320                     case '^':   /* The union minus the intersection */
15321                     {
15322                         SV* i = NULL;
15323                         SV* u = NULL;
15324
15325                         _invlist_union(lhs, rhs, &u);
15326                         _invlist_intersection(lhs, rhs, &i);
15327                         _invlist_subtract(u, i, &rhs);
15328                         SvREFCNT_dec_NN(i);
15329                         SvREFCNT_dec_NN(u);
15330                         break;
15331                     }
15332                 }
15333                 SvREFCNT_dec(lhs);
15334
15335                 /* Here, the higher precedence operation has been done, and the
15336                  * result is in 'rhs'.  We overwrite the stacked operator with
15337                  * the result.  Then we redo this code to either push the new
15338                  * operator onto the stack or perform any higher precedence
15339                  * stacked operation */
15340                 only_to_avoid_leaks = av_pop(stack);
15341                 SvREFCNT_dec(only_to_avoid_leaks);
15342                 av_push(stack, rhs);
15343                 goto redo_curchar;
15344
15345             case '!':   /* Highest priority, right associative */
15346
15347                 /* If what's already at the top of the stack is another '!",
15348                  * they just cancel each other out */
15349                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
15350                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
15351                 {
15352                     only_to_avoid_leaks = av_pop(stack);
15353                     SvREFCNT_dec(only_to_avoid_leaks);
15354                 }
15355                 else { /* Otherwise, since it's right associative, just push
15356                           onto the stack */
15357                     av_push(stack, newSVuv(curchar));
15358                 }
15359                 break;
15360
15361             default:
15362                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15363                 vFAIL("Unexpected character");
15364
15365           handle_operand:
15366
15367             /* Here 'current' is the operand.  If something is already on the
15368              * stack, we have to check if it is a !.  But first, the code above
15369              * may have altered the stack in the time since we earlier set
15370              * 'top_index'.  */
15371
15372             top_index = av_tindex_skip_len_mg(stack);
15373             if (top_index - fence >= 0) {
15374                 /* If the top entry on the stack is an operator, it had better
15375                  * be a '!', otherwise the entry below the top operand should
15376                  * be an operator */
15377                 top_ptr = av_fetch(stack, top_index, FALSE);
15378                 assert(top_ptr);
15379                 if (IS_OPERATOR(*top_ptr)) {
15380
15381                     /* The only permissible operator at the top of the stack is
15382                      * '!', which is applied immediately to this operand. */
15383                     curchar = (char) SvUV(*top_ptr);
15384                     if (curchar != '!') {
15385                         SvREFCNT_dec(current);
15386                         vFAIL2("Unexpected binary operator '%c' with no "
15387                                 "preceding operand", curchar);
15388                     }
15389
15390                     _invlist_invert(current);
15391
15392                     only_to_avoid_leaks = av_pop(stack);
15393                     SvREFCNT_dec(only_to_avoid_leaks);
15394
15395                     /* And we redo with the inverted operand.  This allows
15396                      * handling multiple ! in a row */
15397                     goto handle_operand;
15398                 }
15399                           /* Single operand is ok only for the non-binary ')'
15400                            * operator */
15401                 else if ((top_index - fence == 0 && curchar != ')')
15402                          || (top_index - fence > 0
15403                              && (! (stacked_ptr = av_fetch(stack,
15404                                                            top_index - 1,
15405                                                            FALSE))
15406                                  || IS_OPERAND(*stacked_ptr))))
15407                 {
15408                     SvREFCNT_dec(current);
15409                     vFAIL("Operand with no preceding operator");
15410                 }
15411             }
15412
15413             /* Here there was nothing on the stack or the top element was
15414              * another operand.  Just add this new one */
15415             av_push(stack, current);
15416
15417         } /* End of switch on next parse token */
15418
15419         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15420     } /* End of loop parsing through the construct */
15421
15422   done:
15423     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
15424         vFAIL("Unmatched (");
15425     }
15426
15427     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
15428         || ((final = av_pop(stack)) == NULL)
15429         || ! IS_OPERAND(final)
15430         || SvTYPE(final) != SVt_INVLIST
15431         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
15432     {
15433       bad_syntax:
15434         SvREFCNT_dec(final);
15435         vFAIL("Incomplete expression within '(?[ ])'");
15436     }
15437
15438     /* Here, 'final' is the resultant inversion list from evaluating the
15439      * expression.  Return it if so requested */
15440     if (return_invlist) {
15441         *return_invlist = final;
15442         return END;
15443     }
15444
15445     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
15446      * expecting a string of ranges and individual code points */
15447     invlist_iterinit(final);
15448     result_string = newSVpvs("");
15449     while (invlist_iternext(final, &start, &end)) {
15450         if (start == end) {
15451             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
15452         }
15453         else {
15454             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
15455                                                      start,          end);
15456         }
15457     }
15458
15459     /* About to generate an ANYOF (or similar) node from the inversion list we
15460      * have calculated */
15461     save_parse = RExC_parse;
15462     RExC_parse = SvPV(result_string, len);
15463     save_end = RExC_end;
15464     RExC_end = RExC_parse + len;
15465
15466     /* We turn off folding around the call, as the class we have constructed
15467      * already has all folding taken into consideration, and we don't want
15468      * regclass() to add to that */
15469     RExC_flags &= ~RXf_PMf_FOLD;
15470     /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
15471      * folds are allowed.  */
15472     node = regclass(pRExC_state, flagp,depth+1,
15473                     FALSE, /* means parse the whole char class */
15474                     FALSE, /* don't allow multi-char folds */
15475                     TRUE, /* silence non-portable warnings.  The above may very
15476                              well have generated non-portable code points, but
15477                              they're valid on this machine */
15478                     FALSE, /* similarly, no need for strict */
15479                     FALSE, /* Require return to be an ANYOF */
15480                     NULL,
15481                     NULL
15482                 );
15483     if (!node)
15484         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#" UVxf,
15485                     PTR2UV(flagp));
15486
15487     /* Fix up the node type if we are in locale.  (We have pretended we are
15488      * under /u for the purposes of regclass(), as this construct will only
15489      * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
15490      * as to cause any warnings about bad locales to be output in regexec.c),
15491      * and add the flag that indicates to check if not in a UTF-8 locale.  The
15492      * reason we above forbid optimization into something other than an ANYOF
15493      * node is simply to minimize the number of code changes in regexec.c.
15494      * Otherwise we would have to create new EXACTish node types and deal with
15495      * them.  This decision could be revisited should this construct become
15496      * popular.
15497      *
15498      * (One might think we could look at the resulting ANYOF node and suppress
15499      * the flag if everything is above 255, as those would be UTF-8 only,
15500      * but this isn't true, as the components that led to that result could
15501      * have been locale-affected, and just happen to cancel each other out
15502      * under UTF-8 locales.) */
15503     if (in_locale) {
15504         set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
15505
15506         assert(OP(node) == ANYOF);
15507
15508         OP(node) = ANYOFL;
15509         ANYOF_FLAGS(node)
15510                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
15511     }
15512
15513     if (save_fold) {
15514         RExC_flags |= RXf_PMf_FOLD;
15515     }
15516
15517     RExC_parse = save_parse + 1;
15518     RExC_end = save_end;
15519     SvREFCNT_dec_NN(final);
15520     SvREFCNT_dec_NN(result_string);
15521
15522     nextchar(pRExC_state);
15523     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
15524     return node;
15525 }
15526
15527 #ifdef ENABLE_REGEX_SETS_DEBUGGING
15528
15529 STATIC void
15530 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
15531                              AV * stack, const IV fence, AV * fence_stack)
15532 {   /* Dumps the stacks in handle_regex_sets() */
15533
15534     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
15535     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
15536     SSize_t i;
15537
15538     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
15539
15540     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
15541
15542     if (stack_top < 0) {
15543         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
15544     }
15545     else {
15546         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
15547         for (i = stack_top; i >= 0; i--) {
15548             SV ** element_ptr = av_fetch(stack, i, FALSE);
15549             if (! element_ptr) {
15550             }
15551
15552             if (IS_OPERATOR(*element_ptr)) {
15553                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
15554                                             (int) i, (int) SvIV(*element_ptr));
15555             }
15556             else {
15557                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
15558                 sv_dump(*element_ptr);
15559             }
15560         }
15561     }
15562
15563     if (fence_stack_top < 0) {
15564         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
15565     }
15566     else {
15567         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
15568         for (i = fence_stack_top; i >= 0; i--) {
15569             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
15570             if (! element_ptr) {
15571             }
15572
15573             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
15574                                             (int) i, (int) SvIV(*element_ptr));
15575         }
15576     }
15577 }
15578
15579 #endif
15580
15581 #undef IS_OPERATOR
15582 #undef IS_OPERAND
15583
15584 STATIC void
15585 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
15586 {
15587     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
15588      * innocent-looking character class, like /[ks]/i won't have to go out to
15589      * disk to find the possible matches.
15590      *
15591      * This should be called only for a Latin1-range code points, cp, which is
15592      * known to be involved in a simple fold with other code points above
15593      * Latin1.  It would give false results if /aa has been specified.
15594      * Multi-char folds are outside the scope of this, and must be handled
15595      * specially.
15596      *
15597      * XXX It would be better to generate these via regen, in case a new
15598      * version of the Unicode standard adds new mappings, though that is not
15599      * really likely, and may be caught by the default: case of the switch
15600      * below. */
15601
15602     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
15603
15604     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
15605
15606     switch (cp) {
15607         case 'k':
15608         case 'K':
15609           *invlist =
15610              add_cp_to_invlist(*invlist, KELVIN_SIGN);
15611             break;
15612         case 's':
15613         case 'S':
15614           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
15615             break;
15616         case MICRO_SIGN:
15617           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
15618           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
15619             break;
15620         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
15621         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
15622           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
15623             break;
15624         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
15625           *invlist = add_cp_to_invlist(*invlist,
15626                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
15627             break;
15628
15629 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
15630
15631         case LATIN_SMALL_LETTER_SHARP_S:
15632           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
15633             break;
15634
15635 #endif
15636
15637 #if    UNICODE_MAJOR_VERSION < 3                                        \
15638    || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
15639
15640         /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
15641          * U+0131.  */
15642         case 'i':
15643         case 'I':
15644           *invlist =
15645              add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
15646 #   if UNICODE_DOT_DOT_VERSION == 1
15647           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
15648 #   endif
15649             break;
15650 #endif
15651
15652         default:
15653             /* Use deprecated warning to increase the chances of this being
15654              * output */
15655             if (PASS2) {
15656                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
15657             }
15658             break;
15659     }
15660 }
15661
15662 STATIC void
15663 S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
15664 {
15665     /* If the final parameter is NULL, output the elements of the array given
15666      * by '*posix_warnings' as REGEXP warnings.  Otherwise, the elements are
15667      * pushed onto it, (creating if necessary) */
15668
15669     SV * msg;
15670     const bool first_is_fatal =  ! return_posix_warnings
15671                                 && ckDEAD(packWARN(WARN_REGEXP));
15672
15673     PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
15674
15675     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
15676         if (return_posix_warnings) {
15677             if (! *return_posix_warnings) { /* mortalize to not leak if
15678                                                warnings are fatal */
15679                 *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
15680             }
15681             av_push(*return_posix_warnings, msg);
15682         }
15683         else {
15684             if (first_is_fatal) {           /* Avoid leaking this */
15685                 av_undef(posix_warnings);   /* This isn't necessary if the
15686                                                array is mortal, but is a
15687                                                fail-safe */
15688                 (void) sv_2mortal(msg);
15689                 if (PASS2) {
15690                     SAVEFREESV(RExC_rx_sv);
15691                 }
15692             }
15693             Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
15694             SvREFCNT_dec_NN(msg);
15695         }
15696     }
15697 }
15698
15699 STATIC AV *
15700 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
15701 {
15702     /* This adds the string scalar <multi_string> to the array
15703      * <multi_char_matches>.  <multi_string> is known to have exactly
15704      * <cp_count> code points in it.  This is used when constructing a
15705      * bracketed character class and we find something that needs to match more
15706      * than a single character.
15707      *
15708      * <multi_char_matches> is actually an array of arrays.  Each top-level
15709      * element is an array that contains all the strings known so far that are
15710      * the same length.  And that length (in number of code points) is the same
15711      * as the index of the top-level array.  Hence, the [2] element is an
15712      * array, each element thereof is a string containing TWO code points;
15713      * while element [3] is for strings of THREE characters, and so on.  Since
15714      * this is for multi-char strings there can never be a [0] nor [1] element.
15715      *
15716      * When we rewrite the character class below, we will do so such that the
15717      * longest strings are written first, so that it prefers the longest
15718      * matching strings first.  This is done even if it turns out that any
15719      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
15720      * Christiansen has agreed that this is ok.  This makes the test for the
15721      * ligature 'ffi' come before the test for 'ff', for example */
15722
15723     AV* this_array;
15724     AV** this_array_ptr;
15725
15726     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
15727
15728     if (! multi_char_matches) {
15729         multi_char_matches = newAV();
15730     }
15731
15732     if (av_exists(multi_char_matches, cp_count)) {
15733         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
15734         this_array = *this_array_ptr;
15735     }
15736     else {
15737         this_array = newAV();
15738         av_store(multi_char_matches, cp_count,
15739                  (SV*) this_array);
15740     }
15741     av_push(this_array, multi_string);
15742
15743     return multi_char_matches;
15744 }
15745
15746 /* The names of properties whose definitions are not known at compile time are
15747  * stored in this SV, after a constant heading.  So if the length has been
15748  * changed since initialization, then there is a run-time definition. */
15749 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
15750                                         (SvCUR(listsv) != initial_listsv_len)
15751
15752 /* There is a restricted set of white space characters that are legal when
15753  * ignoring white space in a bracketed character class.  This generates the
15754  * code to skip them.
15755  *
15756  * There is a line below that uses the same white space criteria but is outside
15757  * this macro.  Both here and there must use the same definition */
15758 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
15759     STMT_START {                                                        \
15760         if (do_skip) {                                                  \
15761             while (isBLANK_A(UCHARAT(p)))                               \
15762             {                                                           \
15763                 p++;                                                    \
15764             }                                                           \
15765         }                                                               \
15766     } STMT_END
15767
15768 STATIC regnode *
15769 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
15770                  const bool stop_at_1,  /* Just parse the next thing, don't
15771                                            look for a full character class */
15772                  bool allow_multi_folds,
15773                  const bool silence_non_portable,   /* Don't output warnings
15774                                                        about too large
15775                                                        characters */
15776                  const bool strict,
15777                  bool optimizable,                  /* ? Allow a non-ANYOF return
15778                                                        node */
15779                  SV** ret_invlist, /* Return an inversion list, not a node */
15780                  AV** return_posix_warnings
15781           )
15782 {
15783     /* parse a bracketed class specification.  Most of these will produce an
15784      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
15785      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
15786      * under /i with multi-character folds: it will be rewritten following the
15787      * paradigm of this example, where the <multi-fold>s are characters which
15788      * fold to multiple character sequences:
15789      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
15790      * gets effectively rewritten as:
15791      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
15792      * reg() gets called (recursively) on the rewritten version, and this
15793      * function will return what it constructs.  (Actually the <multi-fold>s
15794      * aren't physically removed from the [abcdefghi], it's just that they are
15795      * ignored in the recursion by means of a flag:
15796      * <RExC_in_multi_char_class>.)
15797      *
15798      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
15799      * characters, with the corresponding bit set if that character is in the
15800      * list.  For characters above this, a range list or swash is used.  There
15801      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
15802      * determinable at compile time
15803      *
15804      * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
15805      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
15806      * to UTF-8.  This can only happen if ret_invlist is non-NULL.
15807      */
15808
15809     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
15810     IV range = 0;
15811     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
15812     regnode *ret;
15813     STRLEN numlen;
15814     int namedclass = OOB_NAMEDCLASS;
15815     char *rangebegin = NULL;
15816     bool need_class = 0;
15817     SV *listsv = NULL;
15818     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
15819                                       than just initialized.  */
15820     SV* properties = NULL;    /* Code points that match \p{} \P{} */
15821     SV* posixes = NULL;     /* Code points that match classes like [:word:],
15822                                extended beyond the Latin1 range.  These have to
15823                                be kept separate from other code points for much
15824                                of this function because their handling  is
15825                                different under /i, and for most classes under
15826                                /d as well */
15827     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
15828                                separate for a while from the non-complemented
15829                                versions because of complications with /d
15830                                matching */
15831     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
15832                                   treated more simply than the general case,
15833                                   leading to less compilation and execution
15834                                   work */
15835     UV element_count = 0;   /* Number of distinct elements in the class.
15836                                Optimizations may be possible if this is tiny */
15837     AV * multi_char_matches = NULL; /* Code points that fold to more than one
15838                                        character; used under /i */
15839     UV n;
15840     char * stop_ptr = RExC_end;    /* where to stop parsing */
15841
15842     /* ignore unescaped whitespace? */
15843     const bool skip_white = cBOOL(   ret_invlist
15844                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
15845
15846     /* Unicode properties are stored in a swash; this holds the current one
15847      * being parsed.  If this swash is the only above-latin1 component of the
15848      * character class, an optimization is to pass it directly on to the
15849      * execution engine.  Otherwise, it is set to NULL to indicate that there
15850      * are other things in the class that have to be dealt with at execution
15851      * time */
15852     SV* swash = NULL;           /* Code points that match \p{} \P{} */
15853
15854     /* Set if a component of this character class is user-defined; just passed
15855      * on to the engine */
15856     bool has_user_defined_property = FALSE;
15857
15858     /* inversion list of code points this node matches only when the target
15859      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
15860      * /d) */
15861     SV* has_upper_latin1_only_utf8_matches = NULL;
15862
15863     /* Inversion list of code points this node matches regardless of things
15864      * like locale, folding, utf8ness of the target string */
15865     SV* cp_list = NULL;
15866
15867     /* Like cp_list, but code points on this list need to be checked for things
15868      * that fold to/from them under /i */
15869     SV* cp_foldable_list = NULL;
15870
15871     /* Like cp_list, but code points on this list are valid only when the
15872      * runtime locale is UTF-8 */
15873     SV* only_utf8_locale_list = NULL;
15874
15875     /* In a range, if one of the endpoints is non-character-set portable,
15876      * meaning that it hard-codes a code point that may mean a different
15877      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
15878      * mnemonic '\t' which each mean the same character no matter which
15879      * character set the platform is on. */
15880     unsigned int non_portable_endpoint = 0;
15881
15882     /* Is the range unicode? which means on a platform that isn't 1-1 native
15883      * to Unicode (i.e. non-ASCII), each code point in it should be considered
15884      * to be a Unicode value.  */
15885     bool unicode_range = FALSE;
15886     bool invert = FALSE;    /* Is this class to be complemented */
15887
15888     bool warn_super = ALWAYS_WARN_SUPER;
15889
15890     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
15891         case we need to change the emitted regop to an EXACT. */
15892     const char * orig_parse = RExC_parse;
15893     const SSize_t orig_size = RExC_size;
15894     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
15895
15896     /* This variable is used to mark where the end in the input is of something
15897      * that looks like a POSIX construct but isn't.  During the parse, when
15898      * something looks like it could be such a construct is encountered, it is
15899      * checked for being one, but not if we've already checked this area of the
15900      * input.  Only after this position is reached do we check again */
15901     char *not_posix_region_end = RExC_parse - 1;
15902
15903     AV* posix_warnings = NULL;
15904     const bool do_posix_warnings =     return_posix_warnings
15905                                    || (PASS2 && ckWARN(WARN_REGEXP));
15906
15907     GET_RE_DEBUG_FLAGS_DECL;
15908
15909     PERL_ARGS_ASSERT_REGCLASS;
15910 #ifndef DEBUGGING
15911     PERL_UNUSED_ARG(depth);
15912 #endif
15913
15914     DEBUG_PARSE("clas");
15915
15916 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
15917     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
15918                                    && UNICODE_DOT_DOT_VERSION == 0)
15919     allow_multi_folds = FALSE;
15920 #endif
15921
15922     /* Assume we are going to generate an ANYOF node. */
15923     ret = reganode(pRExC_state,
15924                    (LOC)
15925                     ? ANYOFL
15926                     : ANYOF,
15927                    0);
15928
15929     if (SIZE_ONLY) {
15930         RExC_size += ANYOF_SKIP;
15931         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
15932     }
15933     else {
15934         ANYOF_FLAGS(ret) = 0;
15935
15936         RExC_emit += ANYOF_SKIP;
15937         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
15938         initial_listsv_len = SvCUR(listsv);
15939         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
15940     }
15941
15942     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15943
15944     assert(RExC_parse <= RExC_end);
15945
15946     if (UCHARAT(RExC_parse) == '^') {   /* Complement the class */
15947         RExC_parse++;
15948         invert = TRUE;
15949         allow_multi_folds = FALSE;
15950         MARK_NAUGHTY(1);
15951         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15952     }
15953
15954     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
15955     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
15956         int maybe_class = handle_possible_posix(pRExC_state,
15957                                                 RExC_parse,
15958                                                 &not_posix_region_end,
15959                                                 NULL,
15960                                                 TRUE /* checking only */);
15961         if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
15962             SAVEFREESV(RExC_rx_sv);
15963             ckWARN4reg(not_posix_region_end,
15964                     "POSIX syntax [%c %c] belongs inside character classes%s",
15965                     *RExC_parse, *RExC_parse,
15966                     (maybe_class == OOB_NAMEDCLASS)
15967                     ? ((POSIXCC_NOTYET(*RExC_parse))
15968                         ? " (but this one isn't implemented)"
15969                         : " (but this one isn't fully valid)")
15970                     : ""
15971                     );
15972             (void)ReREFCNT_inc(RExC_rx_sv);
15973         }
15974     }
15975
15976     /* If the caller wants us to just parse a single element, accomplish this
15977      * by faking the loop ending condition */
15978     if (stop_at_1 && RExC_end > RExC_parse) {
15979         stop_ptr = RExC_parse + 1;
15980     }
15981
15982     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
15983     if (UCHARAT(RExC_parse) == ']')
15984         goto charclassloop;
15985
15986     while (1) {
15987
15988         if (   posix_warnings
15989             && av_tindex_skip_len_mg(posix_warnings) >= 0
15990             && RExC_parse > not_posix_region_end)
15991         {
15992             /* Warnings about posix class issues are considered tentative until
15993              * we are far enough along in the parse that we can no longer
15994              * change our mind, at which point we either output them or add
15995              * them, if it has so specified, to what gets returned to the
15996              * caller.  This is done each time through the loop so that a later
15997              * class won't zap them before they have been dealt with. */
15998             output_or_return_posix_warnings(pRExC_state, posix_warnings,
15999                                             return_posix_warnings);
16000         }
16001
16002         if  (RExC_parse >= stop_ptr) {
16003             break;
16004         }
16005
16006         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16007
16008         if  (UCHARAT(RExC_parse) == ']') {
16009             break;
16010         }
16011
16012       charclassloop:
16013
16014         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
16015         save_value = value;
16016         save_prevvalue = prevvalue;
16017
16018         if (!range) {
16019             rangebegin = RExC_parse;
16020             element_count++;
16021             non_portable_endpoint = 0;
16022         }
16023         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
16024             value = utf8n_to_uvchr((U8*)RExC_parse,
16025                                    RExC_end - RExC_parse,
16026                                    &numlen, UTF8_ALLOW_DEFAULT);
16027             RExC_parse += numlen;
16028         }
16029         else
16030             value = UCHARAT(RExC_parse++);
16031
16032         if (value == '[') {
16033             char * posix_class_end;
16034             namedclass = handle_possible_posix(pRExC_state,
16035                                                RExC_parse,
16036                                                &posix_class_end,
16037                                                do_posix_warnings ? &posix_warnings : NULL,
16038                                                FALSE    /* die if error */);
16039             if (namedclass > OOB_NAMEDCLASS) {
16040
16041                 /* If there was an earlier attempt to parse this particular
16042                  * posix class, and it failed, it was a false alarm, as this
16043                  * successful one proves */
16044                 if (   posix_warnings
16045                     && av_tindex_skip_len_mg(posix_warnings) >= 0
16046                     && not_posix_region_end >= RExC_parse
16047                     && not_posix_region_end <= posix_class_end)
16048                 {
16049                     av_undef(posix_warnings);
16050                 }
16051
16052                 RExC_parse = posix_class_end;
16053             }
16054             else if (namedclass == OOB_NAMEDCLASS) {
16055                 not_posix_region_end = posix_class_end;
16056             }
16057             else {
16058                 namedclass = OOB_NAMEDCLASS;
16059             }
16060         }
16061         else if (   RExC_parse - 1 > not_posix_region_end
16062                  && MAYBE_POSIXCC(value))
16063         {
16064             (void) handle_possible_posix(
16065                         pRExC_state,
16066                         RExC_parse - 1,  /* -1 because parse has already been
16067                                             advanced */
16068                         &not_posix_region_end,
16069                         do_posix_warnings ? &posix_warnings : NULL,
16070                         TRUE /* checking only */);
16071         }
16072         else if (value == '\\') {
16073             /* Is a backslash; get the code point of the char after it */
16074
16075             if (RExC_parse >= RExC_end) {
16076                 vFAIL("Unmatched [");
16077             }
16078
16079             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
16080                 value = utf8n_to_uvchr((U8*)RExC_parse,
16081                                    RExC_end - RExC_parse,
16082                                    &numlen, UTF8_ALLOW_DEFAULT);
16083                 RExC_parse += numlen;
16084             }
16085             else
16086                 value = UCHARAT(RExC_parse++);
16087
16088             /* Some compilers cannot handle switching on 64-bit integer
16089              * values, therefore value cannot be an UV.  Yes, this will
16090              * be a problem later if we want switch on Unicode.
16091              * A similar issue a little bit later when switching on
16092              * namedclass. --jhi */
16093
16094             /* If the \ is escaping white space when white space is being
16095              * skipped, it means that that white space is wanted literally, and
16096              * is already in 'value'.  Otherwise, need to translate the escape
16097              * into what it signifies. */
16098             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
16099
16100             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
16101             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
16102             case 's':   namedclass = ANYOF_SPACE;       break;
16103             case 'S':   namedclass = ANYOF_NSPACE;      break;
16104             case 'd':   namedclass = ANYOF_DIGIT;       break;
16105             case 'D':   namedclass = ANYOF_NDIGIT;      break;
16106             case 'v':   namedclass = ANYOF_VERTWS;      break;
16107             case 'V':   namedclass = ANYOF_NVERTWS;     break;
16108             case 'h':   namedclass = ANYOF_HORIZWS;     break;
16109             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
16110             case 'N':  /* Handle \N{NAME} in class */
16111                 {
16112                     const char * const backslash_N_beg = RExC_parse - 2;
16113                     int cp_count;
16114
16115                     if (! grok_bslash_N(pRExC_state,
16116                                         NULL,      /* No regnode */
16117                                         &value,    /* Yes single value */
16118                                         &cp_count, /* Multiple code pt count */
16119                                         flagp,
16120                                         strict,
16121                                         depth)
16122                     ) {
16123
16124                         if (*flagp & NEED_UTF8)
16125                             FAIL("panic: grok_bslash_N set NEED_UTF8");
16126                         if (*flagp & RESTART_PASS1)
16127                             return NULL;
16128
16129                         if (cp_count < 0) {
16130                             vFAIL("\\N in a character class must be a named character: \\N{...}");
16131                         }
16132                         else if (cp_count == 0) {
16133                             if (PASS2) {
16134                                 ckWARNreg(RExC_parse,
16135                                         "Ignoring zero length \\N{} in character class");
16136                             }
16137                         }
16138                         else { /* cp_count > 1 */
16139                             if (! RExC_in_multi_char_class) {
16140                                 if (invert || range || *RExC_parse == '-') {
16141                                     if (strict) {
16142                                         RExC_parse--;
16143                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
16144                                     }
16145                                     else if (PASS2) {
16146                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
16147                                     }
16148                                     break; /* <value> contains the first code
16149                                               point. Drop out of the switch to
16150                                               process it */
16151                                 }
16152                                 else {
16153                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
16154                                                  RExC_parse - backslash_N_beg);
16155                                     multi_char_matches
16156                                         = add_multi_match(multi_char_matches,
16157                                                           multi_char_N,
16158                                                           cp_count);
16159                                 }
16160                             }
16161                         } /* End of cp_count != 1 */
16162
16163                         /* This element should not be processed further in this
16164                          * class */
16165                         element_count--;
16166                         value = save_value;
16167                         prevvalue = save_prevvalue;
16168                         continue;   /* Back to top of loop to get next char */
16169                     }
16170
16171                     /* Here, is a single code point, and <value> contains it */
16172                     unicode_range = TRUE;   /* \N{} are Unicode */
16173                 }
16174                 break;
16175             case 'p':
16176             case 'P':
16177                 {
16178                 char *e;
16179
16180                 /* We will handle any undefined properties ourselves */
16181                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
16182                                        /* And we actually would prefer to get
16183                                         * the straight inversion list of the
16184                                         * swash, since we will be accessing it
16185                                         * anyway, to save a little time */
16186                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
16187
16188                 if (RExC_parse >= RExC_end)
16189                     vFAIL2("Empty \\%c", (U8)value);
16190                 if (*RExC_parse == '{') {
16191                     const U8 c = (U8)value;
16192                     e = strchr(RExC_parse, '}');
16193                     if (!e) {
16194                         RExC_parse++;
16195                         vFAIL2("Missing right brace on \\%c{}", c);
16196                     }
16197
16198                     RExC_parse++;
16199                     while (isSPACE(*RExC_parse)) {
16200                          RExC_parse++;
16201                     }
16202
16203                     if (UCHARAT(RExC_parse) == '^') {
16204
16205                         /* toggle.  (The rhs xor gets the single bit that
16206                          * differs between P and p; the other xor inverts just
16207                          * that bit) */
16208                         value ^= 'P' ^ 'p';
16209
16210                         RExC_parse++;
16211                         while (isSPACE(*RExC_parse)) {
16212                             RExC_parse++;
16213                         }
16214                     }
16215
16216                     if (e == RExC_parse)
16217                         vFAIL2("Empty \\%c{}", c);
16218
16219                     n = e - RExC_parse;
16220                     while (isSPACE(*(RExC_parse + n - 1)))
16221                         n--;
16222                 }   /* The \p isn't immediately followed by a '{' */
16223                 else if (! isALPHA(*RExC_parse)) {
16224                     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16225                     vFAIL2("Character following \\%c must be '{' or a "
16226                            "single-character Unicode property name",
16227                            (U8) value);
16228                 }
16229                 else {
16230                     e = RExC_parse;
16231                     n = 1;
16232                 }
16233                 if (!SIZE_ONLY) {
16234                     SV* invlist;
16235                     char* name;
16236                     char* base_name;    /* name after any packages are stripped */
16237                     char* lookup_name = NULL;
16238                     const char * const colon_colon = "::";
16239
16240                     /* Try to get the definition of the property into
16241                      * <invlist>.  If /i is in effect, the effective property
16242                      * will have its name be <__NAME_i>.  The design is
16243                      * discussed in commit
16244                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
16245                     name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
16246                     SAVEFREEPV(name);
16247                     if (FOLD) {
16248                         lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
16249
16250                         /* The function call just below that uses this can fail
16251                          * to return, leaking memory if we don't do this */
16252                         SAVEFREEPV(lookup_name);
16253                     }
16254
16255                     /* Look up the property name, and get its swash and
16256                      * inversion list, if the property is found  */
16257                     SvREFCNT_dec(swash); /* Free any left-overs */
16258                     swash = _core_swash_init("utf8",
16259                                              (lookup_name)
16260                                               ? lookup_name
16261                                               : name,
16262                                              &PL_sv_undef,
16263                                              1, /* binary */
16264                                              0, /* not tr/// */
16265                                              NULL, /* No inversion list */
16266                                              &swash_init_flags
16267                                             );
16268                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
16269                         HV* curpkg = (IN_PERL_COMPILETIME)
16270                                       ? PL_curstash
16271                                       : CopSTASH(PL_curcop);
16272                         UV final_n = n;
16273                         bool has_pkg;
16274
16275                         if (swash) {    /* Got a swash but no inversion list.
16276                                            Something is likely wrong that will
16277                                            be sorted-out later */
16278                             SvREFCNT_dec_NN(swash);
16279                             swash = NULL;
16280                         }
16281
16282                         /* Here didn't find it.  It could be a an error (like a
16283                          * typo) in specifying a Unicode property, or it could
16284                          * be a user-defined property that will be available at
16285                          * run-time.  The names of these must begin with 'In'
16286                          * or 'Is' (after any packages are stripped off).  So
16287                          * if not one of those, or if we accept only
16288                          * compile-time properties, is an error; otherwise add
16289                          * it to the list for run-time look up. */
16290                         if ((base_name = rninstr(name, name + n,
16291                                                  colon_colon, colon_colon + 2)))
16292                         { /* Has ::.  We know this must be a user-defined
16293                              property */
16294                             base_name += 2;
16295                             final_n -= base_name - name;
16296                             has_pkg = TRUE;
16297                         }
16298                         else {
16299                             base_name = name;
16300                             has_pkg = FALSE;
16301                         }
16302
16303                         if (   final_n < 3
16304                             || base_name[0] != 'I'
16305                             || (base_name[1] != 's' && base_name[1] != 'n')
16306                             || ret_invlist)
16307                         {
16308                             const char * const msg
16309                                 = (has_pkg)
16310                                   ? "Illegal user-defined property name"
16311                                   : "Can't find Unicode property definition";
16312                             RExC_parse = e + 1;
16313
16314                             /* diag_listed_as: Can't find Unicode property definition "%s" */
16315                             vFAIL3utf8f("%s \"%" UTF8f "\"",
16316                                 msg, UTF8fARG(UTF, n, name));
16317                         }
16318
16319                         /* If the property name doesn't already have a package
16320                          * name, add the current one to it so that it can be
16321                          * referred to outside it. [perl #121777] */
16322                         if (! has_pkg && curpkg) {
16323                             char* pkgname = HvNAME(curpkg);
16324                             if (strNE(pkgname, "main")) {
16325                                 char* full_name = Perl_form(aTHX_
16326                                                             "%s::%s",
16327                                                             pkgname,
16328                                                             name);
16329                                 n = strlen(full_name);
16330                                 name = savepvn(full_name, n);
16331                                 SAVEFREEPV(name);
16332                             }
16333                         }
16334                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n",
16335                                         (value == 'p' ? '+' : '!'),
16336                                         (FOLD) ? "__" : "",
16337                                         UTF8fARG(UTF, n, name),
16338                                         (FOLD) ? "_i" : "");
16339                         has_user_defined_property = TRUE;
16340                         optimizable = FALSE;    /* Will have to leave this an
16341                                                    ANYOF node */
16342
16343                         /* We don't know yet what this matches, so have to flag
16344                          * it */
16345                         ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
16346                     }
16347                     else {
16348
16349                         /* Here, did get the swash and its inversion list.  If
16350                          * the swash is from a user-defined property, then this
16351                          * whole character class should be regarded as such */
16352                         if (swash_init_flags
16353                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
16354                         {
16355                             has_user_defined_property = TRUE;
16356                         }
16357                         else if
16358                             /* We warn on matching an above-Unicode code point
16359                              * if the match would return true, except don't
16360                              * warn for \p{All}, which has exactly one element
16361                              * = 0 */
16362                             (_invlist_contains_cp(invlist, 0x110000)
16363                                 && (! (_invlist_len(invlist) == 1
16364                                        && *invlist_array(invlist) == 0)))
16365                         {
16366                             warn_super = TRUE;
16367                         }
16368
16369
16370                         /* Invert if asking for the complement */
16371                         if (value == 'P') {
16372                             _invlist_union_complement_2nd(properties,
16373                                                           invlist,
16374                                                           &properties);
16375
16376                             /* The swash can't be used as-is, because we've
16377                              * inverted things; delay removing it to here after
16378                              * have copied its invlist above */
16379                             SvREFCNT_dec_NN(swash);
16380                             swash = NULL;
16381                         }
16382                         else {
16383                             _invlist_union(properties, invlist, &properties);
16384                         }
16385                     }
16386                 }
16387                 RExC_parse = e + 1;
16388                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
16389                                                 named */
16390
16391                 /* \p means they want Unicode semantics */
16392                 REQUIRE_UNI_RULES(flagp, NULL);
16393                 }
16394                 break;
16395             case 'n':   value = '\n';                   break;
16396             case 'r':   value = '\r';                   break;
16397             case 't':   value = '\t';                   break;
16398             case 'f':   value = '\f';                   break;
16399             case 'b':   value = '\b';                   break;
16400             case 'e':   value = ESC_NATIVE;             break;
16401             case 'a':   value = '\a';                   break;
16402             case 'o':
16403                 RExC_parse--;   /* function expects to be pointed at the 'o' */
16404                 {
16405                     const char* error_msg;
16406                     bool valid = grok_bslash_o(&RExC_parse,
16407                                                &value,
16408                                                &error_msg,
16409                                                PASS2,   /* warnings only in
16410                                                            pass 2 */
16411                                                strict,
16412                                                silence_non_portable,
16413                                                UTF);
16414                     if (! valid) {
16415                         vFAIL(error_msg);
16416                     }
16417                 }
16418                 non_portable_endpoint++;
16419                 break;
16420             case 'x':
16421                 RExC_parse--;   /* function expects to be pointed at the 'x' */
16422                 {
16423                     const char* error_msg;
16424                     bool valid = grok_bslash_x(&RExC_parse,
16425                                                &value,
16426                                                &error_msg,
16427                                                PASS2, /* Output warnings */
16428                                                strict,
16429                                                silence_non_portable,
16430                                                UTF);
16431                     if (! valid) {
16432                         vFAIL(error_msg);
16433                     }
16434                 }
16435                 non_portable_endpoint++;
16436                 break;
16437             case 'c':
16438                 value = grok_bslash_c(*RExC_parse++, PASS2);
16439                 non_portable_endpoint++;
16440                 break;
16441             case '0': case '1': case '2': case '3': case '4':
16442             case '5': case '6': case '7':
16443                 {
16444                     /* Take 1-3 octal digits */
16445                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
16446                     numlen = (strict) ? 4 : 3;
16447                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
16448                     RExC_parse += numlen;
16449                     if (numlen != 3) {
16450                         if (strict) {
16451                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16452                             vFAIL("Need exactly 3 octal digits");
16453                         }
16454                         else if (! SIZE_ONLY /* like \08, \178 */
16455                                  && numlen < 3
16456                                  && RExC_parse < RExC_end
16457                                  && isDIGIT(*RExC_parse)
16458                                  && ckWARN(WARN_REGEXP))
16459                         {
16460                             SAVEFREESV(RExC_rx_sv);
16461                             reg_warn_non_literal_string(
16462                                  RExC_parse + 1,
16463                                  form_short_octal_warning(RExC_parse, numlen));
16464                             (void)ReREFCNT_inc(RExC_rx_sv);
16465                         }
16466                     }
16467                     non_portable_endpoint++;
16468                     break;
16469                 }
16470             default:
16471                 /* Allow \_ to not give an error */
16472                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
16473                     if (strict) {
16474                         vFAIL2("Unrecognized escape \\%c in character class",
16475                                (int)value);
16476                     }
16477                     else {
16478                         SAVEFREESV(RExC_rx_sv);
16479                         ckWARN2reg(RExC_parse,
16480                             "Unrecognized escape \\%c in character class passed through",
16481                             (int)value);
16482                         (void)ReREFCNT_inc(RExC_rx_sv);
16483                     }
16484                 }
16485                 break;
16486             }   /* End of switch on char following backslash */
16487         } /* end of handling backslash escape sequences */
16488
16489         /* Here, we have the current token in 'value' */
16490
16491         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
16492             U8 classnum;
16493
16494             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
16495              * literal, as is the character that began the false range, i.e.
16496              * the 'a' in the examples */
16497             if (range) {
16498                 if (!SIZE_ONLY) {
16499                     const int w = (RExC_parse >= rangebegin)
16500                                   ? RExC_parse - rangebegin
16501                                   : 0;
16502                     if (strict) {
16503                         vFAIL2utf8f(
16504                             "False [] range \"%" UTF8f "\"",
16505                             UTF8fARG(UTF, w, rangebegin));
16506                     }
16507                     else {
16508                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
16509                         ckWARN2reg(RExC_parse,
16510                             "False [] range \"%" UTF8f "\"",
16511                             UTF8fARG(UTF, w, rangebegin));
16512                         (void)ReREFCNT_inc(RExC_rx_sv);
16513                         cp_list = add_cp_to_invlist(cp_list, '-');
16514                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
16515                                                              prevvalue);
16516                     }
16517                 }
16518
16519                 range = 0; /* this was not a true range */
16520                 element_count += 2; /* So counts for three values */
16521             }
16522
16523             classnum = namedclass_to_classnum(namedclass);
16524
16525             if (LOC && namedclass < ANYOF_POSIXL_MAX
16526 #ifndef HAS_ISASCII
16527                 && classnum != _CC_ASCII
16528 #endif
16529             ) {
16530                 /* What the Posix classes (like \w, [:space:]) match in locale
16531                  * isn't knowable under locale until actual match time.  Room
16532                  * must be reserved (one time per outer bracketed class) to
16533                  * store such classes.  The space will contain a bit for each
16534                  * named class that is to be matched against.  This isn't
16535                  * needed for \p{} and pseudo-classes, as they are not affected
16536                  * by locale, and hence are dealt with separately */
16537                 if (! need_class) {
16538                     need_class = 1;
16539                     if (SIZE_ONLY) {
16540                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16541                     }
16542                     else {
16543                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16544                     }
16545                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
16546                     ANYOF_POSIXL_ZERO(ret);
16547
16548                     /* We can't change this into some other type of node
16549                      * (unless this is the only element, in which case there
16550                      * are nodes that mean exactly this) as has runtime
16551                      * dependencies */
16552                     optimizable = FALSE;
16553                 }
16554
16555                 /* Coverity thinks it is possible for this to be negative; both
16556                  * jhi and khw think it's not, but be safer */
16557                 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16558                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
16559
16560                 /* See if it already matches the complement of this POSIX
16561                  * class */
16562                 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16563                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
16564                                                             ? -1
16565                                                             : 1)))
16566                 {
16567                     posixl_matches_all = TRUE;
16568                     break;  /* No need to continue.  Since it matches both
16569                                e.g., \w and \W, it matches everything, and the
16570                                bracketed class can be optimized into qr/./s */
16571                 }
16572
16573                 /* Add this class to those that should be checked at runtime */
16574                 ANYOF_POSIXL_SET(ret, namedclass);
16575
16576                 /* The above-Latin1 characters are not subject to locale rules.
16577                  * Just add them, in the second pass, to the
16578                  * unconditionally-matched list */
16579                 if (! SIZE_ONLY) {
16580                     SV* scratch_list = NULL;
16581
16582                     /* Get the list of the above-Latin1 code points this
16583                      * matches */
16584                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
16585                                           PL_XPosix_ptrs[classnum],
16586
16587                                           /* Odd numbers are complements, like
16588                                            * NDIGIT, NASCII, ... */
16589                                           namedclass % 2 != 0,
16590                                           &scratch_list);
16591                     /* Checking if 'cp_list' is NULL first saves an extra
16592                      * clone.  Its reference count will be decremented at the
16593                      * next union, etc, or if this is the only instance, at the
16594                      * end of the routine */
16595                     if (! cp_list) {
16596                         cp_list = scratch_list;
16597                     }
16598                     else {
16599                         _invlist_union(cp_list, scratch_list, &cp_list);
16600                         SvREFCNT_dec_NN(scratch_list);
16601                     }
16602                     continue;   /* Go get next character */
16603                 }
16604             }
16605             else if (! SIZE_ONLY) {
16606
16607                 /* Here, not in pass1 (in that pass we skip calculating the
16608                  * contents of this class), and is not /l, or is a POSIX class
16609                  * for which /l doesn't matter (or is a Unicode property, which
16610                  * is skipped here). */
16611                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
16612                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
16613
16614                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
16615                          * nor /l make a difference in what these match,
16616                          * therefore we just add what they match to cp_list. */
16617                         if (classnum != _CC_VERTSPACE) {
16618                             assert(   namedclass == ANYOF_HORIZWS
16619                                    || namedclass == ANYOF_NHORIZWS);
16620
16621                             /* It turns out that \h is just a synonym for
16622                              * XPosixBlank */
16623                             classnum = _CC_BLANK;
16624                         }
16625
16626                         _invlist_union_maybe_complement_2nd(
16627                                 cp_list,
16628                                 PL_XPosix_ptrs[classnum],
16629                                 namedclass % 2 != 0,    /* Complement if odd
16630                                                           (NHORIZWS, NVERTWS)
16631                                                         */
16632                                 &cp_list);
16633                     }
16634                 }
16635                 else if (  UNI_SEMANTICS
16636                         || classnum == _CC_ASCII
16637                         || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
16638                                                   || classnum == _CC_XDIGIT)))
16639                 {
16640                     /* We usually have to worry about /d and /a affecting what
16641                      * POSIX classes match, with special code needed for /d
16642                      * because we won't know until runtime what all matches.
16643                      * But there is no extra work needed under /u, and
16644                      * [:ascii:] is unaffected by /a and /d; and :digit: and
16645                      * :xdigit: don't have runtime differences under /d.  So we
16646                      * can special case these, and avoid some extra work below,
16647                      * and at runtime. */
16648                     _invlist_union_maybe_complement_2nd(
16649                                                      simple_posixes,
16650                                                      PL_XPosix_ptrs[classnum],
16651                                                      namedclass % 2 != 0,
16652                                                      &simple_posixes);
16653                 }
16654                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
16655                            complement and use nposixes */
16656                     SV** posixes_ptr = namedclass % 2 == 0
16657                                        ? &posixes
16658                                        : &nposixes;
16659                     _invlist_union_maybe_complement_2nd(
16660                                                      *posixes_ptr,
16661                                                      PL_XPosix_ptrs[classnum],
16662                                                      namedclass % 2 != 0,
16663                                                      posixes_ptr);
16664                 }
16665             }
16666         } /* end of namedclass \blah */
16667
16668         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16669
16670         /* If 'range' is set, 'value' is the ending of a range--check its
16671          * validity.  (If value isn't a single code point in the case of a
16672          * range, we should have figured that out above in the code that
16673          * catches false ranges).  Later, we will handle each individual code
16674          * point in the range.  If 'range' isn't set, this could be the
16675          * beginning of a range, so check for that by looking ahead to see if
16676          * the next real character to be processed is the range indicator--the
16677          * minus sign */
16678
16679         if (range) {
16680 #ifdef EBCDIC
16681             /* For unicode ranges, we have to test that the Unicode as opposed
16682              * to the native values are not decreasing.  (Above 255, there is
16683              * no difference between native and Unicode) */
16684             if (unicode_range && prevvalue < 255 && value < 255) {
16685                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
16686                     goto backwards_range;
16687                 }
16688             }
16689             else
16690 #endif
16691             if (prevvalue > value) /* b-a */ {
16692                 int w;
16693 #ifdef EBCDIC
16694               backwards_range:
16695 #endif
16696                 w = RExC_parse - rangebegin;
16697                 vFAIL2utf8f(
16698                     "Invalid [] range \"%" UTF8f "\"",
16699                     UTF8fARG(UTF, w, rangebegin));
16700                 NOT_REACHED; /* NOTREACHED */
16701             }
16702         }
16703         else {
16704             prevvalue = value; /* save the beginning of the potential range */
16705             if (! stop_at_1     /* Can't be a range if parsing just one thing */
16706                 && *RExC_parse == '-')
16707             {
16708                 char* next_char_ptr = RExC_parse + 1;
16709
16710                 /* Get the next real char after the '-' */
16711                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
16712
16713                 /* If the '-' is at the end of the class (just before the ']',
16714                  * it is a literal minus; otherwise it is a range */
16715                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
16716                     RExC_parse = next_char_ptr;
16717
16718                     /* a bad range like \w-, [:word:]- ? */
16719                     if (namedclass > OOB_NAMEDCLASS) {
16720                         if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
16721                             const int w = RExC_parse >= rangebegin
16722                                           ?  RExC_parse - rangebegin
16723                                           : 0;
16724                             if (strict) {
16725                                 vFAIL4("False [] range \"%*.*s\"",
16726                                     w, w, rangebegin);
16727                             }
16728                             else if (PASS2) {
16729                                 vWARN4(RExC_parse,
16730                                     "False [] range \"%*.*s\"",
16731                                     w, w, rangebegin);
16732                             }
16733                         }
16734                         if (!SIZE_ONLY) {
16735                             cp_list = add_cp_to_invlist(cp_list, '-');
16736                         }
16737                         element_count++;
16738                     } else
16739                         range = 1;      /* yeah, it's a range! */
16740                     continue;   /* but do it the next time */
16741                 }
16742             }
16743         }
16744
16745         if (namedclass > OOB_NAMEDCLASS) {
16746             continue;
16747         }
16748
16749         /* Here, we have a single value this time through the loop, and
16750          * <prevvalue> is the beginning of the range, if any; or <value> if
16751          * not. */
16752
16753         /* non-Latin1 code point implies unicode semantics.  Must be set in
16754          * pass1 so is there for the whole of pass 2 */
16755         if (value > 255) {
16756             REQUIRE_UNI_RULES(flagp, NULL);
16757         }
16758
16759         /* Ready to process either the single value, or the completed range.
16760          * For single-valued non-inverted ranges, we consider the possibility
16761          * of multi-char folds.  (We made a conscious decision to not do this
16762          * for the other cases because it can often lead to non-intuitive
16763          * results.  For example, you have the peculiar case that:
16764          *  "s s" =~ /^[^\xDF]+$/i => Y
16765          *  "ss"  =~ /^[^\xDF]+$/i => N
16766          *
16767          * See [perl #89750] */
16768         if (FOLD && allow_multi_folds && value == prevvalue) {
16769             if (value == LATIN_SMALL_LETTER_SHARP_S
16770                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
16771                                                         value)))
16772             {
16773                 /* Here <value> is indeed a multi-char fold.  Get what it is */
16774
16775                 U8 foldbuf[UTF8_MAXBYTES_CASE];
16776                 STRLEN foldlen;
16777
16778                 UV folded = _to_uni_fold_flags(
16779                                 value,
16780                                 foldbuf,
16781                                 &foldlen,
16782                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
16783                                                    ? FOLD_FLAGS_NOMIX_ASCII
16784                                                    : 0)
16785                                 );
16786
16787                 /* Here, <folded> should be the first character of the
16788                  * multi-char fold of <value>, with <foldbuf> containing the
16789                  * whole thing.  But, if this fold is not allowed (because of
16790                  * the flags), <fold> will be the same as <value>, and should
16791                  * be processed like any other character, so skip the special
16792                  * handling */
16793                 if (folded != value) {
16794
16795                     /* Skip if we are recursed, currently parsing the class
16796                      * again.  Otherwise add this character to the list of
16797                      * multi-char folds. */
16798                     if (! RExC_in_multi_char_class) {
16799                         STRLEN cp_count = utf8_length(foldbuf,
16800                                                       foldbuf + foldlen);
16801                         SV* multi_fold = sv_2mortal(newSVpvs(""));
16802
16803                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
16804
16805                         multi_char_matches
16806                                         = add_multi_match(multi_char_matches,
16807                                                           multi_fold,
16808                                                           cp_count);
16809
16810                     }
16811
16812                     /* This element should not be processed further in this
16813                      * class */
16814                     element_count--;
16815                     value = save_value;
16816                     prevvalue = save_prevvalue;
16817                     continue;
16818                 }
16819             }
16820         }
16821
16822         if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
16823             if (range) {
16824
16825                 /* If the range starts above 255, everything is portable and
16826                  * likely to be so for any forseeable character set, so don't
16827                  * warn. */
16828                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
16829                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
16830                 }
16831                 else if (prevvalue != value) {
16832
16833                     /* Under strict, ranges that stop and/or end in an ASCII
16834                      * printable should have each end point be a portable value
16835                      * for it (preferably like 'A', but we don't warn if it is
16836                      * a (portable) Unicode name or code point), and the range
16837                      * must be be all digits or all letters of the same case.
16838                      * Otherwise, the range is non-portable and unclear as to
16839                      * what it contains */
16840                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
16841                         && (          non_portable_endpoint
16842                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
16843                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
16844                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
16845                     ))) {
16846                         vWARN(RExC_parse, "Ranges of ASCII printables should"
16847                                           " be some subset of \"0-9\","
16848                                           " \"A-Z\", or \"a-z\"");
16849                     }
16850                     else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
16851                         SSize_t index_start;
16852                         SSize_t index_final;
16853
16854                         /* But the nature of Unicode and languages mean we
16855                          * can't do the same checks for above-ASCII ranges,
16856                          * except in the case of digit ones.  These should
16857                          * contain only digits from the same group of 10.  The
16858                          * ASCII case is handled just above.  0x660 is the
16859                          * first digit character beyond ASCII.  Hence here, the
16860                          * range could be a range of digits.  First some
16861                          * unlikely special cases.  Grandfather in that a range
16862                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
16863                          * if its starting value is one of the 10 digits prior
16864                          * to it.  This is because it is an alternate way of
16865                          * writing 19D1, and some people may expect it to be in
16866                          * that group.  But it is bad, because it won't give
16867                          * the expected results.  In Unicode 5.2 it was
16868                          * considered to be in that group (of 11, hence), but
16869                          * this was fixed in the next version */
16870
16871                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
16872                             goto warn_bad_digit_range;
16873                         }
16874                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
16875                                           &&     value <= 0x1D7FF))
16876                         {
16877                             /* This is the only other case currently in Unicode
16878                              * where the algorithm below fails.  The code
16879                              * points just above are the end points of a single
16880                              * range containing only decimal digits.  It is 5
16881                              * different series of 0-9.  All other ranges of
16882                              * digits currently in Unicode are just a single
16883                              * series.  (And mktables will notify us if a later
16884                              * Unicode version breaks this.)
16885                              *
16886                              * If the range being checked is at most 9 long,
16887                              * and the digit values represented are in
16888                              * numerical order, they are from the same series.
16889                              * */
16890                             if (         value - prevvalue > 9
16891                                 ||    (((    value - 0x1D7CE) % 10)
16892                                      <= (prevvalue - 0x1D7CE) % 10))
16893                             {
16894                                 goto warn_bad_digit_range;
16895                             }
16896                         }
16897                         else {
16898
16899                             /* For all other ranges of digits in Unicode, the
16900                              * algorithm is just to check if both end points
16901                              * are in the same series, which is the same range.
16902                              * */
16903                             index_start = _invlist_search(
16904                                                     PL_XPosix_ptrs[_CC_DIGIT],
16905                                                     prevvalue);
16906
16907                             /* Warn if the range starts and ends with a digit,
16908                              * and they are not in the same group of 10. */
16909                             if (   index_start >= 0
16910                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
16911                                 && (index_final =
16912                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
16913                                                     value)) != index_start
16914                                 && index_final >= 0
16915                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
16916                             {
16917                               warn_bad_digit_range:
16918                                 vWARN(RExC_parse, "Ranges of digits should be"
16919                                                   " from the same group of"
16920                                                   " 10");
16921                             }
16922                         }
16923                     }
16924                 }
16925             }
16926             if ((! range || prevvalue == value) && non_portable_endpoint) {
16927                 if (isPRINT_A(value)) {
16928                     char literal[3];
16929                     unsigned d = 0;
16930                     if (isBACKSLASHED_PUNCT(value)) {
16931                         literal[d++] = '\\';
16932                     }
16933                     literal[d++] = (char) value;
16934                     literal[d++] = '\0';
16935
16936                     vWARN4dep(RExC_parse,
16937                            "\"%.*s\" is more clearly written simply as \"%s\". "
16938                            "This will be a fatal error in Perl 5.28",
16939                            (int) (RExC_parse - rangebegin),
16940                            rangebegin,
16941                            literal
16942                     );
16943                 }
16944                 else if isMNEMONIC_CNTRL(value) {
16945                     vWARN4dep(RExC_parse,
16946                            "\"%.*s\" is more clearly written simply as \"%s\". "
16947                            "This will be a fatal error in Perl 5.28",
16948                            (int) (RExC_parse - rangebegin),
16949                            rangebegin,
16950                            cntrl_to_mnemonic((U8) value)
16951                     );
16952                 }
16953             }
16954         }
16955
16956         /* Deal with this element of the class */
16957         if (! SIZE_ONLY) {
16958
16959 #ifndef EBCDIC
16960             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16961                                                      prevvalue, value);
16962 #else
16963             /* On non-ASCII platforms, for ranges that span all of 0..255, and
16964              * ones that don't require special handling, we can just add the
16965              * range like we do for ASCII platforms */
16966             if ((UNLIKELY(prevvalue == 0) && value >= 255)
16967                 || ! (prevvalue < 256
16968                       && (unicode_range
16969                           || (! non_portable_endpoint
16970                               && ((isLOWER_A(prevvalue) && isLOWER_A(value))
16971                                   || (isUPPER_A(prevvalue)
16972                                       && isUPPER_A(value)))))))
16973             {
16974                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16975                                                          prevvalue, value);
16976             }
16977             else {
16978                 /* Here, requires special handling.  This can be because it is
16979                  * a range whose code points are considered to be Unicode, and
16980                  * so must be individually translated into native, or because
16981                  * its a subrange of 'A-Z' or 'a-z' which each aren't
16982                  * contiguous in EBCDIC, but we have defined them to include
16983                  * only the "expected" upper or lower case ASCII alphabetics.
16984                  * Subranges above 255 are the same in native and Unicode, so
16985                  * can be added as a range */
16986                 U8 start = NATIVE_TO_LATIN1(prevvalue);
16987                 unsigned j;
16988                 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
16989                 for (j = start; j <= end; j++) {
16990                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
16991                 }
16992                 if (value > 255) {
16993                     cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16994                                                              256, value);
16995                 }
16996             }
16997 #endif
16998         }
16999
17000         range = 0; /* this range (if it was one) is done now */
17001     } /* End of loop through all the text within the brackets */
17002
17003
17004     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
17005         output_or_return_posix_warnings(pRExC_state, posix_warnings,
17006                                         return_posix_warnings);
17007     }
17008
17009     /* If anything in the class expands to more than one character, we have to
17010      * deal with them by building up a substitute parse string, and recursively
17011      * calling reg() on it, instead of proceeding */
17012     if (multi_char_matches) {
17013         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
17014         I32 cp_count;
17015         STRLEN len;
17016         char *save_end = RExC_end;
17017         char *save_parse = RExC_parse;
17018         char *save_start = RExC_start;
17019         STRLEN prefix_end = 0;      /* We copy the character class after a
17020                                        prefix supplied here.  This is the size
17021                                        + 1 of that prefix */
17022         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
17023                                        a "|" */
17024         I32 reg_flags;
17025
17026         assert(! invert);
17027         assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
17028
17029 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
17030            because too confusing */
17031         if (invert) {
17032             sv_catpv(substitute_parse, "(?:");
17033         }
17034 #endif
17035
17036         /* Look at the longest folds first */
17037         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
17038                         cp_count > 0;
17039                         cp_count--)
17040         {
17041
17042             if (av_exists(multi_char_matches, cp_count)) {
17043                 AV** this_array_ptr;
17044                 SV* this_sequence;
17045
17046                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
17047                                                  cp_count, FALSE);
17048                 while ((this_sequence = av_pop(*this_array_ptr)) !=
17049                                                                 &PL_sv_undef)
17050                 {
17051                     if (! first_time) {
17052                         sv_catpv(substitute_parse, "|");
17053                     }
17054                     first_time = FALSE;
17055
17056                     sv_catpv(substitute_parse, SvPVX(this_sequence));
17057                 }
17058             }
17059         }
17060
17061         /* If the character class contains anything else besides these
17062          * multi-character folds, have to include it in recursive parsing */
17063         if (element_count) {
17064             sv_catpv(substitute_parse, "|[");
17065             prefix_end = SvCUR(substitute_parse);
17066             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
17067
17068             /* Put in a closing ']' only if not going off the end, as otherwise
17069              * we are adding something that really isn't there */
17070             if (RExC_parse < RExC_end) {
17071                 sv_catpv(substitute_parse, "]");
17072             }
17073         }
17074
17075         sv_catpv(substitute_parse, ")");
17076 #if 0
17077         if (invert) {
17078             /* This is a way to get the parse to skip forward a whole named
17079              * sequence instead of matching the 2nd character when it fails the
17080              * first */
17081             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
17082         }
17083 #endif
17084
17085         /* Set up the data structure so that any errors will be properly
17086          * reported.  See the comments at the definition of
17087          * REPORT_LOCATION_ARGS for details */
17088         RExC_precomp_adj = orig_parse - RExC_precomp;
17089         RExC_start =  RExC_parse = SvPV(substitute_parse, len);
17090         RExC_adjusted_start = RExC_start + prefix_end;
17091         RExC_end = RExC_parse + len;
17092         RExC_in_multi_char_class = 1;
17093         RExC_emit = (regnode *)orig_emit;
17094
17095         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
17096
17097         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
17098
17099         /* And restore so can parse the rest of the pattern */
17100         RExC_parse = save_parse;
17101         RExC_start = RExC_adjusted_start = save_start;
17102         RExC_precomp_adj = 0;
17103         RExC_end = save_end;
17104         RExC_in_multi_char_class = 0;
17105         SvREFCNT_dec_NN(multi_char_matches);
17106         return ret;
17107     }
17108
17109     /* Here, we've gone through the entire class and dealt with multi-char
17110      * folds.  We are now in a position that we can do some checks to see if we
17111      * can optimize this ANYOF node into a simpler one, even in Pass 1.
17112      * Currently we only do two checks:
17113      * 1) is in the unlikely event that the user has specified both, eg. \w and
17114      *    \W under /l, then the class matches everything.  (This optimization
17115      *    is done only to make the optimizer code run later work.)
17116      * 2) if the character class contains only a single element (including a
17117      *    single range), we see if there is an equivalent node for it.
17118      * Other checks are possible */
17119     if (   optimizable
17120         && ! ret_invlist   /* Can't optimize if returning the constructed
17121                               inversion list */
17122         && (UNLIKELY(posixl_matches_all) || element_count == 1))
17123     {
17124         U8 op = END;
17125         U8 arg = 0;
17126
17127         if (UNLIKELY(posixl_matches_all)) {
17128             op = SANY;
17129         }
17130         else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named
17131                                                    class, like \w or [:digit:]
17132                                                    or \p{foo} */
17133
17134             /* All named classes are mapped into POSIXish nodes, with its FLAG
17135              * argument giving which class it is */
17136             switch ((I32)namedclass) {
17137                 case ANYOF_UNIPROP:
17138                     break;
17139
17140                 /* These don't depend on the charset modifiers.  They always
17141                  * match under /u rules */
17142                 case ANYOF_NHORIZWS:
17143                 case ANYOF_HORIZWS:
17144                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
17145                     /* FALLTHROUGH */
17146
17147                 case ANYOF_NVERTWS:
17148                 case ANYOF_VERTWS:
17149                     op = POSIXU;
17150                     goto join_posix;
17151
17152                 /* The actual POSIXish node for all the rest depends on the
17153                  * charset modifier.  The ones in the first set depend only on
17154                  * ASCII or, if available on this platform, also locale */
17155                 case ANYOF_ASCII:
17156                 case ANYOF_NASCII:
17157 #ifdef HAS_ISASCII
17158                     op = (LOC) ? POSIXL : POSIXA;
17159 #else
17160                     op = POSIXA;
17161 #endif
17162                     goto join_posix;
17163
17164                 /* The following don't have any matches in the upper Latin1
17165                  * range, hence /d is equivalent to /u for them.  Making it /u
17166                  * saves some branches at runtime */
17167                 case ANYOF_DIGIT:
17168                 case ANYOF_NDIGIT:
17169                 case ANYOF_XDIGIT:
17170                 case ANYOF_NXDIGIT:
17171                     if (! DEPENDS_SEMANTICS) {
17172                         goto treat_as_default;
17173                     }
17174
17175                     op = POSIXU;
17176                     goto join_posix;
17177
17178                 /* The following change to CASED under /i */
17179                 case ANYOF_LOWER:
17180                 case ANYOF_NLOWER:
17181                 case ANYOF_UPPER:
17182                 case ANYOF_NUPPER:
17183                     if (FOLD) {
17184                         namedclass = ANYOF_CASED + (namedclass % 2);
17185                     }
17186                     /* FALLTHROUGH */
17187
17188                 /* The rest have more possibilities depending on the charset.
17189                  * We take advantage of the enum ordering of the charset
17190                  * modifiers to get the exact node type, */
17191                 default:
17192                   treat_as_default:
17193                     op = POSIXD + get_regex_charset(RExC_flags);
17194                     if (op > POSIXA) { /* /aa is same as /a */
17195                         op = POSIXA;
17196                     }
17197
17198                   join_posix:
17199                     /* The odd numbered ones are the complements of the
17200                      * next-lower even number one */
17201                     if (namedclass % 2 == 1) {
17202                         invert = ! invert;
17203                         namedclass--;
17204                     }
17205                     arg = namedclass_to_classnum(namedclass);
17206                     break;
17207             }
17208         }
17209         else if (value == prevvalue) {
17210
17211             /* Here, the class consists of just a single code point */
17212
17213             if (invert) {
17214                 if (! LOC && value == '\n') {
17215                     op = REG_ANY; /* Optimize [^\n] */
17216                     *flagp |= HASWIDTH|SIMPLE;
17217                     MARK_NAUGHTY(1);
17218                 }
17219             }
17220             else if (value < 256 || UTF) {
17221
17222                 /* Optimize a single value into an EXACTish node, but not if it
17223                  * would require converting the pattern to UTF-8. */
17224                 op = compute_EXACTish(pRExC_state);
17225             }
17226         } /* Otherwise is a range */
17227         else if (! LOC) {   /* locale could vary these */
17228             if (prevvalue == '0') {
17229                 if (value == '9') {
17230                     arg = _CC_DIGIT;
17231                     op = POSIXA;
17232                 }
17233             }
17234             else if (! FOLD || ASCII_FOLD_RESTRICTED) {
17235                 /* We can optimize A-Z or a-z, but not if they could match
17236                  * something like the KELVIN SIGN under /i. */
17237                 if (prevvalue == 'A') {
17238                     if (value == 'Z'
17239 #ifdef EBCDIC
17240                         && ! non_portable_endpoint
17241 #endif
17242                     ) {
17243                         arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
17244                         op = POSIXA;
17245                     }
17246                 }
17247                 else if (prevvalue == 'a') {
17248                     if (value == 'z'
17249 #ifdef EBCDIC
17250                         && ! non_portable_endpoint
17251 #endif
17252                     ) {
17253                         arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
17254                         op = POSIXA;
17255                     }
17256                 }
17257             }
17258         }
17259
17260         /* Here, we have changed <op> away from its initial value iff we found
17261          * an optimization */
17262         if (op != END) {
17263
17264             /* Throw away this ANYOF regnode, and emit the calculated one,
17265              * which should correspond to the beginning, not current, state of
17266              * the parse */
17267             const char * cur_parse = RExC_parse;
17268             RExC_parse = (char *)orig_parse;
17269             if ( SIZE_ONLY) {
17270                 if (! LOC) {
17271
17272                     /* To get locale nodes to not use the full ANYOF size would
17273                      * require moving the code above that writes the portions
17274                      * of it that aren't in other nodes to after this point.
17275                      * e.g.  ANYOF_POSIXL_SET */
17276                     RExC_size = orig_size;
17277                 }
17278             }
17279             else {
17280                 RExC_emit = (regnode *)orig_emit;
17281                 if (PL_regkind[op] == POSIXD) {
17282                     if (op == POSIXL) {
17283                         RExC_contains_locale = 1;
17284                     }
17285                     if (invert) {
17286                         op += NPOSIXD - POSIXD;
17287                     }
17288                 }
17289             }
17290
17291             ret = reg_node(pRExC_state, op);
17292
17293             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
17294                 if (! SIZE_ONLY) {
17295                     FLAGS(ret) = arg;
17296                 }
17297                 *flagp |= HASWIDTH|SIMPLE;
17298             }
17299             else if (PL_regkind[op] == EXACT) {
17300                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
17301                                            TRUE /* downgradable to EXACT */
17302                                            );
17303             }
17304
17305             RExC_parse = (char *) cur_parse;
17306
17307             SvREFCNT_dec(posixes);
17308             SvREFCNT_dec(nposixes);
17309             SvREFCNT_dec(simple_posixes);
17310             SvREFCNT_dec(cp_list);
17311             SvREFCNT_dec(cp_foldable_list);
17312             return ret;
17313         }
17314     }
17315
17316     if (SIZE_ONLY)
17317         return ret;
17318     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
17319
17320     /* If folding, we calculate all characters that could fold to or from the
17321      * ones already on the list */
17322     if (cp_foldable_list) {
17323         if (FOLD) {
17324             UV start, end;      /* End points of code point ranges */
17325
17326             SV* fold_intersection = NULL;
17327             SV** use_list;
17328
17329             /* Our calculated list will be for Unicode rules.  For locale
17330              * matching, we have to keep a separate list that is consulted at
17331              * runtime only when the locale indicates Unicode rules.  For
17332              * non-locale, we just use the general list */
17333             if (LOC) {
17334                 use_list = &only_utf8_locale_list;
17335             }
17336             else {
17337                 use_list = &cp_list;
17338             }
17339
17340             /* Only the characters in this class that participate in folds need
17341              * be checked.  Get the intersection of this class and all the
17342              * possible characters that are foldable.  This can quickly narrow
17343              * down a large class */
17344             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
17345                                   &fold_intersection);
17346
17347             /* The folds for all the Latin1 characters are hard-coded into this
17348              * program, but we have to go out to disk to get the others. */
17349             if (invlist_highest(cp_foldable_list) >= 256) {
17350
17351                 /* This is a hash that for a particular fold gives all
17352                  * characters that are involved in it */
17353                 if (! PL_utf8_foldclosures) {
17354                     _load_PL_utf8_foldclosures();
17355                 }
17356             }
17357
17358             /* Now look at the foldable characters in this class individually */
17359             invlist_iterinit(fold_intersection);
17360             while (invlist_iternext(fold_intersection, &start, &end)) {
17361                 UV j;
17362
17363                 /* Look at every character in the range */
17364                 for (j = start; j <= end; j++) {
17365                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17366                     STRLEN foldlen;
17367                     SV** listp;
17368
17369                     if (j < 256) {
17370
17371                         if (IS_IN_SOME_FOLD_L1(j)) {
17372
17373                             /* ASCII is always matched; non-ASCII is matched
17374                              * only under Unicode rules (which could happen
17375                              * under /l if the locale is a UTF-8 one */
17376                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
17377                                 *use_list = add_cp_to_invlist(*use_list,
17378                                                             PL_fold_latin1[j]);
17379                             }
17380                             else {
17381                                 has_upper_latin1_only_utf8_matches
17382                                     = add_cp_to_invlist(
17383                                             has_upper_latin1_only_utf8_matches,
17384                                             PL_fold_latin1[j]);
17385                             }
17386                         }
17387
17388                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
17389                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
17390                         {
17391                             add_above_Latin1_folds(pRExC_state,
17392                                                    (U8) j,
17393                                                    use_list);
17394                         }
17395                         continue;
17396                     }
17397
17398                     /* Here is an above Latin1 character.  We don't have the
17399                      * rules hard-coded for it.  First, get its fold.  This is
17400                      * the simple fold, as the multi-character folds have been
17401                      * handled earlier and separated out */
17402                     _to_uni_fold_flags(j, foldbuf, &foldlen,
17403                                                         (ASCII_FOLD_RESTRICTED)
17404                                                         ? FOLD_FLAGS_NOMIX_ASCII
17405                                                         : 0);
17406
17407                     /* Single character fold of above Latin1.  Add everything in
17408                     * its fold closure to the list that this node should match.
17409                     * The fold closures data structure is a hash with the keys
17410                     * being the UTF-8 of every character that is folded to, like
17411                     * 'k', and the values each an array of all code points that
17412                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
17413                     * Multi-character folds are not included */
17414                     if ((listp = hv_fetch(PL_utf8_foldclosures,
17415                                         (char *) foldbuf, foldlen, FALSE)))
17416                     {
17417                         AV* list = (AV*) *listp;
17418                         IV k;
17419                         for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
17420                             SV** c_p = av_fetch(list, k, FALSE);
17421                             UV c;
17422                             assert(c_p);
17423
17424                             c = SvUV(*c_p);
17425
17426                             /* /aa doesn't allow folds between ASCII and non- */
17427                             if ((ASCII_FOLD_RESTRICTED
17428                                 && (isASCII(c) != isASCII(j))))
17429                             {
17430                                 continue;
17431                             }
17432
17433                             /* Folds under /l which cross the 255/256 boundary
17434                              * are added to a separate list.  (These are valid
17435                              * only when the locale is UTF-8.) */
17436                             if (c < 256 && LOC) {
17437                                 *use_list = add_cp_to_invlist(*use_list, c);
17438                                 continue;
17439                             }
17440
17441                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
17442                             {
17443                                 cp_list = add_cp_to_invlist(cp_list, c);
17444                             }
17445                             else {
17446                                 /* Similarly folds involving non-ascii Latin1
17447                                 * characters under /d are added to their list */
17448                                 has_upper_latin1_only_utf8_matches
17449                                         = add_cp_to_invlist(
17450                                            has_upper_latin1_only_utf8_matches,
17451                                            c);
17452                             }
17453                         }
17454                     }
17455                 }
17456             }
17457             SvREFCNT_dec_NN(fold_intersection);
17458         }
17459
17460         /* Now that we have finished adding all the folds, there is no reason
17461          * to keep the foldable list separate */
17462         _invlist_union(cp_list, cp_foldable_list, &cp_list);
17463         SvREFCNT_dec_NN(cp_foldable_list);
17464     }
17465
17466     /* And combine the result (if any) with any inversion lists from posix
17467      * classes.  The lists are kept separate up to now because we don't want to
17468      * fold the classes (folding of those is automatically handled by the swash
17469      * fetching code) */
17470     if (simple_posixes) {   /* These are the classes known to be unaffected by
17471                                /a, /aa, and /d */
17472         if (cp_list) {
17473             _invlist_union(cp_list, simple_posixes, &cp_list);
17474             SvREFCNT_dec_NN(simple_posixes);
17475         }
17476         else {
17477             cp_list = simple_posixes;
17478         }
17479     }
17480     if (posixes || nposixes) {
17481
17482         /* We have to adjust /a and /aa */
17483         if (AT_LEAST_ASCII_RESTRICTED) {
17484
17485             /* Under /a and /aa, nothing above ASCII matches these */
17486             if (posixes) {
17487                 _invlist_intersection(posixes,
17488                                     PL_XPosix_ptrs[_CC_ASCII],
17489                                     &posixes);
17490             }
17491
17492             /* Under /a and /aa, everything above ASCII matches these
17493              * complements */
17494             if (nposixes) {
17495                 _invlist_union_complement_2nd(nposixes,
17496                                               PL_XPosix_ptrs[_CC_ASCII],
17497                                               &nposixes);
17498             }
17499         }
17500
17501         if (! DEPENDS_SEMANTICS) {
17502
17503             /* For everything but /d, we can just add the current 'posixes' and
17504              * 'nposixes' to the main list */
17505             if (posixes) {
17506                 if (cp_list) {
17507                     _invlist_union(cp_list, posixes, &cp_list);
17508                     SvREFCNT_dec_NN(posixes);
17509                 }
17510                 else {
17511                     cp_list = posixes;
17512                 }
17513             }
17514             if (nposixes) {
17515                 if (cp_list) {
17516                     _invlist_union(cp_list, nposixes, &cp_list);
17517                     SvREFCNT_dec_NN(nposixes);
17518                 }
17519                 else {
17520                     cp_list = nposixes;
17521                 }
17522             }
17523         }
17524         else {
17525             /* Under /d, things like \w match upper Latin1 characters only if
17526              * the target string is in UTF-8.  But things like \W match all the
17527              * upper Latin1 characters if the target string is not in UTF-8.
17528              *
17529              * Handle the case where there something like \W separately */
17530             if (nposixes) {
17531                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
17532
17533                 /* A complemented posix class matches all upper Latin1
17534                  * characters if not in UTF-8.  And it matches just certain
17535                  * ones when in UTF-8.  That means those certain ones are
17536                  * matched regardless, so can just be added to the
17537                  * unconditional list */
17538                 if (cp_list) {
17539                     _invlist_union(cp_list, nposixes, &cp_list);
17540                     SvREFCNT_dec_NN(nposixes);
17541                     nposixes = NULL;
17542                 }
17543                 else {
17544                     cp_list = nposixes;
17545                 }
17546
17547                 /* Likewise for 'posixes' */
17548                 _invlist_union(posixes, cp_list, &cp_list);
17549
17550                 /* Likewise for anything else in the range that matched only
17551                  * under UTF-8 */
17552                 if (has_upper_latin1_only_utf8_matches) {
17553                     _invlist_union(cp_list,
17554                                    has_upper_latin1_only_utf8_matches,
17555                                    &cp_list);
17556                     SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17557                     has_upper_latin1_only_utf8_matches = NULL;
17558                 }
17559
17560                 /* If we don't match all the upper Latin1 characters regardless
17561                  * of UTF-8ness, we have to set a flag to match the rest when
17562                  * not in UTF-8 */
17563                 _invlist_subtract(only_non_utf8_list, cp_list,
17564                                   &only_non_utf8_list);
17565                 if (_invlist_len(only_non_utf8_list) != 0) {
17566                     ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17567                 }
17568             }
17569             else {
17570                 /* Here there were no complemented posix classes.  That means
17571                  * the upper Latin1 characters in 'posixes' match only when the
17572                  * target string is in UTF-8.  So we have to add them to the
17573                  * list of those types of code points, while adding the
17574                  * remainder to the unconditional list.
17575                  *
17576                  * First calculate what they are */
17577                 SV* nonascii_but_latin1_properties = NULL;
17578                 _invlist_intersection(posixes, PL_UpperLatin1,
17579                                       &nonascii_but_latin1_properties);
17580
17581                 /* And add them to the final list of such characters. */
17582                 _invlist_union(has_upper_latin1_only_utf8_matches,
17583                                nonascii_but_latin1_properties,
17584                                &has_upper_latin1_only_utf8_matches);
17585
17586                 /* Remove them from what now becomes the unconditional list */
17587                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
17588                                   &posixes);
17589
17590                 /* And add those unconditional ones to the final list */
17591                 if (cp_list) {
17592                     _invlist_union(cp_list, posixes, &cp_list);
17593                     SvREFCNT_dec_NN(posixes);
17594                     posixes = NULL;
17595                 }
17596                 else {
17597                     cp_list = posixes;
17598                 }
17599
17600                 SvREFCNT_dec(nonascii_but_latin1_properties);
17601
17602                 /* Get rid of any characters that we now know are matched
17603                  * unconditionally from the conditional list, which may make
17604                  * that list empty */
17605                 _invlist_subtract(has_upper_latin1_only_utf8_matches,
17606                                   cp_list,
17607                                   &has_upper_latin1_only_utf8_matches);
17608                 if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
17609                     SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17610                     has_upper_latin1_only_utf8_matches = NULL;
17611                 }
17612             }
17613         }
17614     }
17615
17616     /* And combine the result (if any) with any inversion list from properties.
17617      * The lists are kept separate up to now so that we can distinguish the two
17618      * in regards to matching above-Unicode.  A run-time warning is generated
17619      * if a Unicode property is matched against a non-Unicode code point. But,
17620      * we allow user-defined properties to match anything, without any warning,
17621      * and we also suppress the warning if there is a portion of the character
17622      * class that isn't a Unicode property, and which matches above Unicode, \W
17623      * or [\x{110000}] for example.
17624      * (Note that in this case, unlike the Posix one above, there is no
17625      * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
17626      * forces Unicode semantics */
17627     if (properties) {
17628         if (cp_list) {
17629
17630             /* If it matters to the final outcome, see if a non-property
17631              * component of the class matches above Unicode.  If so, the
17632              * warning gets suppressed.  This is true even if just a single
17633              * such code point is specified, as, though not strictly correct if
17634              * another such code point is matched against, the fact that they
17635              * are using above-Unicode code points indicates they should know
17636              * the issues involved */
17637             if (warn_super) {
17638                 warn_super = ! (invert
17639                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
17640             }
17641
17642             _invlist_union(properties, cp_list, &cp_list);
17643             SvREFCNT_dec_NN(properties);
17644         }
17645         else {
17646             cp_list = properties;
17647         }
17648
17649         if (warn_super) {
17650             ANYOF_FLAGS(ret)
17651              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17652
17653             /* Because an ANYOF node is the only one that warns, this node
17654              * can't be optimized into something else */
17655             optimizable = FALSE;
17656         }
17657     }
17658
17659     /* Here, we have calculated what code points should be in the character
17660      * class.
17661      *
17662      * Now we can see about various optimizations.  Fold calculation (which we
17663      * did above) needs to take place before inversion.  Otherwise /[^k]/i
17664      * would invert to include K, which under /i would match k, which it
17665      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
17666      * folded until runtime */
17667
17668     /* If we didn't do folding, it's because some information isn't available
17669      * until runtime; set the run-time fold flag for these.  (We don't have to
17670      * worry about properties folding, as that is taken care of by the swash
17671      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
17672      * locales, or the class matches at least one 0-255 range code point */
17673     if (LOC && FOLD) {
17674
17675         /* Some things on the list might be unconditionally included because of
17676          * other components.  Remove them, and clean up the list if it goes to
17677          * 0 elements */
17678         if (only_utf8_locale_list && cp_list) {
17679             _invlist_subtract(only_utf8_locale_list, cp_list,
17680                               &only_utf8_locale_list);
17681
17682             if (_invlist_len(only_utf8_locale_list) == 0) {
17683                 SvREFCNT_dec_NN(only_utf8_locale_list);
17684                 only_utf8_locale_list = NULL;
17685             }
17686         }
17687         if (only_utf8_locale_list) {
17688             ANYOF_FLAGS(ret)
17689                  |=  ANYOFL_FOLD
17690                     |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
17691         }
17692         else if (cp_list) { /* Look to see if a 0-255 code point is in list */
17693             UV start, end;
17694             invlist_iterinit(cp_list);
17695             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
17696                 ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
17697             }
17698             invlist_iterfinish(cp_list);
17699         }
17700     }
17701     else if (   DEPENDS_SEMANTICS
17702              && (    has_upper_latin1_only_utf8_matches
17703                  || (ANYOF_FLAGS(ret) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
17704     {
17705         OP(ret) = ANYOFD;
17706         optimizable = FALSE;
17707     }
17708
17709
17710     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
17711      * at compile time.  Besides not inverting folded locale now, we can't
17712      * invert if there are things such as \w, which aren't known until runtime
17713      * */
17714     if (cp_list
17715         && invert
17716         && OP(ret) != ANYOFD
17717         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
17718         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
17719     {
17720         _invlist_invert(cp_list);
17721
17722         /* Any swash can't be used as-is, because we've inverted things */
17723         if (swash) {
17724             SvREFCNT_dec_NN(swash);
17725             swash = NULL;
17726         }
17727
17728         /* Clear the invert flag since have just done it here */
17729         invert = FALSE;
17730     }
17731
17732     if (ret_invlist) {
17733         assert(cp_list);
17734
17735         *ret_invlist = cp_list;
17736         SvREFCNT_dec(swash);
17737
17738         /* Discard the generated node */
17739         if (SIZE_ONLY) {
17740             RExC_size = orig_size;
17741         }
17742         else {
17743             RExC_emit = orig_emit;
17744         }
17745         return orig_emit;
17746     }
17747
17748     /* Some character classes are equivalent to other nodes.  Such nodes take
17749      * up less room and generally fewer operations to execute than ANYOF nodes.
17750      * Above, we checked for and optimized into some such equivalents for
17751      * certain common classes that are easy to test.  Getting to this point in
17752      * the code means that the class didn't get optimized there.  Since this
17753      * code is only executed in Pass 2, it is too late to save space--it has
17754      * been allocated in Pass 1, and currently isn't given back.  But turning
17755      * things into an EXACTish node can allow the optimizer to join it to any
17756      * adjacent such nodes.  And if the class is equivalent to things like /./,
17757      * expensive run-time swashes can be avoided.  Now that we have more
17758      * complete information, we can find things necessarily missed by the
17759      * earlier code.  Another possible "optimization" that isn't done is that
17760      * something like [Ee] could be changed into an EXACTFU.  khw tried this
17761      * and found that the ANYOF is faster, including for code points not in the
17762      * bitmap.  This still might make sense to do, provided it got joined with
17763      * an adjacent node(s) to create a longer EXACTFU one.  This could be
17764      * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
17765      * routine would know is joinable.  If that didn't happen, the node type
17766      * could then be made a straight ANYOF */
17767
17768     if (optimizable && cp_list && ! invert) {
17769         UV start, end;
17770         U8 op = END;  /* The optimzation node-type */
17771         int posix_class = -1;   /* Illegal value */
17772         const char * cur_parse= RExC_parse;
17773
17774         invlist_iterinit(cp_list);
17775         if (! invlist_iternext(cp_list, &start, &end)) {
17776
17777             /* Here, the list is empty.  This happens, for example, when a
17778              * Unicode property that doesn't match anything is the only element
17779              * in the character class (perluniprops.pod notes such properties).
17780              * */
17781             op = OPFAIL;
17782             *flagp |= HASWIDTH|SIMPLE;
17783         }
17784         else if (start == end) {    /* The range is a single code point */
17785             if (! invlist_iternext(cp_list, &start, &end)
17786
17787                     /* Don't do this optimization if it would require changing
17788                      * the pattern to UTF-8 */
17789                 && (start < 256 || UTF))
17790             {
17791                 /* Here, the list contains a single code point.  Can optimize
17792                  * into an EXACTish node */
17793
17794                 value = start;
17795
17796                 if (! FOLD) {
17797                     op = (LOC)
17798                          ? EXACTL
17799                          : EXACT;
17800                 }
17801                 else if (LOC) {
17802
17803                     /* A locale node under folding with one code point can be
17804                      * an EXACTFL, as its fold won't be calculated until
17805                      * runtime */
17806                     op = EXACTFL;
17807                 }
17808                 else {
17809
17810                     /* Here, we are generally folding, but there is only one
17811                      * code point to match.  If we have to, we use an EXACT
17812                      * node, but it would be better for joining with adjacent
17813                      * nodes in the optimization pass if we used the same
17814                      * EXACTFish node that any such are likely to be.  We can
17815                      * do this iff the code point doesn't participate in any
17816                      * folds.  For example, an EXACTF of a colon is the same as
17817                      * an EXACT one, since nothing folds to or from a colon. */
17818                     if (value < 256) {
17819                         if (IS_IN_SOME_FOLD_L1(value)) {
17820                             op = EXACT;
17821                         }
17822                     }
17823                     else {
17824                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
17825                             op = EXACT;
17826                         }
17827                     }
17828
17829                     /* If we haven't found the node type, above, it means we
17830                      * can use the prevailing one */
17831                     if (op == END) {
17832                         op = compute_EXACTish(pRExC_state);
17833                     }
17834                 }
17835             }
17836         }   /* End of first range contains just a single code point */
17837         else if (start == 0) {
17838             if (end == UV_MAX) {
17839                 op = SANY;
17840                 *flagp |= HASWIDTH|SIMPLE;
17841                 MARK_NAUGHTY(1);
17842             }
17843             else if (end == '\n' - 1
17844                     && invlist_iternext(cp_list, &start, &end)
17845                     && start == '\n' + 1 && end == UV_MAX)
17846             {
17847                 op = REG_ANY;
17848                 *flagp |= HASWIDTH|SIMPLE;
17849                 MARK_NAUGHTY(1);
17850             }
17851         }
17852         invlist_iterfinish(cp_list);
17853
17854         if (op == END) {
17855             const UV cp_list_len = _invlist_len(cp_list);
17856             const UV* cp_list_array = invlist_array(cp_list);
17857
17858             /* Here, didn't find an optimization.  See if this matches any of
17859              * the POSIX classes.  These run slightly faster for above-Unicode
17860              * code points, so don't bother with POSIXA ones nor the 2 that
17861              * have no above-Unicode matches.  We can avoid these checks unless
17862              * the ANYOF matches at least as high as the lowest POSIX one
17863              * (which was manually found to be \v.  The actual code point may
17864              * increase in later Unicode releases, if a higher code point is
17865              * assigned to be \v, but this code will never break.  It would
17866              * just mean we could execute the checks for posix optimizations
17867              * unnecessarily) */
17868
17869             if (cp_list_array[cp_list_len-1] > 0x2029) {
17870                 for (posix_class = 0;
17871                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
17872                      posix_class++)
17873                 {
17874                     int try_inverted;
17875                     if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
17876                         continue;
17877                     }
17878                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
17879
17880                         /* Check if matches normal or inverted */
17881                         if (_invlistEQ(cp_list,
17882                                        PL_XPosix_ptrs[posix_class],
17883                                        try_inverted))
17884                         {
17885                             op = (try_inverted)
17886                                  ? NPOSIXU
17887                                  : POSIXU;
17888                             *flagp |= HASWIDTH|SIMPLE;
17889                             goto found_posix;
17890                         }
17891                     }
17892                 }
17893               found_posix: ;
17894             }
17895         }
17896
17897         if (op != END) {
17898             RExC_parse = (char *)orig_parse;
17899             RExC_emit = (regnode *)orig_emit;
17900
17901             if (regarglen[op]) {
17902                 ret = reganode(pRExC_state, op, 0);
17903             } else {
17904                 ret = reg_node(pRExC_state, op);
17905             }
17906
17907             RExC_parse = (char *)cur_parse;
17908
17909             if (PL_regkind[op] == EXACT) {
17910                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
17911                                            TRUE /* downgradable to EXACT */
17912                                           );
17913             }
17914             else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
17915                 FLAGS(ret) = posix_class;
17916             }
17917
17918             SvREFCNT_dec_NN(cp_list);
17919             return ret;
17920         }
17921     }
17922
17923     /* Here, <cp_list> contains all the code points we can determine at
17924      * compile time that match under all conditions.  Go through it, and
17925      * for things that belong in the bitmap, put them there, and delete from
17926      * <cp_list>.  While we are at it, see if everything above 255 is in the
17927      * list, and if so, set a flag to speed up execution */
17928
17929     populate_ANYOF_from_invlist(ret, &cp_list);
17930
17931     if (invert) {
17932         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
17933     }
17934
17935     /* Here, the bitmap has been populated with all the Latin1 code points that
17936      * always match.  Can now add to the overall list those that match only
17937      * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
17938      * */
17939     if (has_upper_latin1_only_utf8_matches) {
17940         if (cp_list) {
17941             _invlist_union(cp_list,
17942                            has_upper_latin1_only_utf8_matches,
17943                            &cp_list);
17944             SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17945         }
17946         else {
17947             cp_list = has_upper_latin1_only_utf8_matches;
17948         }
17949         ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17950     }
17951
17952     /* If there is a swash and more than one element, we can't use the swash in
17953      * the optimization below. */
17954     if (swash && element_count > 1) {
17955         SvREFCNT_dec_NN(swash);
17956         swash = NULL;
17957     }
17958
17959     /* Note that the optimization of using 'swash' if it is the only thing in
17960      * the class doesn't have us change swash at all, so it can include things
17961      * that are also in the bitmap; otherwise we have purposely deleted that
17962      * duplicate information */
17963     set_ANYOF_arg(pRExC_state, ret, cp_list,
17964                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
17965                    ? listsv : NULL,
17966                   only_utf8_locale_list,
17967                   swash, has_user_defined_property);
17968
17969     *flagp |= HASWIDTH|SIMPLE;
17970
17971     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
17972         RExC_contains_locale = 1;
17973     }
17974
17975     return ret;
17976 }
17977
17978 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
17979
17980 STATIC void
17981 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
17982                 regnode* const node,
17983                 SV* const cp_list,
17984                 SV* const runtime_defns,
17985                 SV* const only_utf8_locale_list,
17986                 SV* const swash,
17987                 const bool has_user_defined_property)
17988 {
17989     /* Sets the arg field of an ANYOF-type node 'node', using information about
17990      * the node passed-in.  If there is nothing outside the node's bitmap, the
17991      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
17992      * the count returned by add_data(), having allocated and stored an array,
17993      * av, that that count references, as follows:
17994      *  av[0] stores the character class description in its textual form.
17995      *        This is used later (regexec.c:Perl_regclass_swash()) to
17996      *        initialize the appropriate swash, and is also useful for dumping
17997      *        the regnode.  This is set to &PL_sv_undef if the textual
17998      *        description is not needed at run-time (as happens if the other
17999      *        elements completely define the class)
18000      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
18001      *        computed from av[0].  But if no further computation need be done,
18002      *        the swash is stored here now (and av[0] is &PL_sv_undef).
18003      *  av[2] stores the inversion list of code points that match only if the
18004      *        current locale is UTF-8
18005      *  av[3] stores the cp_list inversion list for use in addition or instead
18006      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
18007      *        (Otherwise everything needed is already in av[0] and av[1])
18008      *  av[4] is set if any component of the class is from a user-defined
18009      *        property; used only if av[3] exists */
18010
18011     UV n;
18012
18013     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
18014
18015     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
18016         assert(! (ANYOF_FLAGS(node)
18017                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
18018         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
18019     }
18020     else {
18021         AV * const av = newAV();
18022         SV *rv;
18023
18024         av_store(av, 0, (runtime_defns)
18025                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
18026         if (swash) {
18027             assert(cp_list);
18028             av_store(av, 1, swash);
18029             SvREFCNT_dec_NN(cp_list);
18030         }
18031         else {
18032             av_store(av, 1, &PL_sv_undef);
18033             if (cp_list) {
18034                 av_store(av, 3, cp_list);
18035                 av_store(av, 4, newSVuv(has_user_defined_property));
18036             }
18037         }
18038
18039         if (only_utf8_locale_list) {
18040             av_store(av, 2, only_utf8_locale_list);
18041         }
18042         else {
18043             av_store(av, 2, &PL_sv_undef);
18044         }
18045
18046         rv = newRV_noinc(MUTABLE_SV(av));
18047         n = add_data(pRExC_state, STR_WITH_LEN("s"));
18048         RExC_rxi->data->data[n] = (void*)rv;
18049         ARG_SET(node, n);
18050     }
18051 }
18052
18053 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
18054 SV *
18055 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
18056                                         const regnode* node,
18057                                         bool doinit,
18058                                         SV** listsvp,
18059                                         SV** only_utf8_locale_ptr,
18060                                         SV** output_invlist)
18061
18062 {
18063     /* For internal core use only.
18064      * Returns the swash for the input 'node' in the regex 'prog'.
18065      * If <doinit> is 'true', will attempt to create the swash if not already
18066      *    done.
18067      * If <listsvp> is non-null, will return the printable contents of the
18068      *    swash.  This can be used to get debugging information even before the
18069      *    swash exists, by calling this function with 'doinit' set to false, in
18070      *    which case the components that will be used to eventually create the
18071      *    swash are returned  (in a printable form).
18072      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
18073      *    store an inversion list of code points that should match only if the
18074      *    execution-time locale is a UTF-8 one.
18075      * If <output_invlist> is not NULL, it is where this routine is to store an
18076      *    inversion list of the code points that would be instead returned in
18077      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
18078      *    when this parameter is used, is just the non-code point data that
18079      *    will go into creating the swash.  This currently should be just
18080      *    user-defined properties whose definitions were not known at compile
18081      *    time.  Using this parameter allows for easier manipulation of the
18082      *    swash's data by the caller.  It is illegal to call this function with
18083      *    this parameter set, but not <listsvp>
18084      *
18085      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
18086      * that, in spite of this function's name, the swash it returns may include
18087      * the bitmap data as well */
18088
18089     SV *sw  = NULL;
18090     SV *si  = NULL;         /* Input swash initialization string */
18091     SV* invlist = NULL;
18092
18093     RXi_GET_DECL(prog,progi);
18094     const struct reg_data * const data = prog ? progi->data : NULL;
18095
18096     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
18097     assert(! output_invlist || listsvp);
18098
18099     if (data && data->count) {
18100         const U32 n = ARG(node);
18101
18102         if (data->what[n] == 's') {
18103             SV * const rv = MUTABLE_SV(data->data[n]);
18104             AV * const av = MUTABLE_AV(SvRV(rv));
18105             SV **const ary = AvARRAY(av);
18106             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
18107
18108             si = *ary;  /* ary[0] = the string to initialize the swash with */
18109
18110             if (av_tindex_skip_len_mg(av) >= 2) {
18111                 if (only_utf8_locale_ptr
18112                     && ary[2]
18113                     && ary[2] != &PL_sv_undef)
18114                 {
18115                     *only_utf8_locale_ptr = ary[2];
18116                 }
18117                 else {
18118                     assert(only_utf8_locale_ptr);
18119                     *only_utf8_locale_ptr = NULL;
18120                 }
18121
18122                 /* Elements 3 and 4 are either both present or both absent. [3]
18123                  * is any inversion list generated at compile time; [4]
18124                  * indicates if that inversion list has any user-defined
18125                  * properties in it. */
18126                 if (av_tindex_skip_len_mg(av) >= 3) {
18127                     invlist = ary[3];
18128                     if (SvUV(ary[4])) {
18129                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
18130                     }
18131                 }
18132                 else {
18133                     invlist = NULL;
18134                 }
18135             }
18136
18137             /* Element [1] is reserved for the set-up swash.  If already there,
18138              * return it; if not, create it and store it there */
18139             if (ary[1] && SvROK(ary[1])) {
18140                 sw = ary[1];
18141             }
18142             else if (doinit && ((si && si != &PL_sv_undef)
18143                                  || (invlist && invlist != &PL_sv_undef))) {
18144                 assert(si);
18145                 sw = _core_swash_init("utf8", /* the utf8 package */
18146                                       "", /* nameless */
18147                                       si,
18148                                       1, /* binary */
18149                                       0, /* not from tr/// */
18150                                       invlist,
18151                                       &swash_init_flags);
18152                 (void)av_store(av, 1, sw);
18153             }
18154         }
18155     }
18156
18157     /* If requested, return a printable version of what this swash matches */
18158     if (listsvp) {
18159         SV* matches_string = NULL;
18160
18161         /* The swash should be used, if possible, to get the data, as it
18162          * contains the resolved data.  But this function can be called at
18163          * compile-time, before everything gets resolved, in which case we
18164          * return the currently best available information, which is the string
18165          * that will eventually be used to do that resolving, 'si' */
18166         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
18167             && (si && si != &PL_sv_undef))
18168         {
18169             /* Here, we only have 'si' (and possibly some passed-in data in
18170              * 'invlist', which is handled below)  If the caller only wants
18171              * 'si', use that.  */
18172             if (! output_invlist) {
18173                 matches_string = newSVsv(si);
18174             }
18175             else {
18176                 /* But if the caller wants an inversion list of the node, we
18177                  * need to parse 'si' and place as much as possible in the
18178                  * desired output inversion list, making 'matches_string' only
18179                  * contain the currently unresolvable things */
18180                 const char *si_string = SvPVX(si);
18181                 STRLEN remaining = SvCUR(si);
18182                 UV prev_cp = 0;
18183                 U8 count = 0;
18184
18185                 /* Ignore everything before the first new-line */
18186                 while (*si_string != '\n' && remaining > 0) {
18187                     si_string++;
18188                     remaining--;
18189                 }
18190                 assert(remaining > 0);
18191
18192                 si_string++;
18193                 remaining--;
18194
18195                 while (remaining > 0) {
18196
18197                     /* The data consists of just strings defining user-defined
18198                      * property names, but in prior incarnations, and perhaps
18199                      * somehow from pluggable regex engines, it could still
18200                      * hold hex code point definitions.  Each component of a
18201                      * range would be separated by a tab, and each range by a
18202                      * new-line.  If these are found, instead add them to the
18203                      * inversion list */
18204                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
18205                                      |PERL_SCAN_SILENT_NON_PORTABLE;
18206                     STRLEN len = remaining;
18207                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
18208
18209                     /* If the hex decode routine found something, it should go
18210                      * up to the next \n */
18211                     if (   *(si_string + len) == '\n') {
18212                         if (count) {    /* 2nd code point on line */
18213                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
18214                         }
18215                         else {
18216                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
18217                         }
18218                         count = 0;
18219                         goto prepare_for_next_iteration;
18220                     }
18221
18222                     /* If the hex decode was instead for the lower range limit,
18223                      * save it, and go parse the upper range limit */
18224                     if (*(si_string + len) == '\t') {
18225                         assert(count == 0);
18226
18227                         prev_cp = cp;
18228                         count = 1;
18229                       prepare_for_next_iteration:
18230                         si_string += len + 1;
18231                         remaining -= len + 1;
18232                         continue;
18233                     }
18234
18235                     /* Here, didn't find a legal hex number.  Just add it from
18236                      * here to the next \n */
18237
18238                     remaining -= len;
18239                     while (*(si_string + len) != '\n' && remaining > 0) {
18240                         remaining--;
18241                         len++;
18242                     }
18243                     if (*(si_string + len) == '\n') {
18244                         len++;
18245                         remaining--;
18246                     }
18247                     if (matches_string) {
18248                         sv_catpvn(matches_string, si_string, len - 1);
18249                     }
18250                     else {
18251                         matches_string = newSVpvn(si_string, len - 1);
18252                     }
18253                     si_string += len;
18254                     sv_catpvs(matches_string, " ");
18255                 } /* end of loop through the text */
18256
18257                 assert(matches_string);
18258                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
18259                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
18260                 }
18261             } /* end of has an 'si' but no swash */
18262         }
18263
18264         /* If we have a swash in place, its equivalent inversion list was above
18265          * placed into 'invlist'.  If not, this variable may contain a stored
18266          * inversion list which is information beyond what is in 'si' */
18267         if (invlist) {
18268
18269             /* Again, if the caller doesn't want the output inversion list, put
18270              * everything in 'matches-string' */
18271             if (! output_invlist) {
18272                 if ( ! matches_string) {
18273                     matches_string = newSVpvs("\n");
18274                 }
18275                 sv_catsv(matches_string, invlist_contents(invlist,
18276                                                   TRUE /* traditional style */
18277                                                   ));
18278             }
18279             else if (! *output_invlist) {
18280                 *output_invlist = invlist_clone(invlist);
18281             }
18282             else {
18283                 _invlist_union(*output_invlist, invlist, output_invlist);
18284             }
18285         }
18286
18287         *listsvp = matches_string;
18288     }
18289
18290     return sw;
18291 }
18292 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
18293
18294 /* reg_skipcomment()
18295
18296    Absorbs an /x style # comment from the input stream,
18297    returning a pointer to the first character beyond the comment, or if the
18298    comment terminates the pattern without anything following it, this returns
18299    one past the final character of the pattern (in other words, RExC_end) and
18300    sets the REG_RUN_ON_COMMENT_SEEN flag.
18301
18302    Note it's the callers responsibility to ensure that we are
18303    actually in /x mode
18304
18305 */
18306
18307 PERL_STATIC_INLINE char*
18308 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
18309 {
18310     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
18311
18312     assert(*p == '#');
18313
18314     while (p < RExC_end) {
18315         if (*(++p) == '\n') {
18316             return p+1;
18317         }
18318     }
18319
18320     /* we ran off the end of the pattern without ending the comment, so we have
18321      * to add an \n when wrapping */
18322     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
18323     return p;
18324 }
18325
18326 STATIC void
18327 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
18328                                 char ** p,
18329                                 const bool force_to_xmod
18330                          )
18331 {
18332     /* If the text at the current parse position '*p' is a '(?#...)' comment,
18333      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
18334      * is /x whitespace, advance '*p' so that on exit it points to the first
18335      * byte past all such white space and comments */
18336
18337     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
18338
18339     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
18340
18341     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
18342
18343     for (;;) {
18344         if (RExC_end - (*p) >= 3
18345             && *(*p)     == '('
18346             && *(*p + 1) == '?'
18347             && *(*p + 2) == '#')
18348         {
18349             while (*(*p) != ')') {
18350                 if ((*p) == RExC_end)
18351                     FAIL("Sequence (?#... not terminated");
18352                 (*p)++;
18353             }
18354             (*p)++;
18355             continue;
18356         }
18357
18358         if (use_xmod) {
18359             const char * save_p = *p;
18360             while ((*p) < RExC_end) {
18361                 STRLEN len;
18362                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
18363                     (*p) += len;
18364                 }
18365                 else if (*(*p) == '#') {
18366                     (*p) = reg_skipcomment(pRExC_state, (*p));
18367                 }
18368                 else {
18369                     break;
18370                 }
18371             }
18372             if (*p != save_p) {
18373                 continue;
18374             }
18375         }
18376
18377         break;
18378     }
18379
18380     return;
18381 }
18382
18383 /* nextchar()
18384
18385    Advances the parse position by one byte, unless that byte is the beginning
18386    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
18387    those two cases, the parse position is advanced beyond all such comments and
18388    white space.
18389
18390    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
18391 */
18392
18393 STATIC void
18394 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
18395 {
18396     PERL_ARGS_ASSERT_NEXTCHAR;
18397
18398     if (RExC_parse < RExC_end) {
18399         assert(   ! UTF
18400                || UTF8_IS_INVARIANT(*RExC_parse)
18401                || UTF8_IS_START(*RExC_parse));
18402
18403         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
18404
18405         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
18406                                 FALSE /* Don't force /x */ );
18407     }
18408 }
18409
18410 STATIC regnode *
18411 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
18412 {
18413     /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
18414      * space.  In pass1, it aligns and increments RExC_size; in pass2,
18415      * RExC_emit */
18416
18417     regnode * const ret = RExC_emit;
18418     GET_RE_DEBUG_FLAGS_DECL;
18419
18420     PERL_ARGS_ASSERT_REGNODE_GUTS;
18421
18422     assert(extra_size >= regarglen[op]);
18423
18424     if (SIZE_ONLY) {
18425         SIZE_ALIGN(RExC_size);
18426         RExC_size += 1 + extra_size;
18427         return(ret);
18428     }
18429     if (RExC_emit >= RExC_emit_bound)
18430         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
18431                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
18432
18433     NODE_ALIGN_FILL(ret);
18434 #ifndef RE_TRACK_PATTERN_OFFSETS
18435     PERL_UNUSED_ARG(name);
18436 #else
18437     if (RExC_offsets) {         /* MJD */
18438         MJD_OFFSET_DEBUG(
18439               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
18440               name, __LINE__,
18441               PL_reg_name[op],
18442               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
18443                 ? "Overwriting end of array!\n" : "OK",
18444               (UV)(RExC_emit - RExC_emit_start),
18445               (UV)(RExC_parse - RExC_start),
18446               (UV)RExC_offsets[0]));
18447         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
18448     }
18449 #endif
18450     return(ret);
18451 }
18452
18453 /*
18454 - reg_node - emit a node
18455 */
18456 STATIC regnode *                        /* Location. */
18457 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
18458 {
18459     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
18460
18461     PERL_ARGS_ASSERT_REG_NODE;
18462
18463     assert(regarglen[op] == 0);
18464
18465     if (PASS2) {
18466         regnode *ptr = ret;
18467         FILL_ADVANCE_NODE(ptr, op);
18468         RExC_emit = ptr;
18469     }
18470     return(ret);
18471 }
18472
18473 /*
18474 - reganode - emit a node with an argument
18475 */
18476 STATIC regnode *                        /* Location. */
18477 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
18478 {
18479     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
18480
18481     PERL_ARGS_ASSERT_REGANODE;
18482
18483     assert(regarglen[op] == 1);
18484
18485     if (PASS2) {
18486         regnode *ptr = ret;
18487         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
18488         RExC_emit = ptr;
18489     }
18490     return(ret);
18491 }
18492
18493 STATIC regnode *
18494 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
18495 {
18496     /* emit a node with U32 and I32 arguments */
18497
18498     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
18499
18500     PERL_ARGS_ASSERT_REG2LANODE;
18501
18502     assert(regarglen[op] == 2);
18503
18504     if (PASS2) {
18505         regnode *ptr = ret;
18506         FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
18507         RExC_emit = ptr;
18508     }
18509     return(ret);
18510 }
18511
18512 /*
18513 - reginsert - insert an operator in front of already-emitted operand
18514 *
18515 * Means relocating the operand.
18516 *
18517 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
18518 * set up NEXT_OFF() of the inserted node if needed. Something like this:
18519 *
18520 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
18521 * if (PASS2)
18522 *     NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
18523 *
18524 */
18525 STATIC void
18526 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *operand, U32 depth)
18527 {
18528     regnode *src;
18529     regnode *dst;
18530     regnode *place;
18531     const int offset = regarglen[(U8)op];
18532     const int size = NODE_STEP_REGNODE + offset;
18533     GET_RE_DEBUG_FLAGS_DECL;
18534
18535     PERL_ARGS_ASSERT_REGINSERT;
18536     PERL_UNUSED_CONTEXT;
18537     PERL_UNUSED_ARG(depth);
18538 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
18539     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
18540     if (SIZE_ONLY) {
18541         RExC_size += size;
18542         return;
18543     }
18544     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
18545                                     studying. If this is wrong then we need to adjust RExC_recurse
18546                                     below like we do with RExC_open_parens/RExC_close_parens. */
18547     src = RExC_emit;
18548     RExC_emit += size;
18549     dst = RExC_emit;
18550     if (RExC_open_parens) {
18551         int paren;
18552         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
18553         /* remember that RExC_npar is rex->nparens + 1,
18554          * iow it is 1 more than the number of parens seen in
18555          * the pattern so far. */
18556         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
18557             /* note, RExC_open_parens[0] is the start of the
18558              * regex, it can't move. RExC_close_parens[0] is the end
18559              * of the regex, it *can* move. */
18560             if ( paren && RExC_open_parens[paren] >= operand ) {
18561                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
18562                 RExC_open_parens[paren] += size;
18563             } else {
18564                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
18565             }
18566             if ( RExC_close_parens[paren] >= operand ) {
18567                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
18568                 RExC_close_parens[paren] += size;
18569             } else {
18570                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
18571             }
18572         }
18573     }
18574     if (RExC_end_op)
18575         RExC_end_op += size;
18576
18577     while (src > operand) {
18578         StructCopy(--src, --dst, regnode);
18579 #ifdef RE_TRACK_PATTERN_OFFSETS
18580         if (RExC_offsets) {     /* MJD 20010112 */
18581             MJD_OFFSET_DEBUG(
18582                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
18583                   "reg_insert",
18584                   __LINE__,
18585                   PL_reg_name[op],
18586                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
18587                     ? "Overwriting end of array!\n" : "OK",
18588                   (UV)(src - RExC_emit_start),
18589                   (UV)(dst - RExC_emit_start),
18590                   (UV)RExC_offsets[0]));
18591             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
18592             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
18593         }
18594 #endif
18595     }
18596
18597
18598     place = operand;            /* Op node, where operand used to be. */
18599 #ifdef RE_TRACK_PATTERN_OFFSETS
18600     if (RExC_offsets) {         /* MJD */
18601         MJD_OFFSET_DEBUG(
18602               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
18603               "reginsert",
18604               __LINE__,
18605               PL_reg_name[op],
18606               (UV)(place - RExC_emit_start) > RExC_offsets[0]
18607               ? "Overwriting end of array!\n" : "OK",
18608               (UV)(place - RExC_emit_start),
18609               (UV)(RExC_parse - RExC_start),
18610               (UV)RExC_offsets[0]));
18611         Set_Node_Offset(place, RExC_parse);
18612         Set_Node_Length(place, 1);
18613     }
18614 #endif
18615     src = NEXTOPER(place);
18616     FILL_ADVANCE_NODE(place, op);
18617     Zero(src, offset, regnode);
18618 }
18619
18620 /*
18621 - regtail - set the next-pointer at the end of a node chain of p to val.
18622 - SEE ALSO: regtail_study
18623 */
18624 STATIC void
18625 S_regtail(pTHX_ RExC_state_t * pRExC_state,
18626                 const regnode * const p,
18627                 const regnode * const val,
18628                 const U32 depth)
18629 {
18630     regnode *scan;
18631     GET_RE_DEBUG_FLAGS_DECL;
18632
18633     PERL_ARGS_ASSERT_REGTAIL;
18634 #ifndef DEBUGGING
18635     PERL_UNUSED_ARG(depth);
18636 #endif
18637
18638     if (SIZE_ONLY)
18639         return;
18640
18641     /* Find last node. */
18642     scan = (regnode *) p;
18643     for (;;) {
18644         regnode * const temp = regnext(scan);
18645         DEBUG_PARSE_r({
18646             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
18647             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18648             Perl_re_printf( aTHX_  "~ %s (%d) %s %s\n",
18649                 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
18650                     (temp == NULL ? "->" : ""),
18651                     (temp == NULL ? PL_reg_name[OP(val)] : "")
18652             );
18653         });
18654         if (temp == NULL)
18655             break;
18656         scan = temp;
18657     }
18658
18659     if (reg_off_by_arg[OP(scan)]) {
18660         ARG_SET(scan, val - scan);
18661     }
18662     else {
18663         NEXT_OFF(scan) = val - scan;
18664     }
18665 }
18666
18667 #ifdef DEBUGGING
18668 /*
18669 - regtail_study - set the next-pointer at the end of a node chain of p to val.
18670 - Look for optimizable sequences at the same time.
18671 - currently only looks for EXACT chains.
18672
18673 This is experimental code. The idea is to use this routine to perform
18674 in place optimizations on branches and groups as they are constructed,
18675 with the long term intention of removing optimization from study_chunk so
18676 that it is purely analytical.
18677
18678 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
18679 to control which is which.
18680
18681 */
18682 /* TODO: All four parms should be const */
18683
18684 STATIC U8
18685 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
18686                       const regnode *val,U32 depth)
18687 {
18688     regnode *scan;
18689     U8 exact = PSEUDO;
18690 #ifdef EXPERIMENTAL_INPLACESCAN
18691     I32 min = 0;
18692 #endif
18693     GET_RE_DEBUG_FLAGS_DECL;
18694
18695     PERL_ARGS_ASSERT_REGTAIL_STUDY;
18696
18697
18698     if (SIZE_ONLY)
18699         return exact;
18700
18701     /* Find last node. */
18702
18703     scan = p;
18704     for (;;) {
18705         regnode * const temp = regnext(scan);
18706 #ifdef EXPERIMENTAL_INPLACESCAN
18707         if (PL_regkind[OP(scan)] == EXACT) {
18708             bool unfolded_multi_char;   /* Unexamined in this routine */
18709             if (join_exact(pRExC_state, scan, &min,
18710                            &unfolded_multi_char, 1, val, depth+1))
18711                 return EXACT;
18712         }
18713 #endif
18714         if ( exact ) {
18715             switch (OP(scan)) {
18716                 case EXACT:
18717                 case EXACTL:
18718                 case EXACTF:
18719                 case EXACTFA_NO_TRIE:
18720                 case EXACTFA:
18721                 case EXACTFU:
18722                 case EXACTFLU8:
18723                 case EXACTFU_SS:
18724                 case EXACTFL:
18725                         if( exact == PSEUDO )
18726                             exact= OP(scan);
18727                         else if ( exact != OP(scan) )
18728                             exact= 0;
18729                 case NOTHING:
18730                     break;
18731                 default:
18732                     exact= 0;
18733             }
18734         }
18735         DEBUG_PARSE_r({
18736             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
18737             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18738             Perl_re_printf( aTHX_  "~ %s (%d) -> %s\n",
18739                 SvPV_nolen_const(RExC_mysv),
18740                 REG_NODE_NUM(scan),
18741                 PL_reg_name[exact]);
18742         });
18743         if (temp == NULL)
18744             break;
18745         scan = temp;
18746     }
18747     DEBUG_PARSE_r({
18748         DEBUG_PARSE_MSG("");
18749         regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
18750         Perl_re_printf( aTHX_
18751                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
18752                       SvPV_nolen_const(RExC_mysv),
18753                       (IV)REG_NODE_NUM(val),
18754                       (IV)(val - scan)
18755         );
18756     });
18757     if (reg_off_by_arg[OP(scan)]) {
18758         ARG_SET(scan, val - scan);
18759     }
18760     else {
18761         NEXT_OFF(scan) = val - scan;
18762     }
18763
18764     return exact;
18765 }
18766 #endif
18767
18768 /*
18769  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
18770  */
18771 #ifdef DEBUGGING
18772
18773 static void
18774 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
18775 {
18776     int bit;
18777     int set=0;
18778
18779     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
18780
18781     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
18782         if (flags & (1<<bit)) {
18783             if (!set++ && lead)
18784                 Perl_re_printf( aTHX_  "%s",lead);
18785             Perl_re_printf( aTHX_  "%s ",PL_reg_intflags_name[bit]);
18786         }
18787     }
18788     if (lead)  {
18789         if (set)
18790             Perl_re_printf( aTHX_  "\n");
18791         else
18792             Perl_re_printf( aTHX_  "%s[none-set]\n",lead);
18793     }
18794 }
18795
18796 static void
18797 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
18798 {
18799     int bit;
18800     int set=0;
18801     regex_charset cs;
18802
18803     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
18804
18805     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
18806         if (flags & (1<<bit)) {
18807             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
18808                 continue;
18809             }
18810             if (!set++ && lead)
18811                 Perl_re_printf( aTHX_  "%s",lead);
18812             Perl_re_printf( aTHX_  "%s ",PL_reg_extflags_name[bit]);
18813         }
18814     }
18815     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
18816             if (!set++ && lead) {
18817                 Perl_re_printf( aTHX_  "%s",lead);
18818             }
18819             switch (cs) {
18820                 case REGEX_UNICODE_CHARSET:
18821                     Perl_re_printf( aTHX_  "UNICODE");
18822                     break;
18823                 case REGEX_LOCALE_CHARSET:
18824                     Perl_re_printf( aTHX_  "LOCALE");
18825                     break;
18826                 case REGEX_ASCII_RESTRICTED_CHARSET:
18827                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
18828                     break;
18829                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
18830                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
18831                     break;
18832                 default:
18833                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
18834                     break;
18835             }
18836     }
18837     if (lead)  {
18838         if (set)
18839             Perl_re_printf( aTHX_  "\n");
18840         else
18841             Perl_re_printf( aTHX_  "%s[none-set]\n",lead);
18842     }
18843 }
18844 #endif
18845
18846 void
18847 Perl_regdump(pTHX_ const regexp *r)
18848 {
18849 #ifdef DEBUGGING
18850     SV * const sv = sv_newmortal();
18851     SV *dsv= sv_newmortal();
18852     RXi_GET_DECL(r,ri);
18853     GET_RE_DEBUG_FLAGS_DECL;
18854
18855     PERL_ARGS_ASSERT_REGDUMP;
18856
18857     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
18858
18859     /* Header fields of interest. */
18860     if (r->anchored_substr) {
18861         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
18862             RE_SV_DUMPLEN(r->anchored_substr), 30);
18863         Perl_re_printf( aTHX_
18864                       "anchored %s%s at %" IVdf " ",
18865                       s, RE_SV_TAIL(r->anchored_substr),
18866                       (IV)r->anchored_offset);
18867     } else if (r->anchored_utf8) {
18868         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
18869             RE_SV_DUMPLEN(r->anchored_utf8), 30);
18870         Perl_re_printf( aTHX_
18871                       "anchored utf8 %s%s at %" IVdf " ",
18872                       s, RE_SV_TAIL(r->anchored_utf8),
18873                       (IV)r->anchored_offset);
18874     }
18875     if (r->float_substr) {
18876         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
18877             RE_SV_DUMPLEN(r->float_substr), 30);
18878         Perl_re_printf( aTHX_
18879                       "floating %s%s at %" IVdf "..%" UVuf " ",
18880                       s, RE_SV_TAIL(r->float_substr),
18881                       (IV)r->float_min_offset, (UV)r->float_max_offset);
18882     } else if (r->float_utf8) {
18883         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
18884             RE_SV_DUMPLEN(r->float_utf8), 30);
18885         Perl_re_printf( aTHX_
18886                       "floating utf8 %s%s at %" IVdf "..%" UVuf " ",
18887                       s, RE_SV_TAIL(r->float_utf8),
18888                       (IV)r->float_min_offset, (UV)r->float_max_offset);
18889     }
18890     if (r->check_substr || r->check_utf8)
18891         Perl_re_printf( aTHX_
18892                       (const char *)
18893                       (r->check_substr == r->float_substr
18894                        && r->check_utf8 == r->float_utf8
18895                        ? "(checking floating" : "(checking anchored"));
18896     if (r->intflags & PREGf_NOSCAN)
18897         Perl_re_printf( aTHX_  " noscan");
18898     if (r->extflags & RXf_CHECK_ALL)
18899         Perl_re_printf( aTHX_  " isall");
18900     if (r->check_substr || r->check_utf8)
18901         Perl_re_printf( aTHX_  ") ");
18902
18903     if (ri->regstclass) {
18904         regprop(r, sv, ri->regstclass, NULL, NULL);
18905         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
18906     }
18907     if (r->intflags & PREGf_ANCH) {
18908         Perl_re_printf( aTHX_  "anchored");
18909         if (r->intflags & PREGf_ANCH_MBOL)
18910             Perl_re_printf( aTHX_  "(MBOL)");
18911         if (r->intflags & PREGf_ANCH_SBOL)
18912             Perl_re_printf( aTHX_  "(SBOL)");
18913         if (r->intflags & PREGf_ANCH_GPOS)
18914             Perl_re_printf( aTHX_  "(GPOS)");
18915         Perl_re_printf( aTHX_ " ");
18916     }
18917     if (r->intflags & PREGf_GPOS_SEEN)
18918         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
18919     if (r->intflags & PREGf_SKIP)
18920         Perl_re_printf( aTHX_  "plus ");
18921     if (r->intflags & PREGf_IMPLICIT)
18922         Perl_re_printf( aTHX_  "implicit ");
18923     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
18924     if (r->extflags & RXf_EVAL_SEEN)
18925         Perl_re_printf( aTHX_  "with eval ");
18926     Perl_re_printf( aTHX_  "\n");
18927     DEBUG_FLAGS_r({
18928         regdump_extflags("r->extflags: ",r->extflags);
18929         regdump_intflags("r->intflags: ",r->intflags);
18930     });
18931 #else
18932     PERL_ARGS_ASSERT_REGDUMP;
18933     PERL_UNUSED_CONTEXT;
18934     PERL_UNUSED_ARG(r);
18935 #endif  /* DEBUGGING */
18936 }
18937
18938 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
18939 #ifdef DEBUGGING
18940
18941 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
18942      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
18943      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
18944      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
18945      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
18946      || _CC_VERTSPACE != 15
18947 #   error Need to adjust order of anyofs[]
18948 #  endif
18949 static const char * const anyofs[] = {
18950     "\\w",
18951     "\\W",
18952     "\\d",
18953     "\\D",
18954     "[:alpha:]",
18955     "[:^alpha:]",
18956     "[:lower:]",
18957     "[:^lower:]",
18958     "[:upper:]",
18959     "[:^upper:]",
18960     "[:punct:]",
18961     "[:^punct:]",
18962     "[:print:]",
18963     "[:^print:]",
18964     "[:alnum:]",
18965     "[:^alnum:]",
18966     "[:graph:]",
18967     "[:^graph:]",
18968     "[:cased:]",
18969     "[:^cased:]",
18970     "\\s",
18971     "\\S",
18972     "[:blank:]",
18973     "[:^blank:]",
18974     "[:xdigit:]",
18975     "[:^xdigit:]",
18976     "[:cntrl:]",
18977     "[:^cntrl:]",
18978     "[:ascii:]",
18979     "[:^ascii:]",
18980     "\\v",
18981     "\\V"
18982 };
18983 #endif
18984
18985 /*
18986 - regprop - printable representation of opcode, with run time support
18987 */
18988
18989 void
18990 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
18991 {
18992 #ifdef DEBUGGING
18993     int k;
18994     RXi_GET_DECL(prog,progi);
18995     GET_RE_DEBUG_FLAGS_DECL;
18996
18997     PERL_ARGS_ASSERT_REGPROP;
18998
18999     SvPVCLEAR(sv);
19000
19001     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
19002         /* It would be nice to FAIL() here, but this may be called from
19003            regexec.c, and it would be hard to supply pRExC_state. */
19004         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
19005                                               (int)OP(o), (int)REGNODE_MAX);
19006     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
19007
19008     k = PL_regkind[OP(o)];
19009
19010     if (k == EXACT) {
19011         sv_catpvs(sv, " ");
19012         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
19013          * is a crude hack but it may be the best for now since
19014          * we have no flag "this EXACTish node was UTF-8"
19015          * --jhi */
19016         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
19017                   PERL_PV_ESCAPE_UNI_DETECT |
19018                   PERL_PV_ESCAPE_NONASCII   |
19019                   PERL_PV_PRETTY_ELLIPSES   |
19020                   PERL_PV_PRETTY_LTGT       |
19021                   PERL_PV_PRETTY_NOCLEAR
19022                   );
19023     } else if (k == TRIE) {
19024         /* print the details of the trie in dumpuntil instead, as
19025          * progi->data isn't available here */
19026         const char op = OP(o);
19027         const U32 n = ARG(o);
19028         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
19029                (reg_ac_data *)progi->data->data[n] :
19030                NULL;
19031         const reg_trie_data * const trie
19032             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
19033
19034         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
19035         DEBUG_TRIE_COMPILE_r({
19036           if (trie->jump)
19037             sv_catpvs(sv, "(JUMP)");
19038           Perl_sv_catpvf(aTHX_ sv,
19039             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
19040             (UV)trie->startstate,
19041             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
19042             (UV)trie->wordcount,
19043             (UV)trie->minlen,
19044             (UV)trie->maxlen,
19045             (UV)TRIE_CHARCOUNT(trie),
19046             (UV)trie->uniquecharcount
19047           );
19048         });
19049         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
19050             sv_catpvs(sv, "[");
19051             (void) put_charclass_bitmap_innards(sv,
19052                                                 ((IS_ANYOF_TRIE(op))
19053                                                  ? ANYOF_BITMAP(o)
19054                                                  : TRIE_BITMAP(trie)),
19055                                                 NULL,
19056                                                 NULL,
19057                                                 NULL,
19058                                                 FALSE
19059                                                );
19060             sv_catpvs(sv, "]");
19061         }
19062     } else if (k == CURLY) {
19063         U32 lo = ARG1(o), hi = ARG2(o);
19064         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
19065             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
19066         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
19067         if (hi == REG_INFTY)
19068             sv_catpvs(sv, "INFTY");
19069         else
19070             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
19071         sv_catpvs(sv, "}");
19072     }
19073     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
19074         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
19075     else if (k == REF || k == OPEN || k == CLOSE
19076              || k == GROUPP || OP(o)==ACCEPT)
19077     {
19078         AV *name_list= NULL;
19079         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
19080         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
19081         if ( RXp_PAREN_NAMES(prog) ) {
19082             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
19083         } else if ( pRExC_state ) {
19084             name_list= RExC_paren_name_list;
19085         }
19086         if (name_list) {
19087             if ( k != REF || (OP(o) < NREF)) {
19088                 SV **name= av_fetch(name_list, parno, 0 );
19089                 if (name)
19090                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19091             }
19092             else {
19093                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
19094                 I32 *nums=(I32*)SvPVX(sv_dat);
19095                 SV **name= av_fetch(name_list, nums[0], 0 );
19096                 I32 n;
19097                 if (name) {
19098                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
19099                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
19100                                     (n ? "," : ""), (IV)nums[n]);
19101                     }
19102                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19103                 }
19104             }
19105         }
19106         if ( k == REF && reginfo) {
19107             U32 n = ARG(o);  /* which paren pair */
19108             I32 ln = prog->offs[n].start;
19109             if (prog->lastparen < n || ln == -1)
19110                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
19111             else if (ln == prog->offs[n].end)
19112                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
19113             else {
19114                 const char *s = reginfo->strbeg + ln;
19115                 Perl_sv_catpvf(aTHX_ sv, ": ");
19116                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
19117                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
19118             }
19119         }
19120     } else if (k == GOSUB) {
19121         AV *name_list= NULL;
19122         if ( RXp_PAREN_NAMES(prog) ) {
19123             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
19124         } else if ( pRExC_state ) {
19125             name_list= RExC_paren_name_list;
19126         }
19127
19128         /* Paren and offset */
19129         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
19130                 (int)((o + (int)ARG2L(o)) - progi->program) );
19131         if (name_list) {
19132             SV **name= av_fetch(name_list, ARG(o), 0 );
19133             if (name)
19134                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19135         }
19136     }
19137     else if (k == LOGICAL)
19138         /* 2: embedded, otherwise 1 */
19139         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
19140     else if (k == ANYOF) {
19141         const U8 flags = ANYOF_FLAGS(o);
19142         bool do_sep = FALSE;    /* Do we need to separate various components of
19143                                    the output? */
19144         /* Set if there is still an unresolved user-defined property */
19145         SV *unresolved                = NULL;
19146
19147         /* Things that are ignored except when the runtime locale is UTF-8 */
19148         SV *only_utf8_locale_invlist = NULL;
19149
19150         /* Code points that don't fit in the bitmap */
19151         SV *nonbitmap_invlist = NULL;
19152
19153         /* And things that aren't in the bitmap, but are small enough to be */
19154         SV* bitmap_range_not_in_bitmap = NULL;
19155
19156         const bool inverted = flags & ANYOF_INVERT;
19157
19158         if (OP(o) == ANYOFL) {
19159             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
19160                 sv_catpvs(sv, "{utf8-locale-reqd}");
19161             }
19162             if (flags & ANYOFL_FOLD) {
19163                 sv_catpvs(sv, "{i}");
19164             }
19165         }
19166
19167         /* If there is stuff outside the bitmap, get it */
19168         if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
19169             (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
19170                                                 &unresolved,
19171                                                 &only_utf8_locale_invlist,
19172                                                 &nonbitmap_invlist);
19173             /* The non-bitmap data may contain stuff that could fit in the
19174              * bitmap.  This could come from a user-defined property being
19175              * finally resolved when this call was done; or much more likely
19176              * because there are matches that require UTF-8 to be valid, and so
19177              * aren't in the bitmap.  This is teased apart later */
19178             _invlist_intersection(nonbitmap_invlist,
19179                                   PL_InBitmap,
19180                                   &bitmap_range_not_in_bitmap);
19181             /* Leave just the things that don't fit into the bitmap */
19182             _invlist_subtract(nonbitmap_invlist,
19183                               PL_InBitmap,
19184                               &nonbitmap_invlist);
19185         }
19186
19187         /* Obey this flag to add all above-the-bitmap code points */
19188         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
19189             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
19190                                                       NUM_ANYOF_CODE_POINTS,
19191                                                       UV_MAX);
19192         }
19193
19194         /* Ready to start outputting.  First, the initial left bracket */
19195         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19196
19197         /* Then all the things that could fit in the bitmap */
19198         do_sep = put_charclass_bitmap_innards(sv,
19199                                               ANYOF_BITMAP(o),
19200                                               bitmap_range_not_in_bitmap,
19201                                               only_utf8_locale_invlist,
19202                                               o,
19203
19204                                               /* Can't try inverting for a
19205                                                * better display if there are
19206                                                * things that haven't been
19207                                                * resolved */
19208                                               unresolved != NULL);
19209         SvREFCNT_dec(bitmap_range_not_in_bitmap);
19210
19211         /* If there are user-defined properties which haven't been defined yet,
19212          * output them.  If the result is not to be inverted, it is clearest to
19213          * output them in a separate [] from the bitmap range stuff.  If the
19214          * result is to be complemented, we have to show everything in one [],
19215          * as the inversion applies to the whole thing.  Use {braces} to
19216          * separate them from anything in the bitmap and anything above the
19217          * bitmap. */
19218         if (unresolved) {
19219             if (inverted) {
19220                 if (! do_sep) { /* If didn't output anything in the bitmap */
19221                     sv_catpvs(sv, "^");
19222                 }
19223                 sv_catpvs(sv, "{");
19224             }
19225             else if (do_sep) {
19226                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
19227             }
19228             sv_catsv(sv, unresolved);
19229             if (inverted) {
19230                 sv_catpvs(sv, "}");
19231             }
19232             do_sep = ! inverted;
19233         }
19234
19235         /* And, finally, add the above-the-bitmap stuff */
19236         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
19237             SV* contents;
19238
19239             /* See if truncation size is overridden */
19240             const STRLEN dump_len = (PL_dump_re_max_len)
19241                                     ? PL_dump_re_max_len
19242                                     : 256;
19243
19244             /* This is output in a separate [] */
19245             if (do_sep) {
19246                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
19247             }
19248
19249             /* And, for easy of understanding, it is shown in the
19250              * uncomplemented form if possible.  The one exception being if
19251              * there are unresolved items, where the inversion has to be
19252              * delayed until runtime */
19253             if (inverted && ! unresolved) {
19254                 _invlist_invert(nonbitmap_invlist);
19255                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
19256             }
19257
19258             contents = invlist_contents(nonbitmap_invlist,
19259                                         FALSE /* output suitable for catsv */
19260                                        );
19261
19262             /* If the output is shorter than the permissible maximum, just do it. */
19263             if (SvCUR(contents) <= dump_len) {
19264                 sv_catsv(sv, contents);
19265             }
19266             else {
19267                 const char * contents_string = SvPVX(contents);
19268                 STRLEN i = dump_len;
19269
19270                 /* Otherwise, start at the permissible max and work back to the
19271                  * first break possibility */
19272                 while (i > 0 && contents_string[i] != ' ') {
19273                     i--;
19274                 }
19275                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
19276                                        find a legal break */
19277                     i = dump_len;
19278                 }
19279
19280                 sv_catpvn(sv, contents_string, i);
19281                 sv_catpvs(sv, "...");
19282             }
19283
19284             SvREFCNT_dec_NN(contents);
19285             SvREFCNT_dec_NN(nonbitmap_invlist);
19286         }
19287
19288         /* And finally the matching, closing ']' */
19289         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
19290
19291         SvREFCNT_dec(unresolved);
19292     }
19293     else if (k == POSIXD || k == NPOSIXD) {
19294         U8 index = FLAGS(o) * 2;
19295         if (index < C_ARRAY_LENGTH(anyofs)) {
19296             if (*anyofs[index] != '[')  {
19297                 sv_catpv(sv, "[");
19298             }
19299             sv_catpv(sv, anyofs[index]);
19300             if (*anyofs[index] != '[')  {
19301                 sv_catpv(sv, "]");
19302             }
19303         }
19304         else {
19305             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
19306         }
19307     }
19308     else if (k == BOUND || k == NBOUND) {
19309         /* Must be synced with order of 'bound_type' in regcomp.h */
19310         const char * const bounds[] = {
19311             "",      /* Traditional */
19312             "{gcb}",
19313             "{lb}",
19314             "{sb}",
19315             "{wb}"
19316         };
19317         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
19318         sv_catpv(sv, bounds[FLAGS(o)]);
19319     }
19320     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
19321         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
19322     else if (OP(o) == SBOL)
19323         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
19324
19325     /* add on the verb argument if there is one */
19326     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
19327         Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
19328                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
19329     }
19330 #else
19331     PERL_UNUSED_CONTEXT;
19332     PERL_UNUSED_ARG(sv);
19333     PERL_UNUSED_ARG(o);
19334     PERL_UNUSED_ARG(prog);
19335     PERL_UNUSED_ARG(reginfo);
19336     PERL_UNUSED_ARG(pRExC_state);
19337 #endif  /* DEBUGGING */
19338 }
19339
19340
19341
19342 SV *
19343 Perl_re_intuit_string(pTHX_ REGEXP * const r)
19344 {                               /* Assume that RE_INTUIT is set */
19345     struct regexp *const prog = ReANY(r);
19346     GET_RE_DEBUG_FLAGS_DECL;
19347
19348     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
19349     PERL_UNUSED_CONTEXT;
19350
19351     DEBUG_COMPILE_r(
19352         {
19353             const char * const s = SvPV_nolen_const(RX_UTF8(r)
19354                       ? prog->check_utf8 : prog->check_substr);
19355
19356             if (!PL_colorset) reginitcolors();
19357             Perl_re_printf( aTHX_
19358                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
19359                       PL_colors[4],
19360                       RX_UTF8(r) ? "utf8 " : "",
19361                       PL_colors[5],PL_colors[0],
19362                       s,
19363                       PL_colors[1],
19364                       (strlen(s) > 60 ? "..." : ""));
19365         } );
19366
19367     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
19368     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
19369 }
19370
19371 /*
19372    pregfree()
19373
19374    handles refcounting and freeing the perl core regexp structure. When
19375    it is necessary to actually free the structure the first thing it
19376    does is call the 'free' method of the regexp_engine associated to
19377    the regexp, allowing the handling of the void *pprivate; member
19378    first. (This routine is not overridable by extensions, which is why
19379    the extensions free is called first.)
19380
19381    See regdupe and regdupe_internal if you change anything here.
19382 */
19383 #ifndef PERL_IN_XSUB_RE
19384 void
19385 Perl_pregfree(pTHX_ REGEXP *r)
19386 {
19387     SvREFCNT_dec(r);
19388 }
19389
19390 void
19391 Perl_pregfree2(pTHX_ REGEXP *rx)
19392 {
19393     struct regexp *const r = ReANY(rx);
19394     GET_RE_DEBUG_FLAGS_DECL;
19395
19396     PERL_ARGS_ASSERT_PREGFREE2;
19397
19398     if (r->mother_re) {
19399         ReREFCNT_dec(r->mother_re);
19400     } else {
19401         CALLREGFREE_PVT(rx); /* free the private data */
19402         SvREFCNT_dec(RXp_PAREN_NAMES(r));
19403         Safefree(r->xpv_len_u.xpvlenu_pv);
19404     }
19405     if (r->substrs) {
19406         SvREFCNT_dec(r->anchored_substr);
19407         SvREFCNT_dec(r->anchored_utf8);
19408         SvREFCNT_dec(r->float_substr);
19409         SvREFCNT_dec(r->float_utf8);
19410         Safefree(r->substrs);
19411     }
19412     RX_MATCH_COPY_FREE(rx);
19413 #ifdef PERL_ANY_COW
19414     SvREFCNT_dec(r->saved_copy);
19415 #endif
19416     Safefree(r->offs);
19417     SvREFCNT_dec(r->qr_anoncv);
19418     if (r->recurse_locinput)
19419         Safefree(r->recurse_locinput);
19420     rx->sv_u.svu_rx = 0;
19421 }
19422
19423 /*  reg_temp_copy()
19424
19425     This is a hacky workaround to the structural issue of match results
19426     being stored in the regexp structure which is in turn stored in
19427     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
19428     could be PL_curpm in multiple contexts, and could require multiple
19429     result sets being associated with the pattern simultaneously, such
19430     as when doing a recursive match with (??{$qr})
19431
19432     The solution is to make a lightweight copy of the regexp structure
19433     when a qr// is returned from the code executed by (??{$qr}) this
19434     lightweight copy doesn't actually own any of its data except for
19435     the starp/end and the actual regexp structure itself.
19436
19437 */
19438
19439
19440 REGEXP *
19441 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
19442 {
19443     struct regexp *ret;
19444     struct regexp *const r = ReANY(rx);
19445     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
19446
19447     PERL_ARGS_ASSERT_REG_TEMP_COPY;
19448
19449     if (!ret_x)
19450         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
19451     else {
19452         SvOK_off((SV *)ret_x);
19453         if (islv) {
19454             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
19455                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
19456                made both spots point to the same regexp body.) */
19457             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
19458             assert(!SvPVX(ret_x));
19459             ret_x->sv_u.svu_rx = temp->sv_any;
19460             temp->sv_any = NULL;
19461             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
19462             SvREFCNT_dec_NN(temp);
19463             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
19464                ing below will not set it. */
19465             SvCUR_set(ret_x, SvCUR(rx));
19466         }
19467     }
19468     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
19469        sv_force_normal(sv) is called.  */
19470     SvFAKE_on(ret_x);
19471     ret = ReANY(ret_x);
19472
19473     SvFLAGS(ret_x) |= SvUTF8(rx);
19474     /* We share the same string buffer as the original regexp, on which we
19475        hold a reference count, incremented when mother_re is set below.
19476        The string pointer is copied here, being part of the regexp struct.
19477      */
19478     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
19479            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
19480     if (r->offs) {
19481         const I32 npar = r->nparens+1;
19482         Newx(ret->offs, npar, regexp_paren_pair);
19483         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
19484     }
19485     if (r->substrs) {
19486         Newx(ret->substrs, 1, struct reg_substr_data);
19487         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
19488
19489         SvREFCNT_inc_void(ret->anchored_substr);
19490         SvREFCNT_inc_void(ret->anchored_utf8);
19491         SvREFCNT_inc_void(ret->float_substr);
19492         SvREFCNT_inc_void(ret->float_utf8);
19493
19494         /* check_substr and check_utf8, if non-NULL, point to either their
19495            anchored or float namesakes, and don't hold a second reference.  */
19496     }
19497     RX_MATCH_COPIED_off(ret_x);
19498 #ifdef PERL_ANY_COW
19499     ret->saved_copy = NULL;
19500 #endif
19501     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
19502     SvREFCNT_inc_void(ret->qr_anoncv);
19503     if (r->recurse_locinput)
19504         Newxz(ret->recurse_locinput,r->nparens + 1,char *);
19505
19506     return ret_x;
19507 }
19508 #endif
19509
19510 /* regfree_internal()
19511
19512    Free the private data in a regexp. This is overloadable by
19513    extensions. Perl takes care of the regexp structure in pregfree(),
19514    this covers the *pprivate pointer which technically perl doesn't
19515    know about, however of course we have to handle the
19516    regexp_internal structure when no extension is in use.
19517
19518    Note this is called before freeing anything in the regexp
19519    structure.
19520  */
19521
19522 void
19523 Perl_regfree_internal(pTHX_ REGEXP * const rx)
19524 {
19525     struct regexp *const r = ReANY(rx);
19526     RXi_GET_DECL(r,ri);
19527     GET_RE_DEBUG_FLAGS_DECL;
19528
19529     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
19530
19531     DEBUG_COMPILE_r({
19532         if (!PL_colorset)
19533             reginitcolors();
19534         {
19535             SV *dsv= sv_newmortal();
19536             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
19537                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
19538             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
19539                 PL_colors[4],PL_colors[5],s);
19540         }
19541     });
19542 #ifdef RE_TRACK_PATTERN_OFFSETS
19543     if (ri->u.offsets)
19544         Safefree(ri->u.offsets);             /* 20010421 MJD */
19545 #endif
19546     if (ri->code_blocks)
19547         S_free_codeblocks(aTHX_ ri->code_blocks);
19548
19549     if (ri->data) {
19550         int n = ri->data->count;
19551
19552         while (--n >= 0) {
19553           /* If you add a ->what type here, update the comment in regcomp.h */
19554             switch (ri->data->what[n]) {
19555             case 'a':
19556             case 'r':
19557             case 's':
19558             case 'S':
19559             case 'u':
19560                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
19561                 break;
19562             case 'f':
19563                 Safefree(ri->data->data[n]);
19564                 break;
19565             case 'l':
19566             case 'L':
19567                 break;
19568             case 'T':
19569                 { /* Aho Corasick add-on structure for a trie node.
19570                      Used in stclass optimization only */
19571                     U32 refcount;
19572                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
19573 #ifdef USE_ITHREADS
19574                     dVAR;
19575 #endif
19576                     OP_REFCNT_LOCK;
19577                     refcount = --aho->refcount;
19578                     OP_REFCNT_UNLOCK;
19579                     if ( !refcount ) {
19580                         PerlMemShared_free(aho->states);
19581                         PerlMemShared_free(aho->fail);
19582                          /* do this last!!!! */
19583                         PerlMemShared_free(ri->data->data[n]);
19584                         /* we should only ever get called once, so
19585                          * assert as much, and also guard the free
19586                          * which /might/ happen twice. At the least
19587                          * it will make code anlyzers happy and it
19588                          * doesn't cost much. - Yves */
19589                         assert(ri->regstclass);
19590                         if (ri->regstclass) {
19591                             PerlMemShared_free(ri->regstclass);
19592                             ri->regstclass = 0;
19593                         }
19594                     }
19595                 }
19596                 break;
19597             case 't':
19598                 {
19599                     /* trie structure. */
19600                     U32 refcount;
19601                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
19602 #ifdef USE_ITHREADS
19603                     dVAR;
19604 #endif
19605                     OP_REFCNT_LOCK;
19606                     refcount = --trie->refcount;
19607                     OP_REFCNT_UNLOCK;
19608                     if ( !refcount ) {
19609                         PerlMemShared_free(trie->charmap);
19610                         PerlMemShared_free(trie->states);
19611                         PerlMemShared_free(trie->trans);
19612                         if (trie->bitmap)
19613                             PerlMemShared_free(trie->bitmap);
19614                         if (trie->jump)
19615                             PerlMemShared_free(trie->jump);
19616                         PerlMemShared_free(trie->wordinfo);
19617                         /* do this last!!!! */
19618                         PerlMemShared_free(ri->data->data[n]);
19619                     }
19620                 }
19621                 break;
19622             default:
19623                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
19624                                                     ri->data->what[n]);
19625             }
19626         }
19627         Safefree(ri->data->what);
19628         Safefree(ri->data);
19629     }
19630
19631     Safefree(ri);
19632 }
19633
19634 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
19635 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
19636 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
19637
19638 /*
19639    re_dup_guts - duplicate a regexp.
19640
19641    This routine is expected to clone a given regexp structure. It is only
19642    compiled under USE_ITHREADS.
19643
19644    After all of the core data stored in struct regexp is duplicated
19645    the regexp_engine.dupe method is used to copy any private data
19646    stored in the *pprivate pointer. This allows extensions to handle
19647    any duplication it needs to do.
19648
19649    See pregfree() and regfree_internal() if you change anything here.
19650 */
19651 #if defined(USE_ITHREADS)
19652 #ifndef PERL_IN_XSUB_RE
19653 void
19654 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
19655 {
19656     dVAR;
19657     I32 npar;
19658     const struct regexp *r = ReANY(sstr);
19659     struct regexp *ret = ReANY(dstr);
19660
19661     PERL_ARGS_ASSERT_RE_DUP_GUTS;
19662
19663     npar = r->nparens+1;
19664     Newx(ret->offs, npar, regexp_paren_pair);
19665     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
19666
19667     if (ret->substrs) {
19668         /* Do it this way to avoid reading from *r after the StructCopy().
19669            That way, if any of the sv_dup_inc()s dislodge *r from the L1
19670            cache, it doesn't matter.  */
19671         const bool anchored = r->check_substr
19672             ? r->check_substr == r->anchored_substr
19673             : r->check_utf8 == r->anchored_utf8;
19674         Newx(ret->substrs, 1, struct reg_substr_data);
19675         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
19676
19677         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
19678         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
19679         ret->float_substr = sv_dup_inc(ret->float_substr, param);
19680         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
19681
19682         /* check_substr and check_utf8, if non-NULL, point to either their
19683            anchored or float namesakes, and don't hold a second reference.  */
19684
19685         if (ret->check_substr) {
19686             if (anchored) {
19687                 assert(r->check_utf8 == r->anchored_utf8);
19688                 ret->check_substr = ret->anchored_substr;
19689                 ret->check_utf8 = ret->anchored_utf8;
19690             } else {
19691                 assert(r->check_substr == r->float_substr);
19692                 assert(r->check_utf8 == r->float_utf8);
19693                 ret->check_substr = ret->float_substr;
19694                 ret->check_utf8 = ret->float_utf8;
19695             }
19696         } else if (ret->check_utf8) {
19697             if (anchored) {
19698                 ret->check_utf8 = ret->anchored_utf8;
19699             } else {
19700                 ret->check_utf8 = ret->float_utf8;
19701             }
19702         }
19703     }
19704
19705     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
19706     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
19707     if (r->recurse_locinput)
19708         Newxz(ret->recurse_locinput,r->nparens + 1,char *);
19709
19710     if (ret->pprivate)
19711         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
19712
19713     if (RX_MATCH_COPIED(dstr))
19714         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
19715     else
19716         ret->subbeg = NULL;
19717 #ifdef PERL_ANY_COW
19718     ret->saved_copy = NULL;
19719 #endif
19720
19721     /* Whether mother_re be set or no, we need to copy the string.  We
19722        cannot refrain from copying it when the storage points directly to
19723        our mother regexp, because that's
19724                1: a buffer in a different thread
19725                2: something we no longer hold a reference on
19726                so we need to copy it locally.  */
19727     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
19728     ret->mother_re   = NULL;
19729 }
19730 #endif /* PERL_IN_XSUB_RE */
19731
19732 /*
19733    regdupe_internal()
19734
19735    This is the internal complement to regdupe() which is used to copy
19736    the structure pointed to by the *pprivate pointer in the regexp.
19737    This is the core version of the extension overridable cloning hook.
19738    The regexp structure being duplicated will be copied by perl prior
19739    to this and will be provided as the regexp *r argument, however
19740    with the /old/ structures pprivate pointer value. Thus this routine
19741    may override any copying normally done by perl.
19742
19743    It returns a pointer to the new regexp_internal structure.
19744 */
19745
19746 void *
19747 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
19748 {
19749     dVAR;
19750     struct regexp *const r = ReANY(rx);
19751     regexp_internal *reti;
19752     int len;
19753     RXi_GET_DECL(r,ri);
19754
19755     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
19756
19757     len = ProgLen(ri);
19758
19759     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
19760           char, regexp_internal);
19761     Copy(ri->program, reti->program, len+1, regnode);
19762
19763
19764     if (ri->code_blocks) {
19765         int n;
19766         Newx(reti->code_blocks, 1, struct reg_code_blocks);
19767         Newx(reti->code_blocks->cb, ri->code_blocks->count,
19768                     struct reg_code_block);
19769         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
19770              ri->code_blocks->count, struct reg_code_block);
19771         for (n = 0; n < ri->code_blocks->count; n++)
19772              reti->code_blocks->cb[n].src_regex = (REGEXP*)
19773                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
19774         reti->code_blocks->count = ri->code_blocks->count;
19775         reti->code_blocks->refcnt = 1;
19776     }
19777     else
19778         reti->code_blocks = NULL;
19779
19780     reti->regstclass = NULL;
19781
19782     if (ri->data) {
19783         struct reg_data *d;
19784         const int count = ri->data->count;
19785         int i;
19786
19787         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
19788                 char, struct reg_data);
19789         Newx(d->what, count, U8);
19790
19791         d->count = count;
19792         for (i = 0; i < count; i++) {
19793             d->what[i] = ri->data->what[i];
19794             switch (d->what[i]) {
19795                 /* see also regcomp.h and regfree_internal() */
19796             case 'a': /* actually an AV, but the dup function is identical.  */
19797             case 'r':
19798             case 's':
19799             case 'S':
19800             case 'u': /* actually an HV, but the dup function is identical.  */
19801                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
19802                 break;
19803             case 'f':
19804                 /* This is cheating. */
19805                 Newx(d->data[i], 1, regnode_ssc);
19806                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
19807                 reti->regstclass = (regnode*)d->data[i];
19808                 break;
19809             case 'T':
19810                 /* Trie stclasses are readonly and can thus be shared
19811                  * without duplication. We free the stclass in pregfree
19812                  * when the corresponding reg_ac_data struct is freed.
19813                  */
19814                 reti->regstclass= ri->regstclass;
19815                 /* FALLTHROUGH */
19816             case 't':
19817                 OP_REFCNT_LOCK;
19818                 ((reg_trie_data*)ri->data->data[i])->refcount++;
19819                 OP_REFCNT_UNLOCK;
19820                 /* FALLTHROUGH */
19821             case 'l':
19822             case 'L':
19823                 d->data[i] = ri->data->data[i];
19824                 break;
19825             default:
19826                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
19827                                                            ri->data->what[i]);
19828             }
19829         }
19830
19831         reti->data = d;
19832     }
19833     else
19834         reti->data = NULL;
19835
19836     reti->name_list_idx = ri->name_list_idx;
19837
19838 #ifdef RE_TRACK_PATTERN_OFFSETS
19839     if (ri->u.offsets) {
19840         Newx(reti->u.offsets, 2*len+1, U32);
19841         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
19842     }
19843 #else
19844     SetProgLen(reti,len);
19845 #endif
19846
19847     return (void*)reti;
19848 }
19849
19850 #endif    /* USE_ITHREADS */
19851
19852 #ifndef PERL_IN_XSUB_RE
19853
19854 /*
19855  - regnext - dig the "next" pointer out of a node
19856  */
19857 regnode *
19858 Perl_regnext(pTHX_ regnode *p)
19859 {
19860     I32 offset;
19861
19862     if (!p)
19863         return(NULL);
19864
19865     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
19866         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
19867                                                 (int)OP(p), (int)REGNODE_MAX);
19868     }
19869
19870     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
19871     if (offset == 0)
19872         return(NULL);
19873
19874     return(p+offset);
19875 }
19876 #endif
19877
19878 STATIC void
19879 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
19880 {
19881     va_list args;
19882     STRLEN l1 = strlen(pat1);
19883     STRLEN l2 = strlen(pat2);
19884     char buf[512];
19885     SV *msv;
19886     const char *message;
19887
19888     PERL_ARGS_ASSERT_RE_CROAK2;
19889
19890     if (l1 > 510)
19891         l1 = 510;
19892     if (l1 + l2 > 510)
19893         l2 = 510 - l1;
19894     Copy(pat1, buf, l1 , char);
19895     Copy(pat2, buf + l1, l2 , char);
19896     buf[l1 + l2] = '\n';
19897     buf[l1 + l2 + 1] = '\0';
19898     va_start(args, pat2);
19899     msv = vmess(buf, &args);
19900     va_end(args);
19901     message = SvPV_const(msv,l1);
19902     if (l1 > 512)
19903         l1 = 512;
19904     Copy(message, buf, l1 , char);
19905     /* l1-1 to avoid \n */
19906     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
19907 }
19908
19909 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
19910
19911 #ifndef PERL_IN_XSUB_RE
19912 void
19913 Perl_save_re_context(pTHX)
19914 {
19915     I32 nparens = -1;
19916     I32 i;
19917
19918     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
19919
19920     if (PL_curpm) {
19921         const REGEXP * const rx = PM_GETRE(PL_curpm);
19922         if (rx)
19923             nparens = RX_NPARENS(rx);
19924     }
19925
19926     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
19927      * that PL_curpm will be null, but that utf8.pm and the modules it
19928      * loads will only use $1..$3.
19929      * The t/porting/re_context.t test file checks this assumption.
19930      */
19931     if (nparens == -1)
19932         nparens = 3;
19933
19934     for (i = 1; i <= nparens; i++) {
19935         char digits[TYPE_CHARS(long)];
19936         const STRLEN len = my_snprintf(digits, sizeof(digits),
19937                                        "%lu", (long)i);
19938         GV *const *const gvp
19939             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
19940
19941         if (gvp) {
19942             GV * const gv = *gvp;
19943             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
19944                 save_scalar(gv);
19945         }
19946     }
19947 }
19948 #endif
19949
19950 #ifdef DEBUGGING
19951
19952 STATIC void
19953 S_put_code_point(pTHX_ SV *sv, UV c)
19954 {
19955     PERL_ARGS_ASSERT_PUT_CODE_POINT;
19956
19957     if (c > 255) {
19958         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
19959     }
19960     else if (isPRINT(c)) {
19961         const char string = (char) c;
19962
19963         /* We use {phrase} as metanotation in the class, so also escape literal
19964          * braces */
19965         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
19966             sv_catpvs(sv, "\\");
19967         sv_catpvn(sv, &string, 1);
19968     }
19969     else if (isMNEMONIC_CNTRL(c)) {
19970         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
19971     }
19972     else {
19973         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
19974     }
19975 }
19976
19977 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
19978
19979 STATIC void
19980 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
19981 {
19982     /* Appends to 'sv' a displayable version of the range of code points from
19983      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
19984      * that have them, when they occur at the beginning or end of the range.
19985      * It uses hex to output the remaining code points, unless 'allow_literals'
19986      * is true, in which case the printable ASCII ones are output as-is (though
19987      * some of these will be escaped by put_code_point()).
19988      *
19989      * NOTE:  This is designed only for printing ranges of code points that fit
19990      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
19991      */
19992
19993     const unsigned int min_range_count = 3;
19994
19995     assert(start <= end);
19996
19997     PERL_ARGS_ASSERT_PUT_RANGE;
19998
19999     while (start <= end) {
20000         UV this_end;
20001         const char * format;
20002
20003         if (end - start < min_range_count) {
20004
20005             /* Output chars individually when they occur in short ranges */
20006             for (; start <= end; start++) {
20007                 put_code_point(sv, start);
20008             }
20009             break;
20010         }
20011
20012         /* If permitted by the input options, and there is a possibility that
20013          * this range contains a printable literal, look to see if there is
20014          * one. */
20015         if (allow_literals && start <= MAX_PRINT_A) {
20016
20017             /* If the character at the beginning of the range isn't an ASCII
20018              * printable, effectively split the range into two parts:
20019              *  1) the portion before the first such printable,
20020              *  2) the rest
20021              * and output them separately. */
20022             if (! isPRINT_A(start)) {
20023                 UV temp_end = start + 1;
20024
20025                 /* There is no point looking beyond the final possible
20026                  * printable, in MAX_PRINT_A */
20027                 UV max = MIN(end, MAX_PRINT_A);
20028
20029                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
20030                     temp_end++;
20031                 }
20032
20033                 /* Here, temp_end points to one beyond the first printable if
20034                  * found, or to one beyond 'max' if not.  If none found, make
20035                  * sure that we use the entire range */
20036                 if (temp_end > MAX_PRINT_A) {
20037                     temp_end = end + 1;
20038                 }
20039
20040                 /* Output the first part of the split range: the part that
20041                  * doesn't have printables, with the parameter set to not look
20042                  * for literals (otherwise we would infinitely recurse) */
20043                 put_range(sv, start, temp_end - 1, FALSE);
20044
20045                 /* The 2nd part of the range (if any) starts here. */
20046                 start = temp_end;
20047
20048                 /* We do a continue, instead of dropping down, because even if
20049                  * the 2nd part is non-empty, it could be so short that we want
20050                  * to output it as individual characters, as tested for at the
20051                  * top of this loop.  */
20052                 continue;
20053             }
20054
20055             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
20056              * output a sub-range of just the digits or letters, then process
20057              * the remaining portion as usual. */
20058             if (isALPHANUMERIC_A(start)) {
20059                 UV mask = (isDIGIT_A(start))
20060                            ? _CC_DIGIT
20061                              : isUPPER_A(start)
20062                                ? _CC_UPPER
20063                                : _CC_LOWER;
20064                 UV temp_end = start + 1;
20065
20066                 /* Find the end of the sub-range that includes just the
20067                  * characters in the same class as the first character in it */
20068                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
20069                     temp_end++;
20070                 }
20071                 temp_end--;
20072
20073                 /* For short ranges, don't duplicate the code above to output
20074                  * them; just call recursively */
20075                 if (temp_end - start < min_range_count) {
20076                     put_range(sv, start, temp_end, FALSE);
20077                 }
20078                 else {  /* Output as a range */
20079                     put_code_point(sv, start);
20080                     sv_catpvs(sv, "-");
20081                     put_code_point(sv, temp_end);
20082                 }
20083                 start = temp_end + 1;
20084                 continue;
20085             }
20086
20087             /* We output any other printables as individual characters */
20088             if (isPUNCT_A(start) || isSPACE_A(start)) {
20089                 while (start <= end && (isPUNCT_A(start)
20090                                         || isSPACE_A(start)))
20091                 {
20092                     put_code_point(sv, start);
20093                     start++;
20094                 }
20095                 continue;
20096             }
20097         } /* End of looking for literals */
20098
20099         /* Here is not to output as a literal.  Some control characters have
20100          * mnemonic names.  Split off any of those at the beginning and end of
20101          * the range to print mnemonically.  It isn't possible for many of
20102          * these to be in a row, so this won't overwhelm with output */
20103         if (   start <= end
20104             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
20105         {
20106             while (isMNEMONIC_CNTRL(start) && start <= end) {
20107                 put_code_point(sv, start);
20108                 start++;
20109             }
20110
20111             /* If this didn't take care of the whole range ... */
20112             if (start <= end) {
20113
20114                 /* Look backwards from the end to find the final non-mnemonic
20115                  * */
20116                 UV temp_end = end;
20117                 while (isMNEMONIC_CNTRL(temp_end)) {
20118                     temp_end--;
20119                 }
20120
20121                 /* And separately output the interior range that doesn't start
20122                  * or end with mnemonics */
20123                 put_range(sv, start, temp_end, FALSE);
20124
20125                 /* Then output the mnemonic trailing controls */
20126                 start = temp_end + 1;
20127                 while (start <= end) {
20128                     put_code_point(sv, start);
20129                     start++;
20130                 }
20131                 break;
20132             }
20133         }
20134
20135         /* As a final resort, output the range or subrange as hex. */
20136
20137         this_end = (end < NUM_ANYOF_CODE_POINTS)
20138                     ? end
20139                     : NUM_ANYOF_CODE_POINTS - 1;
20140 #if NUM_ANYOF_CODE_POINTS > 256
20141         format = (this_end < 256)
20142                  ? "\\x%02" UVXf "-\\x%02" UVXf
20143                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
20144 #else
20145         format = "\\x%02" UVXf "-\\x%02" UVXf;
20146 #endif
20147         GCC_DIAG_IGNORE(-Wformat-nonliteral);
20148         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
20149         GCC_DIAG_RESTORE;
20150         break;
20151     }
20152 }
20153
20154 STATIC void
20155 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
20156 {
20157     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
20158      * 'invlist' */
20159
20160     UV start, end;
20161     bool allow_literals = TRUE;
20162
20163     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
20164
20165     /* Generally, it is more readable if printable characters are output as
20166      * literals, but if a range (nearly) spans all of them, it's best to output
20167      * it as a single range.  This code will use a single range if all but 2
20168      * ASCII printables are in it */
20169     invlist_iterinit(invlist);
20170     while (invlist_iternext(invlist, &start, &end)) {
20171
20172         /* If the range starts beyond the final printable, it doesn't have any
20173          * in it */
20174         if (start > MAX_PRINT_A) {
20175             break;
20176         }
20177
20178         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
20179          * all but two, the range must start and end no later than 2 from
20180          * either end */
20181         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
20182             if (end > MAX_PRINT_A) {
20183                 end = MAX_PRINT_A;
20184             }
20185             if (start < ' ') {
20186                 start = ' ';
20187             }
20188             if (end - start >= MAX_PRINT_A - ' ' - 2) {
20189                 allow_literals = FALSE;
20190             }
20191             break;
20192         }
20193     }
20194     invlist_iterfinish(invlist);
20195
20196     /* Here we have figured things out.  Output each range */
20197     invlist_iterinit(invlist);
20198     while (invlist_iternext(invlist, &start, &end)) {
20199         if (start >= NUM_ANYOF_CODE_POINTS) {
20200             break;
20201         }
20202         put_range(sv, start, end, allow_literals);
20203     }
20204     invlist_iterfinish(invlist);
20205
20206     return;
20207 }
20208
20209 STATIC SV*
20210 S_put_charclass_bitmap_innards_common(pTHX_
20211         SV* invlist,            /* The bitmap */
20212         SV* posixes,            /* Under /l, things like [:word:], \S */
20213         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
20214         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
20215         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
20216         const bool invert       /* Is the result to be inverted? */
20217 )
20218 {
20219     /* Create and return an SV containing a displayable version of the bitmap
20220      * and associated information determined by the input parameters.  If the
20221      * output would have been only the inversion indicator '^', NULL is instead
20222      * returned. */
20223
20224     SV * output;
20225
20226     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
20227
20228     if (invert) {
20229         output = newSVpvs("^");
20230     }
20231     else {
20232         output = newSVpvs("");
20233     }
20234
20235     /* First, the code points in the bitmap that are unconditionally there */
20236     put_charclass_bitmap_innards_invlist(output, invlist);
20237
20238     /* Traditionally, these have been placed after the main code points */
20239     if (posixes) {
20240         sv_catsv(output, posixes);
20241     }
20242
20243     if (only_utf8 && _invlist_len(only_utf8)) {
20244         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
20245         put_charclass_bitmap_innards_invlist(output, only_utf8);
20246     }
20247
20248     if (not_utf8 && _invlist_len(not_utf8)) {
20249         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
20250         put_charclass_bitmap_innards_invlist(output, not_utf8);
20251     }
20252
20253     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
20254         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
20255         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
20256
20257         /* This is the only list in this routine that can legally contain code
20258          * points outside the bitmap range.  The call just above to
20259          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
20260          * output them here.  There's about a half-dozen possible, and none in
20261          * contiguous ranges longer than 2 */
20262         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
20263             UV start, end;
20264             SV* above_bitmap = NULL;
20265
20266             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
20267
20268             invlist_iterinit(above_bitmap);
20269             while (invlist_iternext(above_bitmap, &start, &end)) {
20270                 UV i;
20271
20272                 for (i = start; i <= end; i++) {
20273                     put_code_point(output, i);
20274                 }
20275             }
20276             invlist_iterfinish(above_bitmap);
20277             SvREFCNT_dec_NN(above_bitmap);
20278         }
20279     }
20280
20281     if (invert && SvCUR(output) == 1) {
20282         return NULL;
20283     }
20284
20285     return output;
20286 }
20287
20288 STATIC bool
20289 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
20290                                      char *bitmap,
20291                                      SV *nonbitmap_invlist,
20292                                      SV *only_utf8_locale_invlist,
20293                                      const regnode * const node,
20294                                      const bool force_as_is_display)
20295 {
20296     /* Appends to 'sv' a displayable version of the innards of the bracketed
20297      * character class defined by the other arguments:
20298      *  'bitmap' points to the bitmap.
20299      *  'nonbitmap_invlist' is an inversion list of the code points that are in
20300      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
20301      *      none.  The reasons for this could be that they require some
20302      *      condition such as the target string being or not being in UTF-8
20303      *      (under /d), or because they came from a user-defined property that
20304      *      was not resolved at the time of the regex compilation (under /u)
20305      *  'only_utf8_locale_invlist' is an inversion list of the code points that
20306      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
20307      *  'node' is the regex pattern node.  It is needed only when the above two
20308      *      parameters are not null, and is passed so that this routine can
20309      *      tease apart the various reasons for them.
20310      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
20311      *      to invert things to see if that leads to a cleaner display.  If
20312      *      FALSE, this routine is free to use its judgment about doing this.
20313      *
20314      * It returns TRUE if there was actually something output.  (It may be that
20315      * the bitmap, etc is empty.)
20316      *
20317      * When called for outputting the bitmap of a non-ANYOF node, just pass the
20318      * bitmap, with the succeeding parameters set to NULL, and the final one to
20319      * FALSE.
20320      */
20321
20322     /* In general, it tries to display the 'cleanest' representation of the
20323      * innards, choosing whether to display them inverted or not, regardless of
20324      * whether the class itself is to be inverted.  However,  there are some
20325      * cases where it can't try inverting, as what actually matches isn't known
20326      * until runtime, and hence the inversion isn't either. */
20327     bool inverting_allowed = ! force_as_is_display;
20328
20329     int i;
20330     STRLEN orig_sv_cur = SvCUR(sv);
20331
20332     SV* invlist;            /* Inversion list we accumulate of code points that
20333                                are unconditionally matched */
20334     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
20335                                UTF-8 */
20336     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
20337                              */
20338     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
20339     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
20340                                        is UTF-8 */
20341
20342     SV* as_is_display;      /* The output string when we take the inputs
20343                                literally */
20344     SV* inverted_display;   /* The output string when we invert the inputs */
20345
20346     U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
20347
20348     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
20349                                                    to match? */
20350     /* We are biased in favor of displaying things without them being inverted,
20351      * as that is generally easier to understand */
20352     const int bias = 5;
20353
20354     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
20355
20356     /* Start off with whatever code points are passed in.  (We clone, so we
20357      * don't change the caller's list) */
20358     if (nonbitmap_invlist) {
20359         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
20360         invlist = invlist_clone(nonbitmap_invlist);
20361     }
20362     else {  /* Worst case size is every other code point is matched */
20363         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
20364     }
20365
20366     if (flags) {
20367         if (OP(node) == ANYOFD) {
20368
20369             /* This flag indicates that the code points below 0x100 in the
20370              * nonbitmap list are precisely the ones that match only when the
20371              * target is UTF-8 (they should all be non-ASCII). */
20372             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
20373             {
20374                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
20375                 _invlist_subtract(invlist, only_utf8, &invlist);
20376             }
20377
20378             /* And this flag for matching all non-ASCII 0xFF and below */
20379             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
20380             {
20381                 not_utf8 = invlist_clone(PL_UpperLatin1);
20382             }
20383         }
20384         else if (OP(node) == ANYOFL) {
20385
20386             /* If either of these flags are set, what matches isn't
20387              * determinable except during execution, so don't know enough here
20388              * to invert */
20389             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
20390                 inverting_allowed = FALSE;
20391             }
20392
20393             /* What the posix classes match also varies at runtime, so these
20394              * will be output symbolically. */
20395             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
20396                 int i;
20397
20398                 posixes = newSVpvs("");
20399                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
20400                     if (ANYOF_POSIXL_TEST(node,i)) {
20401                         sv_catpv(posixes, anyofs[i]);
20402                     }
20403                 }
20404             }
20405         }
20406     }
20407
20408     /* Accumulate the bit map into the unconditional match list */
20409     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
20410         if (BITMAP_TEST(bitmap, i)) {
20411             int start = i++;
20412             for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
20413                 /* empty */
20414             }
20415             invlist = _add_range_to_invlist(invlist, start, i-1);
20416         }
20417     }
20418
20419     /* Make sure that the conditional match lists don't have anything in them
20420      * that match unconditionally; otherwise the output is quite confusing.
20421      * This could happen if the code that populates these misses some
20422      * duplication. */
20423     if (only_utf8) {
20424         _invlist_subtract(only_utf8, invlist, &only_utf8);
20425     }
20426     if (not_utf8) {
20427         _invlist_subtract(not_utf8, invlist, &not_utf8);
20428     }
20429
20430     if (only_utf8_locale_invlist) {
20431
20432         /* Since this list is passed in, we have to make a copy before
20433          * modifying it */
20434         only_utf8_locale = invlist_clone(only_utf8_locale_invlist);
20435
20436         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
20437
20438         /* And, it can get really weird for us to try outputting an inverted
20439          * form of this list when it has things above the bitmap, so don't even
20440          * try */
20441         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
20442             inverting_allowed = FALSE;
20443         }
20444     }
20445
20446     /* Calculate what the output would be if we take the input as-is */
20447     as_is_display = put_charclass_bitmap_innards_common(invlist,
20448                                                     posixes,
20449                                                     only_utf8,
20450                                                     not_utf8,
20451                                                     only_utf8_locale,
20452                                                     invert);
20453
20454     /* If have to take the output as-is, just do that */
20455     if (! inverting_allowed) {
20456         if (as_is_display) {
20457             sv_catsv(sv, as_is_display);
20458             SvREFCNT_dec_NN(as_is_display);
20459         }
20460     }
20461     else { /* But otherwise, create the output again on the inverted input, and
20462               use whichever version is shorter */
20463
20464         int inverted_bias, as_is_bias;
20465
20466         /* We will apply our bias to whichever of the the results doesn't have
20467          * the '^' */
20468         if (invert) {
20469             invert = FALSE;
20470             as_is_bias = bias;
20471             inverted_bias = 0;
20472         }
20473         else {
20474             invert = TRUE;
20475             as_is_bias = 0;
20476             inverted_bias = bias;
20477         }
20478
20479         /* Now invert each of the lists that contribute to the output,
20480          * excluding from the result things outside the possible range */
20481
20482         /* For the unconditional inversion list, we have to add in all the
20483          * conditional code points, so that when inverted, they will be gone
20484          * from it */
20485         _invlist_union(only_utf8, invlist, &invlist);
20486         _invlist_union(not_utf8, invlist, &invlist);
20487         _invlist_union(only_utf8_locale, invlist, &invlist);
20488         _invlist_invert(invlist);
20489         _invlist_intersection(invlist, PL_InBitmap, &invlist);
20490
20491         if (only_utf8) {
20492             _invlist_invert(only_utf8);
20493             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
20494         }
20495         else if (not_utf8) {
20496
20497             /* If a code point matches iff the target string is not in UTF-8,
20498              * then complementing the result has it not match iff not in UTF-8,
20499              * which is the same thing as matching iff it is UTF-8. */
20500             only_utf8 = not_utf8;
20501             not_utf8 = NULL;
20502         }
20503
20504         if (only_utf8_locale) {
20505             _invlist_invert(only_utf8_locale);
20506             _invlist_intersection(only_utf8_locale,
20507                                   PL_InBitmap,
20508                                   &only_utf8_locale);
20509         }
20510
20511         inverted_display = put_charclass_bitmap_innards_common(
20512                                             invlist,
20513                                             posixes,
20514                                             only_utf8,
20515                                             not_utf8,
20516                                             only_utf8_locale, invert);
20517
20518         /* Use the shortest representation, taking into account our bias
20519          * against showing it inverted */
20520         if (   inverted_display
20521             && (   ! as_is_display
20522                 || (  SvCUR(inverted_display) + inverted_bias
20523                     < SvCUR(as_is_display)    + as_is_bias)))
20524         {
20525             sv_catsv(sv, inverted_display);
20526         }
20527         else if (as_is_display) {
20528             sv_catsv(sv, as_is_display);
20529         }
20530
20531         SvREFCNT_dec(as_is_display);
20532         SvREFCNT_dec(inverted_display);
20533     }
20534
20535     SvREFCNT_dec_NN(invlist);
20536     SvREFCNT_dec(only_utf8);
20537     SvREFCNT_dec(not_utf8);
20538     SvREFCNT_dec(posixes);
20539     SvREFCNT_dec(only_utf8_locale);
20540
20541     return SvCUR(sv) > orig_sv_cur;
20542 }
20543
20544 #define CLEAR_OPTSTART                                                       \
20545     if (optstart) STMT_START {                                               \
20546         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
20547                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
20548         optstart=NULL;                                                       \
20549     } STMT_END
20550
20551 #define DUMPUNTIL(b,e)                                                       \
20552                     CLEAR_OPTSTART;                                          \
20553                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
20554
20555 STATIC const regnode *
20556 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
20557             const regnode *last, const regnode *plast,
20558             SV* sv, I32 indent, U32 depth)
20559 {
20560     U8 op = PSEUDO;     /* Arbitrary non-END op. */
20561     const regnode *next;
20562     const regnode *optstart= NULL;
20563
20564     RXi_GET_DECL(r,ri);
20565     GET_RE_DEBUG_FLAGS_DECL;
20566
20567     PERL_ARGS_ASSERT_DUMPUNTIL;
20568
20569 #ifdef DEBUG_DUMPUNTIL
20570     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n",indent,node-start,
20571         last ? last-start : 0,plast ? plast-start : 0);
20572 #endif
20573
20574     if (plast && plast < last)
20575         last= plast;
20576
20577     while (PL_regkind[op] != END && (!last || node < last)) {
20578         assert(node);
20579         /* While that wasn't END last time... */
20580         NODE_ALIGN(node);
20581         op = OP(node);
20582         if (op == CLOSE || op == WHILEM)
20583             indent--;
20584         next = regnext((regnode *)node);
20585
20586         /* Where, what. */
20587         if (OP(node) == OPTIMIZED) {
20588             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
20589                 optstart = node;
20590             else
20591                 goto after_print;
20592         } else
20593             CLEAR_OPTSTART;
20594
20595         regprop(r, sv, node, NULL, NULL);
20596         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
20597                       (int)(2*indent + 1), "", SvPVX_const(sv));
20598
20599         if (OP(node) != OPTIMIZED) {
20600             if (next == NULL)           /* Next ptr. */
20601                 Perl_re_printf( aTHX_  " (0)");
20602             else if (PL_regkind[(U8)op] == BRANCH
20603                      && PL_regkind[OP(next)] != BRANCH )
20604                 Perl_re_printf( aTHX_  " (FAIL)");
20605             else
20606                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
20607             Perl_re_printf( aTHX_ "\n");
20608         }
20609
20610       after_print:
20611         if (PL_regkind[(U8)op] == BRANCHJ) {
20612             assert(next);
20613             {
20614                 const regnode *nnode = (OP(next) == LONGJMP
20615                                        ? regnext((regnode *)next)
20616                                        : next);
20617                 if (last && nnode > last)
20618                     nnode = last;
20619                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
20620             }
20621         }
20622         else if (PL_regkind[(U8)op] == BRANCH) {
20623             assert(next);
20624             DUMPUNTIL(NEXTOPER(node), next);
20625         }
20626         else if ( PL_regkind[(U8)op]  == TRIE ) {
20627             const regnode *this_trie = node;
20628             const char op = OP(node);
20629             const U32 n = ARG(node);
20630             const reg_ac_data * const ac = op>=AHOCORASICK ?
20631                (reg_ac_data *)ri->data->data[n] :
20632                NULL;
20633             const reg_trie_data * const trie =
20634                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
20635 #ifdef DEBUGGING
20636             AV *const trie_words
20637                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
20638 #endif
20639             const regnode *nextbranch= NULL;
20640             I32 word_idx;
20641             SvPVCLEAR(sv);
20642             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
20643                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
20644
20645                 Perl_re_indentf( aTHX_  "%s ",
20646                     indent+3,
20647                     elem_ptr
20648                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
20649                                 SvCUR(*elem_ptr), 60,
20650                                 PL_colors[0], PL_colors[1],
20651                                 (SvUTF8(*elem_ptr)
20652                                  ? PERL_PV_ESCAPE_UNI
20653                                  : 0)
20654                                 | PERL_PV_PRETTY_ELLIPSES
20655                                 | PERL_PV_PRETTY_LTGT
20656                             )
20657                     : "???"
20658                 );
20659                 if (trie->jump) {
20660                     U16 dist= trie->jump[word_idx+1];
20661                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
20662                                (UV)((dist ? this_trie + dist : next) - start));
20663                     if (dist) {
20664                         if (!nextbranch)
20665                             nextbranch= this_trie + trie->jump[0];
20666                         DUMPUNTIL(this_trie + dist, nextbranch);
20667                     }
20668                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
20669                         nextbranch= regnext((regnode *)nextbranch);
20670                 } else {
20671                     Perl_re_printf( aTHX_  "\n");
20672                 }
20673             }
20674             if (last && next > last)
20675                 node= last;
20676             else
20677                 node= next;
20678         }
20679         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
20680             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
20681                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
20682         }
20683         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
20684             assert(next);
20685             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
20686         }
20687         else if ( op == PLUS || op == STAR) {
20688             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
20689         }
20690         else if (PL_regkind[(U8)op] == ANYOF) {
20691             /* arglen 1 + class block */
20692             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
20693                           ? ANYOF_POSIXL_SKIP
20694                           : ANYOF_SKIP);
20695             node = NEXTOPER(node);
20696         }
20697         else if (PL_regkind[(U8)op] == EXACT) {
20698             /* Literal string, where present. */
20699             node += NODE_SZ_STR(node) - 1;
20700             node = NEXTOPER(node);
20701         }
20702         else {
20703             node = NEXTOPER(node);
20704             node += regarglen[(U8)op];
20705         }
20706         if (op == CURLYX || op == OPEN)
20707             indent++;
20708     }
20709     CLEAR_OPTSTART;
20710 #ifdef DEBUG_DUMPUNTIL
20711     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
20712 #endif
20713     return node;
20714 }
20715
20716 #endif  /* DEBUGGING */
20717
20718 /*
20719  * ex: set ts=8 sts=4 sw=4 et:
20720  */