regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_inline.h"
  90 #include "invlist_inline.h"
  91 #include "unicode_constants.h"
  92
  93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  94  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  96  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99
 100 #ifndef STATIC
 101 #define STATIC  static
 102 #endif
 103
 104 /* this is a chain of data about sub patterns we are processing that
 105    need to be handled separately/specially in study_chunk. Its so
 106    we can simulate recursion without losing state.  */
 107 struct scan_frame;
 108 typedef struct scan_frame {
 109     regnode *last_regnode;      /* last node to process in this frame */
 110     regnode *next_regnode;      /* next node to process when last is reached */
 111     U32 prev_recursed_depth;
 112     I32 stopparen;              /* what stopparen do we use */
 113
 114     struct scan_frame *this_prev_frame; /* this previous frame */
 115     struct scan_frame *prev_frame;      /* previous frame */
 116     struct scan_frame *next_frame;      /* next frame */
 117 } scan_frame;
 118
 119 /* Certain characters are output as a sequence with the first being a
 120  * backslash. */
 121 #define isBACKSLASHED_PUNCT(c)  strchr("-[]\\^", c)
 122
 123
 124 struct RExC_state_t {
 125     U32         flags;                  /* RXf_* are we folding, multilining? */
 126     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 127     char        *precomp;               /* uncompiled string. */
 128     char        *precomp_end;           /* pointer to end of uncompiled string. */
 129     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 130     regexp      *rx;                    /* perl core regexp structure */
 131     regexp_internal     *rxi;           /* internal data for regexp object
 132                                            pprivate field */
 133     char        *start;                 /* Start of input for compile */
 134     char        *end;                   /* End of input for compile */
 135     char        *parse;                 /* Input-scan pointer. */
 136     char        *adjusted_start;        /* 'start', adjusted.  See code use */
 137     STRLEN      precomp_adj;            /* an offset beyond precomp.  See code use */
 138     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 139     regnode     *emit_start;            /* Start of emitted-code area */
 140     regnode     *emit_bound;            /* First regnode outside of the
 141                                            allocated space */
 142     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 143                                            implies compiling, so don't emit */
 144     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 145                                            large enough for the largest
 146                                            non-EXACTish node, so can use it as
 147                                            scratch in pass1 */
 148     I32         naughty;                /* How bad is this pattern? */
 149     I32         sawback;                /* Did we see \1, ...? */
 150     U32         seen;
 151     SSize_t     size;                   /* Code size. */
 152     I32         npar;                   /* Capture buffer count, (OPEN) plus
 153                                            one. ("par" 0 is the whole
 154                                            pattern)*/
 155     I32         nestroot;               /* root parens we are in - used by
 156                                            accept */
 157     I32         extralen;
 158     I32         seen_zerolen;
 159     regnode     **open_parens;          /* pointers to open parens */
 160     regnode     **close_parens;         /* pointers to close parens */
 161     regnode     *end_op;                /* END node in program */
 162     I32         utf8;           /* whether the pattern is utf8 or not */
 163     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 164                                 /* XXX use this for future optimisation of case
 165                                  * where pattern must be upgraded to utf8. */
 166     I32         uni_semantics;  /* If a d charset modifier should use unicode
 167                                    rules, even if the pattern is not in
 168                                    utf8 */
 169     HV          *paren_names;           /* Paren names */
 170
 171     regnode     **recurse;              /* Recurse regops */
 172     I32                recurse_count;                /* Number of recurse regops we have generated */
 173     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 174                                            through */
 175     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 176     I32         in_lookbehind;
 177     I32         contains_locale;
 178     I32         override_recoding;
 179 #ifdef EBCDIC
 180     I32         recode_x_to_native;
 181 #endif
 182     I32         in_multi_char_class;
 183     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
 184                                             within pattern */
 185     int         code_index;             /* next code_blocks[] slot */
 186     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 187     scan_frame *frame_head;
 188     scan_frame *frame_last;
 189     U32         frame_count;
 190     AV         *warn_text;
 191 #ifdef ADD_TO_REGEXEC
 192     char        *starttry;              /* -Dr: where regtry was called. */
 193 #define RExC_starttry   (pRExC_state->starttry)
 194 #endif
 195     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 196 #ifdef DEBUGGING
 197     const char  *lastparse;
 198     I32         lastnum;
 199     AV          *paren_name_list;       /* idx -> name */
 200     U32         study_chunk_recursed_count;
 201     SV          *mysv1;
 202     SV          *mysv2;
 203 #define RExC_lastparse  (pRExC_state->lastparse)
 204 #define RExC_lastnum    (pRExC_state->lastnum)
 205 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 206 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 207 #define RExC_mysv       (pRExC_state->mysv1)
 208 #define RExC_mysv1      (pRExC_state->mysv1)
 209 #define RExC_mysv2      (pRExC_state->mysv2)
 210
 211 #endif
 212     bool        seen_unfolded_sharp_s;
 213     bool        strict;
 214     bool        study_started;
 215     bool        in_script_run;
 216 };
 217
 218 #define RExC_flags      (pRExC_state->flags)
 219 #define RExC_pm_flags   (pRExC_state->pm_flags)
 220 #define RExC_precomp    (pRExC_state->precomp)
 221 #define RExC_precomp_adj (pRExC_state->precomp_adj)
 222 #define RExC_adjusted_start  (pRExC_state->adjusted_start)
 223 #define RExC_precomp_end (pRExC_state->precomp_end)
 224 #define RExC_rx_sv      (pRExC_state->rx_sv)
 225 #define RExC_rx         (pRExC_state->rx)
 226 #define RExC_rxi        (pRExC_state->rxi)
 227 #define RExC_start      (pRExC_state->start)
 228 #define RExC_end        (pRExC_state->end)
 229 #define RExC_parse      (pRExC_state->parse)
 230 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 231
 232 /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
 233  * EXACTF node, hence was parsed under /di rules.  If later in the parse,
 234  * something forces the pattern into using /ui rules, the sharp s should be
 235  * folded into the sequence 'ss', which takes up more space than previously
 236  * calculated.  This means that the sizing pass needs to be restarted.  (The
 237  * node also becomes an EXACTFU_SS.)  For all other characters, an EXACTF node
 238  * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
 239  * so there is no need to resize [perl #125990]. */
 240 #define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
 241
 242 #ifdef RE_TRACK_PATTERN_OFFSETS
 243 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 244                                                          others */
 245 #endif
 246 #define RExC_emit       (pRExC_state->emit)
 247 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 248 #define RExC_emit_start (pRExC_state->emit_start)
 249 #define RExC_emit_bound (pRExC_state->emit_bound)
 250 #define RExC_sawback    (pRExC_state->sawback)
 251 #define RExC_seen       (pRExC_state->seen)
 252 #define RExC_size       (pRExC_state->size)
 253 #define RExC_maxlen        (pRExC_state->maxlen)
 254 #define RExC_npar       (pRExC_state->npar)
 255 #define RExC_nestroot   (pRExC_state->nestroot)
 256 #define RExC_extralen   (pRExC_state->extralen)
 257 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 258 #define RExC_utf8       (pRExC_state->utf8)
 259 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 260 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 261 #define RExC_open_parens        (pRExC_state->open_parens)
 262 #define RExC_close_parens       (pRExC_state->close_parens)
 263 #define RExC_end_op     (pRExC_state->end_op)
 264 #define RExC_paren_names        (pRExC_state->paren_names)
 265 #define RExC_recurse    (pRExC_state->recurse)
 266 #define RExC_recurse_count      (pRExC_state->recurse_count)
 267 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 268 #define RExC_study_chunk_recursed_bytes  \
 269                                    (pRExC_state->study_chunk_recursed_bytes)
 270 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 271 #define RExC_contains_locale    (pRExC_state->contains_locale)
 272 #ifdef EBCDIC
 273 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 274 #endif
 275 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 276 #define RExC_frame_head (pRExC_state->frame_head)
 277 #define RExC_frame_last (pRExC_state->frame_last)
 278 #define RExC_frame_count (pRExC_state->frame_count)
 279 #define RExC_strict (pRExC_state->strict)
 280 #define RExC_study_started      (pRExC_state->study_started)
 281 #define RExC_warn_text (pRExC_state->warn_text)
 282 #define RExC_in_script_run      (pRExC_state->in_script_run)
 283
 284 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 285  * a flag to disable back-off on the fixed/floating substrings - if it's
 286  * a high complexity pattern we assume the benefit of avoiding a full match
 287  * is worth the cost of checking for the substrings even if they rarely help.
 288  */
 289 #define RExC_naughty    (pRExC_state->naughty)
 290 #define TOO_NAUGHTY (10)
 291 #define MARK_NAUGHTY(add) \
 292     if (RExC_naughty < TOO_NAUGHTY) \
 293         RExC_naughty += (add)
 294 #define MARK_NAUGHTY_EXP(exp, add) \
 295     if (RExC_naughty < TOO_NAUGHTY) \
 296         RExC_naughty += RExC_naughty / (exp) + (add)
 297
 298 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 299 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 300         ((*s) == '{' && regcurly(s)))
 301
 302 /*
 303  * Flags to be passed up and down.
 304  */
 305 #define WORST           0       /* Worst case. */
 306 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 307
 308 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 309  * character.  (There needs to be a case: in the switch statement in regexec.c
 310  * for any node marked SIMPLE.)  Note that this is not the same thing as
 311  * REGNODE_SIMPLE */
 312 #define SIMPLE          0x02
 313 #define SPSTART         0x04    /* Starts with * or + */
 314 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 315 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 316 #define RESTART_PASS1   0x20    /* Need to restart sizing pass */
 317 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PASS1, need to
 318                                    calcuate sizes as UTF-8 */
 319
 320 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 321
 322 /* whether trie related optimizations are enabled */
 323 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 324 #define TRIE_STUDY_OPT
 325 #define FULL_TRIE_STUDY
 326 #define TRIE_STCLASS
 327 #endif
 328
 329
 330
 331 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 332 #define PBITVAL(paren) (1 << ((paren) & 7))
 333 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 334 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 335 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 336
 337 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 338                                      if (!UTF) {                           \
 339                                          assert(PASS1);                    \
 340                                          *flagp = RESTART_PASS1|NEED_UTF8; \
 341                                          return NULL;                      \
 342                                      }                                     \
 343                              } STMT_END
 344
 345 /* Change from /d into /u rules, and restart the parse if we've already seen
 346  * something whose size would increase as a result, by setting *flagp and
 347  * returning 'restart_retval'.  RExC_uni_semantics is a flag that indicates
 348  * we've change to /u during the parse.  */
 349 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 350     STMT_START {                                                            \
 351             if (DEPENDS_SEMANTICS) {                                        \
 352                 assert(PASS1);                                              \
 353                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 354                 RExC_uni_semantics = 1;                                     \
 355                 if (RExC_seen_unfolded_sharp_s) {                           \
 356                     *flagp |= RESTART_PASS1;                                \
 357                     return restart_retval;                                  \
 358                 }                                                           \
 359             }                                                               \
 360     } STMT_END
 361
 362 /* This converts the named class defined in regcomp.h to its equivalent class
 363  * number defined in handy.h. */
 364 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 365 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 366
 367 #define _invlist_union_complement_2nd(a, b, output) \
 368                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 369 #define _invlist_intersection_complement_2nd(a, b, output) \
 370                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 371
 372 /* About scan_data_t.
 373
 374   During optimisation we recurse through the regexp program performing
 375   various inplace (keyhole style) optimisations. In addition study_chunk
 376   and scan_commit populate this data structure with information about
 377   what strings MUST appear in the pattern. We look for the longest
 378   string that must appear at a fixed location, and we look for the
 379   longest string that may appear at a floating location. So for instance
 380   in the pattern:
 381
 382     /FOO[xX]A.*B[xX]BAR/
 383
 384   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 385   strings (because they follow a .* construct). study_chunk will identify
 386   both FOO and BAR as being the longest fixed and floating strings respectively.
 387
 388   The strings can be composites, for instance
 389
 390      /(f)(o)(o)/
 391
 392   will result in a composite fixed substring 'foo'.
 393
 394   For each string some basic information is maintained:
 395
 396   - min_offset
 397     This is the position the string must appear at, or not before.
 398     It also implicitly (when combined with minlenp) tells us how many
 399     characters must match before the string we are searching for.
 400     Likewise when combined with minlenp and the length of the string it
 401     tells us how many characters must appear after the string we have
 402     found.
 403
 404   - max_offset
 405     Only used for floating strings. This is the rightmost point that
 406     the string can appear at. If set to SSize_t_MAX it indicates that the
 407     string can occur infinitely far to the right.
 408     For fixed strings, it is equal to min_offset.
 409
 410   - minlenp
 411     A pointer to the minimum number of characters of the pattern that the
 412     string was found inside. This is important as in the case of positive
 413     lookahead or positive lookbehind we can have multiple patterns
 414     involved. Consider
 415
 416     /(?=FOO).*F/
 417
 418     The minimum length of the pattern overall is 3, the minimum length
 419     of the lookahead part is 3, but the minimum length of the part that
 420     will actually match is 1. So 'FOO's minimum length is 3, but the
 421     minimum length for the F is 1. This is important as the minimum length
 422     is used to determine offsets in front of and behind the string being
 423     looked for.  Since strings can be composites this is the length of the
 424     pattern at the time it was committed with a scan_commit. Note that
 425     the length is calculated by study_chunk, so that the minimum lengths
 426     are not known until the full pattern has been compiled, thus the
 427     pointer to the value.
 428
 429   - lookbehind
 430
 431     In the case of lookbehind the string being searched for can be
 432     offset past the start point of the final matching string.
 433     If this value was just blithely removed from the min_offset it would
 434     invalidate some of the calculations for how many chars must match
 435     before or after (as they are derived from min_offset and minlen and
 436     the length of the string being searched for).
 437     When the final pattern is compiled and the data is moved from the
 438     scan_data_t structure into the regexp structure the information
 439     about lookbehind is factored in, with the information that would
 440     have been lost precalculated in the end_shift field for the
 441     associated string.
 442
 443   The fields pos_min and pos_delta are used to store the minimum offset
 444   and the delta to the maximum offset at the current point in the pattern.
 445
 446 */
 447
 448 struct scan_data_substrs {
 449     SV      *str;       /* longest substring found in pattern */
 450     SSize_t min_offset; /* earliest point in string it can appear */
 451     SSize_t max_offset; /* latest point in string it can appear */
 452     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
 453     SSize_t lookbehind; /* is the pos of the string modified by LB */
 454     I32 flags;          /* per substring SF_* and SCF_* flags */
 455 };
 456
 457 typedef struct scan_data_t {
 458     /*I32 len_min;      unused */
 459     /*I32 len_delta;    unused */
 460     SSize_t pos_min;
 461     SSize_t pos_delta;
 462     SV *last_found;
 463     SSize_t last_end;       /* min value, <0 unless valid. */
 464     SSize_t last_start_min;
 465     SSize_t last_start_max;
 466     U8      cur_is_floating; /* whether the last_* values should be set as
 467                               * the next fixed (0) or floating (1)
 468                               * substring */
 469
 470     /* [0] is longest fixed substring so far, [1] is longest float so far */
 471     struct scan_data_substrs  substrs[2];
 472
 473     I32 flags;             /* common SF_* and SCF_* flags */
 474     I32 whilem_c;
 475     SSize_t *last_closep;
 476     regnode_ssc *start_class;
 477 } scan_data_t;
 478
 479 /*
 480  * Forward declarations for pregcomp()'s friends.
 481  */
 482
 483 static const scan_data_t zero_scan_data = {
 484     0, 0, NULL, 0, 0, 0, 0,
 485     {
 486         { NULL, 0, 0, 0, 0, 0 },
 487         { NULL, 0, 0, 0, 0, 0 },
 488     },
 489     0, 0, NULL, NULL
 490 };
 491
 492 /* study flags */
 493
 494 #define SF_BEFORE_SEOL          0x0001
 495 #define SF_BEFORE_MEOL          0x0002
 496 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 497
 498 #define SF_IS_INF               0x0040
 499 #define SF_HAS_PAR              0x0080
 500 #define SF_IN_PAR               0x0100
 501 #define SF_HAS_EVAL             0x0200
 502
 503
 504 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 505  * longest substring in the pattern. When it is not set the optimiser keeps
 506  * track of position, but does not keep track of the actual strings seen,
 507  *
 508  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 509  * /foo/i will not.
 510  *
 511  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 512  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 513  * turned off because of the alternation (BRANCH). */
 514 #define SCF_DO_SUBSTR           0x0400
 515
 516 #define SCF_DO_STCLASS_AND      0x0800
 517 #define SCF_DO_STCLASS_OR       0x1000
 518 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 519 #define SCF_WHILEM_VISITED_POS  0x2000
 520
 521 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 522 #define SCF_SEEN_ACCEPT         0x8000
 523 #define SCF_TRIE_DOING_RESTUDY 0x10000
 524 #define SCF_IN_DEFINE          0x20000
 525
 526
 527
 528
 529 #define UTF cBOOL(RExC_utf8)
 530
 531 /* The enums for all these are ordered so things work out correctly */
 532 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 533 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 534                                                      == REGEX_DEPENDS_CHARSET)
 535 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 536 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 537                                                      >= REGEX_UNICODE_CHARSET)
 538 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 539                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 540 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 541                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 542 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 543                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 544
 545 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 546
 547 /* For programs that want to be strictly Unicode compatible by dying if any
 548  * attempt is made to match a non-Unicode code point against a Unicode
 549  * property.  */
 550 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 551
 552 #define OOB_NAMEDCLASS          -1
 553
 554 /* There is no code point that is out-of-bounds, so this is problematic.  But
 555  * its only current use is to initialize a variable that is always set before
 556  * looked at. */
 557 #define OOB_UNICODE             0xDEADBEEF
 558
 559 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 560
 561
 562 /* length of regex to show in messages that don't mark a position within */
 563 #define RegexLengthToShowInErrorMessages 127
 564
 565 /*
 566  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 567  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 568  * op/pragma/warn/regcomp.
 569  */
 570 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 571 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 572
 573 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 574                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
 575
 576 /* The code in this file in places uses one level of recursion with parsing
 577  * rebased to an alternate string constructed by us in memory.  This can take
 578  * the form of something that is completely different from the input, or
 579  * something that uses the input as part of the alternate.  In the first case,
 580  * there should be no possibility of an error, as we are in complete control of
 581  * the alternate string.  But in the second case we don't control the input
 582  * portion, so there may be errors in that.  Here's an example:
 583  *      /[abc\x{DF}def]/ui
 584  * is handled specially because \x{df} folds to a sequence of more than one
 585  * character, 'ss'.  What is done is to create and parse an alternate string,
 586  * which looks like this:
 587  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 588  * where it uses the input unchanged in the middle of something it constructs,
 589  * which is a branch for the DF outside the character class, and clustering
 590  * parens around the whole thing. (It knows enough to skip the DF inside the
 591  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 592  * need to be reported.  The general situation looks like this:
 593  *
 594  *              sI                       tI               xI       eI
 595  * Input:       ----------------------------------------------------
 596  * Constructed:         ---------------------------------------------------
 597  *                      sC               tC               xC       eC     EC
 598  *
 599  * The input string sI..eI is the input pattern.  The string sC..EC is the
 600  * constructed substitute parse string.  The portions sC..tC and eC..EC are
 601  * constructed by us.  The portion tC..eC is an exact duplicate of the input
 602  * pattern tI..eI.  In the diagram, these are vertically aligned.  Suppose that
 603  * while parsing, we find an error at xC.  We want to display a message showing
 604  * the real input string.  Thus we need to find the point xI in it which
 605  * corresponds to xC.  xC >= tC, since the portion of the string sC..tC has
 606  * been constructed by us, and so shouldn't have errors.  We get:
 607  *
 608  *      xI = sI + (tI - sI) + (xC - tC)
 609  *
 610  * and, the offset into sI is:
 611  *
 612  *      (xI - sI) = (tI - sI) + (xC - tC)
 613  *
 614  * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
 615  * and we save tC as RExC_adjusted_start.
 616  *
 617  * During normal processing of the input pattern, everything points to that,
 618  * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
 619  */
 620
 621 #define tI_sI           RExC_precomp_adj
 622 #define tC              RExC_adjusted_start
 623 #define sC              RExC_precomp
 624 #define xI_offset(xC)   ((IV) (tI_sI + (xC - tC)))
 625 #define xI(xC)          (sC + xI_offset(xC))
 626 #define eC              RExC_precomp_end
 627
 628 #define REPORT_LOCATION_ARGS(xC)                                            \
 629     UTF8fARG(UTF,                                                           \
 630              (xI(xC) > eC) /* Don't run off end */                          \
 631               ? eC - sC   /* Length before the <--HERE */                   \
 632               : ( __ASSERT_(xI_offset(xC) >= 0) xI_offset(xC) ),            \
 633              sC),         /* The input pattern printed up to the <--HERE */ \
 634     UTF8fARG(UTF,                                                           \
 635              (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */    \
 636              (xI(xC) > eC) ? eC : xI(xC))     /* pattern after <--HERE */
 637
 638 /* Used to point after bad bytes for an error message, but avoid skipping
 639  * past a nul byte. */
 640 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
 641
 642 /*
 643  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 644  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 645  * "...".
 646  */
 647 #define _FAIL(code) STMT_START {                                        \
 648     const char *ellipses = "";                                          \
 649     IV len = RExC_precomp_end - RExC_precomp;                                   \
 650                                                                         \
 651     if (!SIZE_ONLY)                                                     \
 652         SAVEFREESV(RExC_rx_sv);                                         \
 653     if (len > RegexLengthToShowInErrorMessages) {                       \
 654         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 655         len = RegexLengthToShowInErrorMessages - 10;                    \
 656         ellipses = "...";                                               \
 657     }                                                                   \
 658     code;                                                               \
 659 } STMT_END
 660
 661 #define FAIL(msg) _FAIL(                            \
 662     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
 663             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 664
 665 #define FAIL2(msg,arg) _FAIL(                       \
 666     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 667             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 668
 669 /*
 670  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 671  */
 672 #define Simple_vFAIL(m) STMT_START {                                    \
 673     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 674             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 675 } STMT_END
 676
 677 /*
 678  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 679  */
 680 #define vFAIL(m) STMT_START {                           \
 681     if (!SIZE_ONLY)                                     \
 682         SAVEFREESV(RExC_rx_sv);                         \
 683     Simple_vFAIL(m);                                    \
 684 } STMT_END
 685
 686 /*
 687  * Like Simple_vFAIL(), but accepts two arguments.
 688  */
 689 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 690     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,              \
 691                       REPORT_LOCATION_ARGS(RExC_parse));        \
 692 } STMT_END
 693
 694 /*
 695  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 696  */
 697 #define vFAIL2(m,a1) STMT_START {                       \
 698     if (!SIZE_ONLY)                                     \
 699         SAVEFREESV(RExC_rx_sv);                         \
 700     Simple_vFAIL2(m, a1);                               \
 701 } STMT_END
 702
 703
 704 /*
 705  * Like Simple_vFAIL(), but accepts three arguments.
 706  */
 707 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 708     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 709             REPORT_LOCATION_ARGS(RExC_parse));                  \
 710 } STMT_END
 711
 712 /*
 713  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 714  */
 715 #define vFAIL3(m,a1,a2) STMT_START {                    \
 716     if (!SIZE_ONLY)                                     \
 717         SAVEFREESV(RExC_rx_sv);                         \
 718     Simple_vFAIL3(m, a1, a2);                           \
 719 } STMT_END
 720
 721 /*
 722  * Like Simple_vFAIL(), but accepts four arguments.
 723  */
 724 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 725     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,      \
 726             REPORT_LOCATION_ARGS(RExC_parse));                  \
 727 } STMT_END
 728
 729 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 730     if (!SIZE_ONLY)                                     \
 731         SAVEFREESV(RExC_rx_sv);                         \
 732     Simple_vFAIL4(m, a1, a2, a3);                       \
 733 } STMT_END
 734
 735 /* A specialized version of vFAIL2 that works with UTF8f */
 736 #define vFAIL2utf8f(m, a1) STMT_START {             \
 737     if (!SIZE_ONLY)                                 \
 738         SAVEFREESV(RExC_rx_sv);                     \
 739     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
 740             REPORT_LOCATION_ARGS(RExC_parse));      \
 741 } STMT_END
 742
 743 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 744     if (!SIZE_ONLY)                                     \
 745         SAVEFREESV(RExC_rx_sv);                         \
 746     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 747             REPORT_LOCATION_ARGS(RExC_parse));          \
 748 } STMT_END
 749
 750 /* These have asserts in them because of [perl #122671] Many warnings in
 751  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 752  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 753  * would get output again.  So they should be output in pass2, and these
 754  * asserts make sure new warnings follow that paradigm. */
 755
 756 /* m is not necessarily a "literal string", in this macro */
 757 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 758     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 759                                        "%s" REPORT_LOCATION,            \
 760                                   m, REPORT_LOCATION_ARGS(loc));        \
 761 } STMT_END
 762
 763 #define ckWARNreg(loc,m) STMT_START {                                   \
 764     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),        \
 765                                           m REPORT_LOCATION,            \
 766                                           REPORT_LOCATION_ARGS(loc));   \
 767 } STMT_END
 768
 769 #define vWARN(loc, m) STMT_START {                                      \
 770     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 771                                        m REPORT_LOCATION,               \
 772                                        REPORT_LOCATION_ARGS(loc));      \
 773 } STMT_END
 774
 775 #define vWARN_dep(loc, m) STMT_START {                                  \
 776     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),       \
 777                                        m REPORT_LOCATION,               \
 778                                        REPORT_LOCATION_ARGS(loc));      \
 779 } STMT_END
 780
 781 #define ckWARNdep(loc,m) STMT_START {                                   \
 782     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),  \
 783                                             m REPORT_LOCATION,          \
 784                                             REPORT_LOCATION_ARGS(loc)); \
 785 } STMT_END
 786
 787 #define ckWARNregdep(loc,m) STMT_START {                                    \
 788     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,      \
 789                                                       WARN_REGEXP),         \
 790                                              m REPORT_LOCATION,             \
 791                                              REPORT_LOCATION_ARGS(loc));    \
 792 } STMT_END
 793
 794 #define ckWARN2reg_d(loc,m, a1) STMT_START {                                \
 795     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),          \
 796                                             m REPORT_LOCATION,              \
 797                                             a1, REPORT_LOCATION_ARGS(loc)); \
 798 } STMT_END
 799
 800 #define ckWARN2reg(loc, m, a1) STMT_START {                                 \
 801     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
 802                                           m REPORT_LOCATION,                \
 803                                           a1, REPORT_LOCATION_ARGS(loc));   \
 804 } STMT_END
 805
 806 #define vWARN3(loc, m, a1, a2) STMT_START {                                 \
 807     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),               \
 808                                        m REPORT_LOCATION,                   \
 809                                        a1, a2, REPORT_LOCATION_ARGS(loc));  \
 810 } STMT_END
 811
 812 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                             \
 813     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
 814                                           m REPORT_LOCATION,                \
 815                                           a1, a2,                           \
 816                                           REPORT_LOCATION_ARGS(loc));       \
 817 } STMT_END
 818
 819 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 820     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 821                                        m REPORT_LOCATION,               \
 822                                        a1, a2, a3,                      \
 823                                        REPORT_LOCATION_ARGS(loc));      \
 824 } STMT_END
 825
 826 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 827     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),        \
 828                                           m REPORT_LOCATION,            \
 829                                           a1, a2, a3,                   \
 830                                           REPORT_LOCATION_ARGS(loc));   \
 831 } STMT_END
 832
 833 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 834     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 835                                        m REPORT_LOCATION,               \
 836                                        a1, a2, a3, a4,                  \
 837                                        REPORT_LOCATION_ARGS(loc));      \
 838 } STMT_END
 839
 840 /* Macros for recording node offsets.   20001227 mjd@plover.com
 841  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 842  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 843  * Element 0 holds the number n.
 844  * Position is 1 indexed.
 845  */
 846 #ifndef RE_TRACK_PATTERN_OFFSETS
 847 #define Set_Node_Offset_To_R(node,byte)
 848 #define Set_Node_Offset(node,byte)
 849 #define Set_Cur_Node_Offset
 850 #define Set_Node_Length_To_R(node,len)
 851 #define Set_Node_Length(node,len)
 852 #define Set_Node_Cur_Length(node,start)
 853 #define Node_Offset(n)
 854 #define Node_Length(n)
 855 #define Set_Node_Offset_Length(node,offset,len)
 856 #define ProgLen(ri) ri->u.proglen
 857 #define SetProgLen(ri,x) ri->u.proglen = x
 858 #else
 859 #define ProgLen(ri) ri->u.offsets[0]
 860 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 861 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 862     if (! SIZE_ONLY) {                                                  \
 863         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 864                     __LINE__, (int)(node), (int)(byte)));               \
 865         if((node) < 0) {                                                \
 866             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 867                                          (int)(node));                  \
 868         } else {                                                        \
 869             RExC_offsets[2*(node)-1] = (byte);                          \
 870         }                                                               \
 871     }                                                                   \
 872 } STMT_END
 873
 874 #define Set_Node_Offset(node,byte) \
 875     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 876 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 877
 878 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 879     if (! SIZE_ONLY) {                                                  \
 880         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 881                 __LINE__, (int)(node), (int)(len)));                    \
 882         if((node) < 0) {                                                \
 883             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 884                                          (int)(node));                  \
 885         } else {                                                        \
 886             RExC_offsets[2*(node)] = (len);                             \
 887         }                                                               \
 888     }                                                                   \
 889 } STMT_END
 890
 891 #define Set_Node_Length(node,len) \
 892     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 893 #define Set_Node_Cur_Length(node, start)                \
 894     Set_Node_Length(node, RExC_parse - start)
 895
 896 /* Get offsets and lengths */
 897 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 898 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 899
 900 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 901     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 902     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 903 } STMT_END
 904 #endif
 905
 906 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 907 #define EXPERIMENTAL_INPLACESCAN
 908 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 909
 910 #ifdef DEBUGGING
 911 int
 912 Perl_re_printf(pTHX_ const char *fmt, ...)
 913 {
 914     va_list ap;
 915     int result;
 916     PerlIO *f= Perl_debug_log;
 917     PERL_ARGS_ASSERT_RE_PRINTF;
 918     va_start(ap, fmt);
 919     result = PerlIO_vprintf(f, fmt, ap);
 920     va_end(ap);
 921     return result;
 922 }
 923
 924 int
 925 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
 926 {
 927     va_list ap;
 928     int result;
 929     PerlIO *f= Perl_debug_log;
 930     PERL_ARGS_ASSERT_RE_INDENTF;
 931     va_start(ap, depth);
 932     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
 933     result = PerlIO_vprintf(f, fmt, ap);
 934     va_end(ap);
 935     return result;
 936 }
 937 #endif /* DEBUGGING */
 938
 939 #define DEBUG_RExC_seen()                                                   \
 940         DEBUG_OPTIMISE_MORE_r({                                             \
 941             Perl_re_printf( aTHX_ "RExC_seen: ");                                       \
 942                                                                             \
 943             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 944                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                            \
 945                                                                             \
 946             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 947                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");                          \
 948                                                                             \
 949             if (RExC_seen & REG_GPOS_SEEN)                                  \
 950                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                                \
 951                                                                             \
 952             if (RExC_seen & REG_RECURSE_SEEN)                               \
 953                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                             \
 954                                                                             \
 955             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
 956                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");                  \
 957                                                                             \
 958             if (RExC_seen & REG_VERBARG_SEEN)                               \
 959                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                             \
 960                                                                             \
 961             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 962                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                            \
 963                                                                             \
 964             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 965                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");                      \
 966                                                                             \
 967             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 968                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");                      \
 969                                                                             \
 970             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
 971                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");                \
 972                                                                             \
 973             Perl_re_printf( aTHX_ "\n");                                                \
 974         });
 975
 976 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 977   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
 978
 979
 980 #ifdef DEBUGGING
 981 static void
 982 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
 983                                     const char *close_str)
 984 {
 985     if (!flags)
 986         return;
 987
 988     Perl_re_printf( aTHX_  "%s", open_str);
 989     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
 990     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
 991     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
 992     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
 993     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
 994     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
 995     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
 996     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
 997     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
 998     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
 999     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1000     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1001     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1002     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1003     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1004     Perl_re_printf( aTHX_  "%s", close_str);
1005 }
1006
1007
1008 static void
1009 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1010                     U32 depth, int is_inf)
1011 {
1012     GET_RE_DEBUG_FLAGS_DECL;
1013
1014     DEBUG_OPTIMISE_MORE_r({
1015         if (!data)
1016             return;
1017         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1018             depth,
1019             where,
1020             (IV)data->pos_min,
1021             (IV)data->pos_delta,
1022             (UV)data->flags
1023         );
1024
1025         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1026
1027         Perl_re_printf( aTHX_
1028             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1029             (IV)data->whilem_c,
1030             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1031             is_inf ? "INF " : ""
1032         );
1033
1034         if (data->last_found) {
1035             int i;
1036             Perl_re_printf(aTHX_
1037                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1038                     SvPVX_const(data->last_found),
1039                     (IV)data->last_end,
1040                     (IV)data->last_start_min,
1041                     (IV)data->last_start_max
1042             );
1043
1044             for (i = 0; i < 2; i++) {
1045                 Perl_re_printf(aTHX_
1046                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1047                     data->cur_is_floating == i ? "*" : "",
1048                     i ? "Float" : "Fixed",
1049                     SvPVX_const(data->substrs[i].str),
1050                     (IV)data->substrs[i].min_offset,
1051                     (IV)data->substrs[i].max_offset
1052                 );
1053                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1054             }
1055         }
1056
1057         Perl_re_printf( aTHX_ "\n");
1058     });
1059 }
1060
1061
1062 static void
1063 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1064                 regnode *scan, U32 depth, U32 flags)
1065 {
1066     GET_RE_DEBUG_FLAGS_DECL;
1067
1068     DEBUG_OPTIMISE_r({
1069         regnode *Next;
1070
1071         if (!scan)
1072             return;
1073         Next = regnext(scan);
1074         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1075         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1076             depth,
1077             str,
1078             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1079             Next ? (REG_NODE_NUM(Next)) : 0 );
1080         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1081         Perl_re_printf( aTHX_  "\n");
1082    });
1083 }
1084
1085
1086 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1087                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1088
1089 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1090                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1091
1092 #else
1093 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1094 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1095 #endif
1096
1097
1098 /* =========================================================
1099  * BEGIN edit_distance stuff.
1100  *
1101  * This calculates how many single character changes of any type are needed to
1102  * transform a string into another one.  It is taken from version 3.1 of
1103  *
1104  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1105  */
1106
1107 /* Our unsorted dictionary linked list.   */
1108 /* Note we use UVs, not chars. */
1109
1110 struct dictionary{
1111   UV key;
1112   UV value;
1113   struct dictionary* next;
1114 };
1115 typedef struct dictionary item;
1116
1117
1118 PERL_STATIC_INLINE item*
1119 push(UV key,item* curr)
1120 {
1121     item* head;
1122     Newx(head, 1, item);
1123     head->key = key;
1124     head->value = 0;
1125     head->next = curr;
1126     return head;
1127 }
1128
1129
1130 PERL_STATIC_INLINE item*
1131 find(item* head, UV key)
1132 {
1133     item* iterator = head;
1134     while (iterator){
1135         if (iterator->key == key){
1136             return iterator;
1137         }
1138         iterator = iterator->next;
1139     }
1140
1141     return NULL;
1142 }
1143
1144 PERL_STATIC_INLINE item*
1145 uniquePush(item* head,UV key)
1146 {
1147     item* iterator = head;
1148
1149     while (iterator){
1150         if (iterator->key == key) {
1151             return head;
1152         }
1153         iterator = iterator->next;
1154     }
1155
1156     return push(key,head);
1157 }
1158
1159 PERL_STATIC_INLINE void
1160 dict_free(item* head)
1161 {
1162     item* iterator = head;
1163
1164     while (iterator) {
1165         item* temp = iterator;
1166         iterator = iterator->next;
1167         Safefree(temp);
1168     }
1169
1170     head = NULL;
1171 }
1172
1173 /* End of Dictionary Stuff */
1174
1175 /* All calculations/work are done here */
1176 STATIC int
1177 S_edit_distance(const UV* src,
1178                 const UV* tgt,
1179                 const STRLEN x,             /* length of src[] */
1180                 const STRLEN y,             /* length of tgt[] */
1181                 const SSize_t maxDistance
1182 )
1183 {
1184     item *head = NULL;
1185     UV swapCount,swapScore,targetCharCount,i,j;
1186     UV *scores;
1187     UV score_ceil = x + y;
1188
1189     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1190
1191     /* intialize matrix start values */
1192     Newx(scores, ( (x + 2) * (y + 2)), UV);
1193     scores[0] = score_ceil;
1194     scores[1 * (y + 2) + 0] = score_ceil;
1195     scores[0 * (y + 2) + 1] = score_ceil;
1196     scores[1 * (y + 2) + 1] = 0;
1197     head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1198
1199     /* work loops    */
1200     /* i = src index */
1201     /* j = tgt index */
1202     for (i=1;i<=x;i++) {
1203         if (i < x)
1204             head = uniquePush(head,src[i]);
1205         scores[(i+1) * (y + 2) + 1] = i;
1206         scores[(i+1) * (y + 2) + 0] = score_ceil;
1207         swapCount = 0;
1208
1209         for (j=1;j<=y;j++) {
1210             if (i == 1) {
1211                 if(j < y)
1212                 head = uniquePush(head,tgt[j]);
1213                 scores[1 * (y + 2) + (j + 1)] = j;
1214                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1215             }
1216
1217             targetCharCount = find(head,tgt[j-1])->value;
1218             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1219
1220             if (src[i-1] != tgt[j-1]){
1221                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1222             }
1223             else {
1224                 swapCount = j;
1225                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1226             }
1227         }
1228
1229         find(head,src[i-1])->value = i;
1230     }
1231
1232     {
1233         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1234         dict_free(head);
1235         Safefree(scores);
1236         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1237     }
1238 }
1239
1240 /* END of edit_distance() stuff
1241  * ========================================================= */
1242
1243 /* is c a control character for which we have a mnemonic? */
1244 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1245
1246 STATIC const char *
1247 S_cntrl_to_mnemonic(const U8 c)
1248 {
1249     /* Returns the mnemonic string that represents character 'c', if one
1250      * exists; NULL otherwise.  The only ones that exist for the purposes of
1251      * this routine are a few control characters */
1252
1253     switch (c) {
1254         case '\a':       return "\\a";
1255         case '\b':       return "\\b";
1256         case ESC_NATIVE: return "\\e";
1257         case '\f':       return "\\f";
1258         case '\n':       return "\\n";
1259         case '\r':       return "\\r";
1260         case '\t':       return "\\t";
1261     }
1262
1263     return NULL;
1264 }
1265
1266 /* Mark that we cannot extend a found fixed substring at this point.
1267    Update the longest found anchored substring or the longest found
1268    floating substrings if needed. */
1269
1270 STATIC void
1271 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1272                     SSize_t *minlenp, int is_inf)
1273 {
1274     const STRLEN l = CHR_SVLEN(data->last_found);
1275     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1276     const STRLEN old_l = CHR_SVLEN(longest_sv);
1277     GET_RE_DEBUG_FLAGS_DECL;
1278
1279     PERL_ARGS_ASSERT_SCAN_COMMIT;
1280
1281     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1282         const U8 i = data->cur_is_floating;
1283         SvSetMagicSV(longest_sv, data->last_found);
1284         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1285
1286         if (!i) /* fixed */
1287             data->substrs[0].max_offset = data->substrs[0].min_offset;
1288         else { /* float */
1289             data->substrs[1].max_offset = (l
1290                           ? data->last_start_max
1291                           : (data->pos_delta > SSize_t_MAX - data->pos_min
1292                                          ? SSize_t_MAX
1293                                          : data->pos_min + data->pos_delta));
1294             if (is_inf
1295                  || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1296                 data->substrs[1].max_offset = SSize_t_MAX;
1297         }
1298
1299         if (data->flags & SF_BEFORE_EOL)
1300             data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1301         else
1302             data->substrs[i].flags &= ~SF_BEFORE_EOL;
1303         data->substrs[i].minlenp = minlenp;
1304         data->substrs[i].lookbehind = 0;
1305     }
1306
1307     SvCUR_set(data->last_found, 0);
1308     {
1309         SV * const sv = data->last_found;
1310         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1311             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1312             if (mg)
1313                 mg->mg_len = 0;
1314         }
1315     }
1316     data->last_end = -1;
1317     data->flags &= ~SF_BEFORE_EOL;
1318     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1319 }
1320
1321 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1322  * list that describes which code points it matches */
1323
1324 STATIC void
1325 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1326 {
1327     /* Set the SSC 'ssc' to match an empty string or any code point */
1328
1329     PERL_ARGS_ASSERT_SSC_ANYTHING;
1330
1331     assert(is_ANYOF_SYNTHETIC(ssc));
1332
1333     /* mortalize so won't leak */
1334     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1335     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1336 }
1337
1338 STATIC int
1339 S_ssc_is_anything(const regnode_ssc *ssc)
1340 {
1341     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1342      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1343      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1344      * in any way, so there's no point in using it */
1345
1346     UV start, end;
1347     bool ret;
1348
1349     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1350
1351     assert(is_ANYOF_SYNTHETIC(ssc));
1352
1353     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1354         return FALSE;
1355     }
1356
1357     /* See if the list consists solely of the range 0 - Infinity */
1358     invlist_iterinit(ssc->invlist);
1359     ret = invlist_iternext(ssc->invlist, &start, &end)
1360           && start == 0
1361           && end == UV_MAX;
1362
1363     invlist_iterfinish(ssc->invlist);
1364
1365     if (ret) {
1366         return TRUE;
1367     }
1368
1369     /* If e.g., both \w and \W are set, matches everything */
1370     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1371         int i;
1372         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1373             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1374                 return TRUE;
1375             }
1376         }
1377     }
1378
1379     return FALSE;
1380 }
1381
1382 STATIC void
1383 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1384 {
1385     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1386      * string, any code point, or any posix class under locale */
1387
1388     PERL_ARGS_ASSERT_SSC_INIT;
1389
1390     Zero(ssc, 1, regnode_ssc);
1391     set_ANYOF_SYNTHETIC(ssc);
1392     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1393     ssc_anything(ssc);
1394
1395     /* If any portion of the regex is to operate under locale rules that aren't
1396      * fully known at compile time, initialization includes it.  The reason
1397      * this isn't done for all regexes is that the optimizer was written under
1398      * the assumption that locale was all-or-nothing.  Given the complexity and
1399      * lack of documentation in the optimizer, and that there are inadequate
1400      * test cases for locale, many parts of it may not work properly, it is
1401      * safest to avoid locale unless necessary. */
1402     if (RExC_contains_locale) {
1403         ANYOF_POSIXL_SETALL(ssc);
1404     }
1405     else {
1406         ANYOF_POSIXL_ZERO(ssc);
1407     }
1408 }
1409
1410 STATIC int
1411 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1412                         const regnode_ssc *ssc)
1413 {
1414     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1415      * to the list of code points matched, and locale posix classes; hence does
1416      * not check its flags) */
1417
1418     UV start, end;
1419     bool ret;
1420
1421     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1422
1423     assert(is_ANYOF_SYNTHETIC(ssc));
1424
1425     invlist_iterinit(ssc->invlist);
1426     ret = invlist_iternext(ssc->invlist, &start, &end)
1427           && start == 0
1428           && end == UV_MAX;
1429
1430     invlist_iterfinish(ssc->invlist);
1431
1432     if (! ret) {
1433         return FALSE;
1434     }
1435
1436     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1437         return FALSE;
1438     }
1439
1440     return TRUE;
1441 }
1442
1443 STATIC SV*
1444 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1445                                const regnode_charclass* const node)
1446 {
1447     /* Returns a mortal inversion list defining which code points are matched
1448      * by 'node', which is of type ANYOF.  Handles complementing the result if
1449      * appropriate.  If some code points aren't knowable at this time, the
1450      * returned list must, and will, contain every code point that is a
1451      * possibility. */
1452
1453     SV* invlist = NULL;
1454     SV* only_utf8_locale_invlist = NULL;
1455     unsigned int i;
1456     const U32 n = ARG(node);
1457     bool new_node_has_latin1 = FALSE;
1458
1459     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1460
1461     /* Look at the data structure created by S_set_ANYOF_arg() */
1462     if (n != ANYOF_ONLY_HAS_BITMAP) {
1463         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1464         AV * const av = MUTABLE_AV(SvRV(rv));
1465         SV **const ary = AvARRAY(av);
1466         assert(RExC_rxi->data->what[n] == 's');
1467
1468         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1469             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1470         }
1471         else if (ary[0] && ary[0] != &PL_sv_undef) {
1472
1473             /* Here, no compile-time swash, and there are things that won't be
1474              * known until runtime -- we have to assume it could be anything */
1475             invlist = sv_2mortal(_new_invlist(1));
1476             return _add_range_to_invlist(invlist, 0, UV_MAX);
1477         }
1478         else if (ary[3] && ary[3] != &PL_sv_undef) {
1479
1480             /* Here no compile-time swash, and no run-time only data.  Use the
1481              * node's inversion list */
1482             invlist = sv_2mortal(invlist_clone(ary[3]));
1483         }
1484
1485         /* Get the code points valid only under UTF-8 locales */
1486         if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1487             && ary[2] && ary[2] != &PL_sv_undef)
1488         {
1489             only_utf8_locale_invlist = ary[2];
1490         }
1491     }
1492
1493     if (! invlist) {
1494         invlist = sv_2mortal(_new_invlist(0));
1495     }
1496
1497     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1498      * code points, and an inversion list for the others, but if there are code
1499      * points that should match only conditionally on the target string being
1500      * UTF-8, those are placed in the inversion list, and not the bitmap.
1501      * Since there are circumstances under which they could match, they are
1502      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1503      * to exclude them here, so that when we invert below, the end result
1504      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1505      * have to do this here before we add the unconditionally matched code
1506      * points */
1507     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1508         _invlist_intersection_complement_2nd(invlist,
1509                                              PL_UpperLatin1,
1510                                              &invlist);
1511     }
1512
1513     /* Add in the points from the bit map */
1514     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1515         if (ANYOF_BITMAP_TEST(node, i)) {
1516             unsigned int start = i++;
1517
1518             for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1519                 /* empty */
1520             }
1521             invlist = _add_range_to_invlist(invlist, start, i-1);
1522             new_node_has_latin1 = TRUE;
1523         }
1524     }
1525
1526     /* If this can match all upper Latin1 code points, have to add them
1527      * as well.  But don't add them if inverting, as when that gets done below,
1528      * it would exclude all these characters, including the ones it shouldn't
1529      * that were added just above */
1530     if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
1531         && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1532     {
1533         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1534     }
1535
1536     /* Similarly for these */
1537     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1538         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1539     }
1540
1541     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1542         _invlist_invert(invlist);
1543     }
1544     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1545
1546         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1547          * locale.  We can skip this if there are no 0-255 at all. */
1548         _invlist_union(invlist, PL_Latin1, &invlist);
1549     }
1550
1551     /* Similarly add the UTF-8 locale possible matches.  These have to be
1552      * deferred until after the non-UTF-8 locale ones are taken care of just
1553      * above, or it leads to wrong results under ANYOF_INVERT */
1554     if (only_utf8_locale_invlist) {
1555         _invlist_union_maybe_complement_2nd(invlist,
1556                                             only_utf8_locale_invlist,
1557                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1558                                             &invlist);
1559     }
1560
1561     return invlist;
1562 }
1563
1564 /* These two functions currently do the exact same thing */
1565 #define ssc_init_zero           ssc_init
1566
1567 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1568 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1569
1570 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1571  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1572  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1573
1574 STATIC void
1575 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1576                 const regnode_charclass *and_with)
1577 {
1578     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1579      * another SSC or a regular ANYOF class.  Can create false positives. */
1580
1581     SV* anded_cp_list;
1582     U8  anded_flags;
1583
1584     PERL_ARGS_ASSERT_SSC_AND;
1585
1586     assert(is_ANYOF_SYNTHETIC(ssc));
1587
1588     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1589      * the code point inversion list and just the relevant flags */
1590     if (is_ANYOF_SYNTHETIC(and_with)) {
1591         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1592         anded_flags = ANYOF_FLAGS(and_with);
1593
1594         /* XXX This is a kludge around what appears to be deficiencies in the
1595          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1596          * there are paths through the optimizer where it doesn't get weeded
1597          * out when it should.  And if we don't make some extra provision for
1598          * it like the code just below, it doesn't get added when it should.
1599          * This solution is to add it only when AND'ing, which is here, and
1600          * only when what is being AND'ed is the pristine, original node
1601          * matching anything.  Thus it is like adding it to ssc_anything() but
1602          * only when the result is to be AND'ed.  Probably the same solution
1603          * could be adopted for the same problem we have with /l matching,
1604          * which is solved differently in S_ssc_init(), and that would lead to
1605          * fewer false positives than that solution has.  But if this solution
1606          * creates bugs, the consequences are only that a warning isn't raised
1607          * that should be; while the consequences for having /l bugs is
1608          * incorrect matches */
1609         if (ssc_is_anything((regnode_ssc *)and_with)) {
1610             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1611         }
1612     }
1613     else {
1614         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1615         if (OP(and_with) == ANYOFD) {
1616             anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1617         }
1618         else {
1619             anded_flags = ANYOF_FLAGS(and_with)
1620             &( ANYOF_COMMON_FLAGS
1621               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1622               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1623             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1624                 anded_flags &=
1625                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1626             }
1627         }
1628     }
1629
1630     ANYOF_FLAGS(ssc) &= anded_flags;
1631
1632     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1633      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1634      * 'and_with' may be inverted.  When not inverted, we have the situation of
1635      * computing:
1636      *  (C1 | P1) & (C2 | P2)
1637      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1638      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1639      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1640      *                    <=  ((C1 & C2) | P1 | P2)
1641      * Alternatively, the last few steps could be:
1642      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1643      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1644      *                    <=  (C1 | C2 | (P1 & P2))
1645      * We favor the second approach if either P1 or P2 is non-empty.  This is
1646      * because these components are a barrier to doing optimizations, as what
1647      * they match cannot be known until the moment of matching as they are
1648      * dependent on the current locale, 'AND"ing them likely will reduce or
1649      * eliminate them.
1650      * But we can do better if we know that C1,P1 are in their initial state (a
1651      * frequent occurrence), each matching everything:
1652      *  (<everything>) & (C2 | P2) =  C2 | P2
1653      * Similarly, if C2,P2 are in their initial state (again a frequent
1654      * occurrence), the result is a no-op
1655      *  (C1 | P1) & (<everything>) =  C1 | P1
1656      *
1657      * Inverted, we have
1658      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1659      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1660      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1661      * */
1662
1663     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1664         && ! is_ANYOF_SYNTHETIC(and_with))
1665     {
1666         unsigned int i;
1667
1668         ssc_intersection(ssc,
1669                          anded_cp_list,
1670                          FALSE /* Has already been inverted */
1671                          );
1672
1673         /* If either P1 or P2 is empty, the intersection will be also; can skip
1674          * the loop */
1675         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1676             ANYOF_POSIXL_ZERO(ssc);
1677         }
1678         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1679
1680             /* Note that the Posix class component P from 'and_with' actually
1681              * looks like:
1682              *      P = Pa | Pb | ... | Pn
1683              * where each component is one posix class, such as in [\w\s].
1684              * Thus
1685              *      ~P = ~(Pa | Pb | ... | Pn)
1686              *         = ~Pa & ~Pb & ... & ~Pn
1687              *        <= ~Pa | ~Pb | ... | ~Pn
1688              * The last is something we can easily calculate, but unfortunately
1689              * is likely to have many false positives.  We could do better
1690              * in some (but certainly not all) instances if two classes in
1691              * P have known relationships.  For example
1692              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1693              * So
1694              *      :lower: & :print: = :lower:
1695              * And similarly for classes that must be disjoint.  For example,
1696              * since \s and \w can have no elements in common based on rules in
1697              * the POSIX standard,
1698              *      \w & ^\S = nothing
1699              * Unfortunately, some vendor locales do not meet the Posix
1700              * standard, in particular almost everything by Microsoft.
1701              * The loop below just changes e.g., \w into \W and vice versa */
1702
1703             regnode_charclass_posixl temp;
1704             int add = 1;    /* To calculate the index of the complement */
1705
1706             Zero(&temp, 1, regnode_charclass_posixl);
1707             ANYOF_POSIXL_ZERO(&temp);
1708             for (i = 0; i < ANYOF_MAX; i++) {
1709                 assert(i % 2 != 0
1710                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1711                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1712
1713                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1714                     ANYOF_POSIXL_SET(&temp, i + add);
1715                 }
1716                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1717             }
1718             ANYOF_POSIXL_AND(&temp, ssc);
1719
1720         } /* else ssc already has no posixes */
1721     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1722          in its initial state */
1723     else if (! is_ANYOF_SYNTHETIC(and_with)
1724              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1725     {
1726         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1727          * copy it over 'ssc' */
1728         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1729             if (is_ANYOF_SYNTHETIC(and_with)) {
1730                 StructCopy(and_with, ssc, regnode_ssc);
1731             }
1732             else {
1733                 ssc->invlist = anded_cp_list;
1734                 ANYOF_POSIXL_ZERO(ssc);
1735                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1736                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1737                 }
1738             }
1739         }
1740         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1741                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1742         {
1743             /* One or the other of P1, P2 is non-empty. */
1744             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1745                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1746             }
1747             ssc_union(ssc, anded_cp_list, FALSE);
1748         }
1749         else { /* P1 = P2 = empty */
1750             ssc_intersection(ssc, anded_cp_list, FALSE);
1751         }
1752     }
1753 }
1754
1755 STATIC void
1756 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1757                const regnode_charclass *or_with)
1758 {
1759     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1760      * another SSC or a regular ANYOF class.  Can create false positives if
1761      * 'or_with' is to be inverted. */
1762
1763     SV* ored_cp_list;
1764     U8 ored_flags;
1765
1766     PERL_ARGS_ASSERT_SSC_OR;
1767
1768     assert(is_ANYOF_SYNTHETIC(ssc));
1769
1770     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1771      * the code point inversion list and just the relevant flags */
1772     if (is_ANYOF_SYNTHETIC(or_with)) {
1773         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1774         ored_flags = ANYOF_FLAGS(or_with);
1775     }
1776     else {
1777         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1778         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1779         if (OP(or_with) != ANYOFD) {
1780             ored_flags
1781             |= ANYOF_FLAGS(or_with)
1782              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1783                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1784             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1785                 ored_flags |=
1786                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1787             }
1788         }
1789     }
1790
1791     ANYOF_FLAGS(ssc) |= ored_flags;
1792
1793     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1794      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1795      * 'or_with' may be inverted.  When not inverted, we have the simple
1796      * situation of computing:
1797      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1798      * If P1|P2 yields a situation with both a class and its complement are
1799      * set, like having both \w and \W, this matches all code points, and we
1800      * can delete these from the P component of the ssc going forward.  XXX We
1801      * might be able to delete all the P components, but I (khw) am not certain
1802      * about this, and it is better to be safe.
1803      *
1804      * Inverted, we have
1805      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1806      *                         <=  (C1 | P1) | ~C2
1807      *                         <=  (C1 | ~C2) | P1
1808      * (which results in actually simpler code than the non-inverted case)
1809      * */
1810
1811     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1812         && ! is_ANYOF_SYNTHETIC(or_with))
1813     {
1814         /* We ignore P2, leaving P1 going forward */
1815     }   /* else  Not inverted */
1816     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1817         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1818         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1819             unsigned int i;
1820             for (i = 0; i < ANYOF_MAX; i += 2) {
1821                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1822                 {
1823                     ssc_match_all_cp(ssc);
1824                     ANYOF_POSIXL_CLEAR(ssc, i);
1825                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1826                 }
1827             }
1828         }
1829     }
1830
1831     ssc_union(ssc,
1832               ored_cp_list,
1833               FALSE /* Already has been inverted */
1834               );
1835 }
1836
1837 PERL_STATIC_INLINE void
1838 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1839 {
1840     PERL_ARGS_ASSERT_SSC_UNION;
1841
1842     assert(is_ANYOF_SYNTHETIC(ssc));
1843
1844     _invlist_union_maybe_complement_2nd(ssc->invlist,
1845                                         invlist,
1846                                         invert2nd,
1847                                         &ssc->invlist);
1848 }
1849
1850 PERL_STATIC_INLINE void
1851 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1852                          SV* const invlist,
1853                          const bool invert2nd)
1854 {
1855     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1856
1857     assert(is_ANYOF_SYNTHETIC(ssc));
1858
1859     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1860                                                invlist,
1861                                                invert2nd,
1862                                                &ssc->invlist);
1863 }
1864
1865 PERL_STATIC_INLINE void
1866 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1867 {
1868     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1869
1870     assert(is_ANYOF_SYNTHETIC(ssc));
1871
1872     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1873 }
1874
1875 PERL_STATIC_INLINE void
1876 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1877 {
1878     /* AND just the single code point 'cp' into the SSC 'ssc' */
1879
1880     SV* cp_list = _new_invlist(2);
1881
1882     PERL_ARGS_ASSERT_SSC_CP_AND;
1883
1884     assert(is_ANYOF_SYNTHETIC(ssc));
1885
1886     cp_list = add_cp_to_invlist(cp_list, cp);
1887     ssc_intersection(ssc, cp_list,
1888                      FALSE /* Not inverted */
1889                      );
1890     SvREFCNT_dec_NN(cp_list);
1891 }
1892
1893 PERL_STATIC_INLINE void
1894 S_ssc_clear_locale(regnode_ssc *ssc)
1895 {
1896     /* Set the SSC 'ssc' to not match any locale things */
1897     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1898
1899     assert(is_ANYOF_SYNTHETIC(ssc));
1900
1901     ANYOF_POSIXL_ZERO(ssc);
1902     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1903 }
1904
1905 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1906
1907 STATIC bool
1908 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1909 {
1910     /* The synthetic start class is used to hopefully quickly winnow down
1911      * places where a pattern could start a match in the target string.  If it
1912      * doesn't really narrow things down that much, there isn't much point to
1913      * having the overhead of using it.  This function uses some very crude
1914      * heuristics to decide if to use the ssc or not.
1915      *
1916      * It returns TRUE if 'ssc' rules out more than half what it considers to
1917      * be the "likely" possible matches, but of course it doesn't know what the
1918      * actual things being matched are going to be; these are only guesses
1919      *
1920      * For /l matches, it assumes that the only likely matches are going to be
1921      *      in the 0-255 range, uniformly distributed, so half of that is 127
1922      * For /a and /d matches, it assumes that the likely matches will be just
1923      *      the ASCII range, so half of that is 63
1924      * For /u and there isn't anything matching above the Latin1 range, it
1925      *      assumes that that is the only range likely to be matched, and uses
1926      *      half that as the cut-off: 127.  If anything matches above Latin1,
1927      *      it assumes that all of Unicode could match (uniformly), except for
1928      *      non-Unicode code points and things in the General Category "Other"
1929      *      (unassigned, private use, surrogates, controls and formats).  This
1930      *      is a much large number. */
1931
1932     U32 count = 0;      /* Running total of number of code points matched by
1933                            'ssc' */
1934     UV start, end;      /* Start and end points of current range in inversion
1935                            list */
1936     const U32 max_code_points = (LOC)
1937                                 ?  256
1938                                 : ((   ! UNI_SEMANTICS
1939                                      || invlist_highest(ssc->invlist) < 256)
1940                                   ? 128
1941                                   : NON_OTHER_COUNT);
1942     const U32 max_match = max_code_points / 2;
1943
1944     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1945
1946     invlist_iterinit(ssc->invlist);
1947     while (invlist_iternext(ssc->invlist, &start, &end)) {
1948         if (start >= max_code_points) {
1949             break;
1950         }
1951         end = MIN(end, max_code_points - 1);
1952         count += end - start + 1;
1953         if (count >= max_match) {
1954             invlist_iterfinish(ssc->invlist);
1955             return FALSE;
1956         }
1957     }
1958
1959     return TRUE;
1960 }
1961
1962
1963 STATIC void
1964 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1965 {
1966     /* The inversion list in the SSC is marked mortal; now we need a more
1967      * permanent copy, which is stored the same way that is done in a regular
1968      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1969      * map */
1970
1971     SV* invlist = invlist_clone(ssc->invlist);
1972
1973     PERL_ARGS_ASSERT_SSC_FINALIZE;
1974
1975     assert(is_ANYOF_SYNTHETIC(ssc));
1976
1977     /* The code in this file assumes that all but these flags aren't relevant
1978      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1979      * by the time we reach here */
1980     assert(! (ANYOF_FLAGS(ssc)
1981         & ~( ANYOF_COMMON_FLAGS
1982             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1983             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
1984
1985     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1986
1987     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1988                                 NULL, NULL, NULL, FALSE);
1989
1990     /* Make sure is clone-safe */
1991     ssc->invlist = NULL;
1992
1993     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1994         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1995     }
1996
1997     if (RExC_contains_locale) {
1998         OP(ssc) = ANYOFL;
1999     }
2000
2001     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2002 }
2003
2004 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2005 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2006 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2007 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
2008                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2009                                : 0 )
2010
2011
2012 #ifdef DEBUGGING
2013 /*
2014    dump_trie(trie,widecharmap,revcharmap)
2015    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2016    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2017
2018    These routines dump out a trie in a somewhat readable format.
2019    The _interim_ variants are used for debugging the interim
2020    tables that are used to generate the final compressed
2021    representation which is what dump_trie expects.
2022
2023    Part of the reason for their existence is to provide a form
2024    of documentation as to how the different representations function.
2025
2026 */
2027
2028 /*
2029   Dumps the final compressed table form of the trie to Perl_debug_log.
2030   Used for debugging make_trie().
2031 */
2032
2033 STATIC void
2034 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2035             AV *revcharmap, U32 depth)
2036 {
2037     U32 state;
2038     SV *sv=sv_newmortal();
2039     int colwidth= widecharmap ? 6 : 4;
2040     U16 word;
2041     GET_RE_DEBUG_FLAGS_DECL;
2042
2043     PERL_ARGS_ASSERT_DUMP_TRIE;
2044
2045     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2046         depth+1, "Match","Base","Ofs" );
2047
2048     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2049         SV ** const tmp = av_fetch( revcharmap, state, 0);
2050         if ( tmp ) {
2051             Perl_re_printf( aTHX_  "%*s",
2052                 colwidth,
2053                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2054                             PL_colors[0], PL_colors[1],
2055                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2056                             PERL_PV_ESCAPE_FIRSTCHAR
2057                 )
2058             );
2059         }
2060     }
2061     Perl_re_printf( aTHX_  "\n");
2062     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2063
2064     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2065         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2066     Perl_re_printf( aTHX_  "\n");
2067
2068     for( state = 1 ; state < trie->statecount ; state++ ) {
2069         const U32 base = trie->states[ state ].trans.base;
2070
2071         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2072
2073         if ( trie->states[ state ].wordnum ) {
2074             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2075         } else {
2076             Perl_re_printf( aTHX_  "%6s", "" );
2077         }
2078
2079         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2080
2081         if ( base ) {
2082             U32 ofs = 0;
2083
2084             while( ( base + ofs  < trie->uniquecharcount ) ||
2085                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2086                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2087                                                                     != state))
2088                     ofs++;
2089
2090             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2091
2092             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2093                 if ( ( base + ofs >= trie->uniquecharcount )
2094                         && ( base + ofs - trie->uniquecharcount
2095                                                         < trie->lasttrans )
2096                         && trie->trans[ base + ofs
2097                                     - trie->uniquecharcount ].check == state )
2098                 {
2099                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2100                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2101                    );
2102                 } else {
2103                     Perl_re_printf( aTHX_  "%*s",colwidth,"   ." );
2104                 }
2105             }
2106
2107             Perl_re_printf( aTHX_  "]");
2108
2109         }
2110         Perl_re_printf( aTHX_  "\n" );
2111     }
2112     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2113                                 depth);
2114     for (word=1; word <= trie->wordcount; word++) {
2115         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2116             (int)word, (int)(trie->wordinfo[word].prev),
2117             (int)(trie->wordinfo[word].len));
2118     }
2119     Perl_re_printf( aTHX_  "\n" );
2120 }
2121 /*
2122   Dumps a fully constructed but uncompressed trie in list form.
2123   List tries normally only are used for construction when the number of
2124   possible chars (trie->uniquecharcount) is very high.
2125   Used for debugging make_trie().
2126 */
2127 STATIC void
2128 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2129                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
2130                          U32 depth)
2131 {
2132     U32 state;
2133     SV *sv=sv_newmortal();
2134     int colwidth= widecharmap ? 6 : 4;
2135     GET_RE_DEBUG_FLAGS_DECL;
2136
2137     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2138
2139     /* print out the table precompression.  */
2140     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2141             depth+1 );
2142     Perl_re_indentf( aTHX_  "%s",
2143             depth+1, "------:-----+-----------------\n" );
2144
2145     for( state=1 ; state < next_alloc ; state ++ ) {
2146         U16 charid;
2147
2148         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2149             depth+1, (UV)state  );
2150         if ( ! trie->states[ state ].wordnum ) {
2151             Perl_re_printf( aTHX_  "%5s| ","");
2152         } else {
2153             Perl_re_printf( aTHX_  "W%4x| ",
2154                 trie->states[ state ].wordnum
2155             );
2156         }
2157         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2158             SV ** const tmp = av_fetch( revcharmap,
2159                                         TRIE_LIST_ITEM(state,charid).forid, 0);
2160             if ( tmp ) {
2161                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2162                     colwidth,
2163                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2164                               colwidth,
2165                               PL_colors[0], PL_colors[1],
2166                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2167                               | PERL_PV_ESCAPE_FIRSTCHAR
2168                     ) ,
2169                     TRIE_LIST_ITEM(state,charid).forid,
2170                     (UV)TRIE_LIST_ITEM(state,charid).newstate
2171                 );
2172                 if (!(charid % 10))
2173                     Perl_re_printf( aTHX_  "\n%*s| ",
2174                         (int)((depth * 2) + 14), "");
2175             }
2176         }
2177         Perl_re_printf( aTHX_  "\n");
2178     }
2179 }
2180
2181 /*
2182   Dumps a fully constructed but uncompressed trie in table form.
2183   This is the normal DFA style state transition table, with a few
2184   twists to facilitate compression later.
2185   Used for debugging make_trie().
2186 */
2187 STATIC void
2188 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2189                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
2190                           U32 depth)
2191 {
2192     U32 state;
2193     U16 charid;
2194     SV *sv=sv_newmortal();
2195     int colwidth= widecharmap ? 6 : 4;
2196     GET_RE_DEBUG_FLAGS_DECL;
2197
2198     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2199
2200     /*
2201        print out the table precompression so that we can do a visual check
2202        that they are identical.
2203      */
2204
2205     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2206
2207     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2208         SV ** const tmp = av_fetch( revcharmap, charid, 0);
2209         if ( tmp ) {
2210             Perl_re_printf( aTHX_  "%*s",
2211                 colwidth,
2212                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2213                             PL_colors[0], PL_colors[1],
2214                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2215                             PERL_PV_ESCAPE_FIRSTCHAR
2216                 )
2217             );
2218         }
2219     }
2220
2221     Perl_re_printf( aTHX_ "\n");
2222     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2223
2224     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2225         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2226     }
2227
2228     Perl_re_printf( aTHX_  "\n" );
2229
2230     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2231
2232         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2233             depth+1,
2234             (UV)TRIE_NODENUM( state ) );
2235
2236         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2237             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2238             if (v)
2239                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2240             else
2241                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2242         }
2243         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2244             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2245                                             (UV)trie->trans[ state ].check );
2246         } else {
2247             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2248                                             (UV)trie->trans[ state ].check,
2249             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2250         }
2251     }
2252 }
2253
2254 #endif
2255
2256
2257 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2258   startbranch: the first branch in the whole branch sequence
2259   first      : start branch of sequence of branch-exact nodes.
2260                May be the same as startbranch
2261   last       : Thing following the last branch.
2262                May be the same as tail.
2263   tail       : item following the branch sequence
2264   count      : words in the sequence
2265   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2266   depth      : indent depth
2267
2268 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2269
2270 A trie is an N'ary tree where the branches are determined by digital
2271 decomposition of the key. IE, at the root node you look up the 1st character and
2272 follow that branch repeat until you find the end of the branches. Nodes can be
2273 marked as "accepting" meaning they represent a complete word. Eg:
2274
2275   /he|she|his|hers/
2276
2277 would convert into the following structure. Numbers represent states, letters
2278 following numbers represent valid transitions on the letter from that state, if
2279 the number is in square brackets it represents an accepting state, otherwise it
2280 will be in parenthesis.
2281
2282       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2283       |    |
2284       |   (2)
2285       |    |
2286      (1)   +-i->(6)-+-s->[7]
2287       |
2288       +-s->(3)-+-h->(4)-+-e->[5]
2289
2290       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2291
2292 This shows that when matching against the string 'hers' we will begin at state 1
2293 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2294 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2295 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2296 single traverse. We store a mapping from accepting to state to which word was
2297 matched, and then when we have multiple possibilities we try to complete the
2298 rest of the regex in the order in which they occurred in the alternation.
2299
2300 The only prior NFA like behaviour that would be changed by the TRIE support is
2301 the silent ignoring of duplicate alternations which are of the form:
2302
2303  / (DUPE|DUPE) X? (?{ ... }) Y /x
2304
2305 Thus EVAL blocks following a trie may be called a different number of times with
2306 and without the optimisation. With the optimisations dupes will be silently
2307 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2308 the following demonstrates:
2309
2310  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2311
2312 which prints out 'word' three times, but
2313
2314  'words'=~/(word|word|word)(?{ print $1 })S/
2315
2316 which doesnt print it out at all. This is due to other optimisations kicking in.
2317
2318 Example of what happens on a structural level:
2319
2320 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2321
2322    1: CURLYM[1] {1,32767}(18)
2323    5:   BRANCH(8)
2324    6:     EXACT <ac>(16)
2325    8:   BRANCH(11)
2326    9:     EXACT <ad>(16)
2327   11:   BRANCH(14)
2328   12:     EXACT <ab>(16)
2329   16:   SUCCEED(0)
2330   17:   NOTHING(18)
2331   18: END(0)
2332
2333 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2334 and should turn into:
2335
2336    1: CURLYM[1] {1,32767}(18)
2337    5:   TRIE(16)
2338         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2339           <ac>
2340           <ad>
2341           <ab>
2342   16:   SUCCEED(0)
2343   17:   NOTHING(18)
2344   18: END(0)
2345
2346 Cases where tail != last would be like /(?foo|bar)baz/:
2347
2348    1: BRANCH(4)
2349    2:   EXACT <foo>(8)
2350    4: BRANCH(7)
2351    5:   EXACT <bar>(8)
2352    7: TAIL(8)
2353    8: EXACT <baz>(10)
2354   10: END(0)
2355
2356 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2357 and would end up looking like:
2358
2359     1: TRIE(8)
2360       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2361         <foo>
2362         <bar>
2363    7: TAIL(8)
2364    8: EXACT <baz>(10)
2365   10: END(0)
2366
2367     d = uvchr_to_utf8_flags(d, uv, 0);
2368
2369 is the recommended Unicode-aware way of saying
2370
2371     *(d++) = uv;
2372 */
2373
2374 #define TRIE_STORE_REVCHAR(val)                                            \
2375     STMT_START {                                                           \
2376         if (UTF) {                                                         \
2377             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2378             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2379             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2380             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2381             SvPOK_on(zlopp);                                               \
2382             SvUTF8_on(zlopp);                                              \
2383             av_push(revcharmap, zlopp);                                    \
2384         } else {                                                           \
2385             char ooooff = (char)val;                                           \
2386             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2387         }                                                                  \
2388         } STMT_END
2389
2390 /* This gets the next character from the input, folding it if not already
2391  * folded. */
2392 #define TRIE_READ_CHAR STMT_START {                                           \
2393     wordlen++;                                                                \
2394     if ( UTF ) {                                                              \
2395         /* if it is UTF then it is either already folded, or does not need    \
2396          * folding */                                                         \
2397         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2398     }                                                                         \
2399     else if (folder == PL_fold_latin1) {                                      \
2400         /* This folder implies Unicode rules, which in the range expressible  \
2401          *  by not UTF is the lower case, with the two exceptions, one of     \
2402          *  which should have been taken care of before calling this */       \
2403         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2404         uvc = toLOWER_L1(*uc);                                                \
2405         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2406         len = 1;                                                              \
2407     } else {                                                                  \
2408         /* raw data, will be folded later if needed */                        \
2409         uvc = (U32)*uc;                                                       \
2410         len = 1;                                                              \
2411     }                                                                         \
2412 } STMT_END
2413
2414
2415
2416 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2417     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2418         U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2419         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2420         TRIE_LIST_LEN( state ) = ging;                          \
2421     }                                                           \
2422     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2423     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2424     TRIE_LIST_CUR( state )++;                                   \
2425 } STMT_END
2426
2427 #define TRIE_LIST_NEW(state) STMT_START {                       \
2428     Newx( trie->states[ state ].trans.list,                     \
2429         4, reg_trie_trans_le );                                 \
2430      TRIE_LIST_CUR( state ) = 1;                                \
2431      TRIE_LIST_LEN( state ) = 4;                                \
2432 } STMT_END
2433
2434 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2435     U16 dupe= trie->states[ state ].wordnum;                    \
2436     regnode * const noper_next = regnext( noper );              \
2437                                                                 \
2438     DEBUG_r({                                                   \
2439         /* store the word for dumping */                        \
2440         SV* tmp;                                                \
2441         if (OP(noper) != NOTHING)                               \
2442             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2443         else                                                    \
2444             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2445         av_push( trie_words, tmp );                             \
2446     });                                                         \
2447                                                                 \
2448     curword++;                                                  \
2449     trie->wordinfo[curword].prev   = 0;                         \
2450     trie->wordinfo[curword].len    = wordlen;                   \
2451     trie->wordinfo[curword].accept = state;                     \
2452                                                                 \
2453     if ( noper_next < tail ) {                                  \
2454         if (!trie->jump)                                        \
2455             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2456                                                  sizeof(U16) ); \
2457         trie->jump[curword] = (U16)(noper_next - convert);      \
2458         if (!jumper)                                            \
2459             jumper = noper_next;                                \
2460         if (!nextbranch)                                        \
2461             nextbranch= regnext(cur);                           \
2462     }                                                           \
2463                                                                 \
2464     if ( dupe ) {                                               \
2465         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2466         /* chain, so that when the bits of chain are later    */\
2467         /* linked together, the dups appear in the chain      */\
2468         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2469         trie->wordinfo[dupe].prev = curword;                    \
2470     } else {                                                    \
2471         /* we haven't inserted this word yet.                */ \
2472         trie->states[ state ].wordnum = curword;                \
2473     }                                                           \
2474 } STMT_END
2475
2476
2477 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2478      ( ( base + charid >=  ucharcount                                   \
2479          && base + charid < ubound                                      \
2480          && state == trie->trans[ base - ucharcount + charid ].check    \
2481          && trie->trans[ base - ucharcount + charid ].next )            \
2482            ? trie->trans[ base - ucharcount + charid ].next             \
2483            : ( state==1 ? special : 0 )                                 \
2484       )
2485
2486 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2487 STMT_START {                                                \
2488     TRIE_BITMAP_SET(trie, uvc);                             \
2489     /* store the folded codepoint */                        \
2490     if ( folder )                                           \
2491         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2492                                                             \
2493     if ( !UTF ) {                                           \
2494         /* store first byte of utf8 representation of */    \
2495         /* variant codepoints */                            \
2496         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2497             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2498         }                                                   \
2499     }                                                       \
2500 } STMT_END
2501 #define MADE_TRIE       1
2502 #define MADE_JUMP_TRIE  2
2503 #define MADE_EXACT_TRIE 4
2504
2505 STATIC I32
2506 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2507                   regnode *first, regnode *last, regnode *tail,
2508                   U32 word_count, U32 flags, U32 depth)
2509 {
2510     /* first pass, loop through and scan words */
2511     reg_trie_data *trie;
2512     HV *widecharmap = NULL;
2513     AV *revcharmap = newAV();
2514     regnode *cur;
2515     STRLEN len = 0;
2516     UV uvc = 0;
2517     U16 curword = 0;
2518     U32 next_alloc = 0;
2519     regnode *jumper = NULL;
2520     regnode *nextbranch = NULL;
2521     regnode *convert = NULL;
2522     U32 *prev_states; /* temp array mapping each state to previous one */
2523     /* we just use folder as a flag in utf8 */
2524     const U8 * folder = NULL;
2525
2526     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2527      * which stands for one trie structure, one hash, optionally followed
2528      * by two arrays */
2529 #ifdef DEBUGGING
2530     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2531     AV *trie_words = NULL;
2532     /* along with revcharmap, this only used during construction but both are
2533      * useful during debugging so we store them in the struct when debugging.
2534      */
2535 #else
2536     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2537     STRLEN trie_charcount=0;
2538 #endif
2539     SV *re_trie_maxbuff;
2540     GET_RE_DEBUG_FLAGS_DECL;
2541
2542     PERL_ARGS_ASSERT_MAKE_TRIE;
2543 #ifndef DEBUGGING
2544     PERL_UNUSED_ARG(depth);
2545 #endif
2546
2547     switch (flags) {
2548         case EXACT: case EXACTL: break;
2549         case EXACTFA:
2550         case EXACTFU_SS:
2551         case EXACTFU:
2552         case EXACTFLU8: folder = PL_fold_latin1; break;
2553         case EXACTF:  folder = PL_fold; break;
2554         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2555     }
2556
2557     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2558     trie->refcount = 1;
2559     trie->startstate = 1;
2560     trie->wordcount = word_count;
2561     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2562     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2563     if (flags == EXACT || flags == EXACTL)
2564         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2565     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2566                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2567
2568     DEBUG_r({
2569         trie_words = newAV();
2570     });
2571
2572     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2573     assert(re_trie_maxbuff);
2574     if (!SvIOK(re_trie_maxbuff)) {
2575         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2576     }
2577     DEBUG_TRIE_COMPILE_r({
2578         Perl_re_indentf( aTHX_
2579           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2580           depth+1,
2581           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2582           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2583     });
2584
2585    /* Find the node we are going to overwrite */
2586     if ( first == startbranch && OP( last ) != BRANCH ) {
2587         /* whole branch chain */
2588         convert = first;
2589     } else {
2590         /* branch sub-chain */
2591         convert = NEXTOPER( first );
2592     }
2593
2594     /*  -- First loop and Setup --
2595
2596        We first traverse the branches and scan each word to determine if it
2597        contains widechars, and how many unique chars there are, this is
2598        important as we have to build a table with at least as many columns as we
2599        have unique chars.
2600
2601        We use an array of integers to represent the character codes 0..255
2602        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2603        the native representation of the character value as the key and IV's for
2604        the coded index.
2605
2606        *TODO* If we keep track of how many times each character is used we can
2607        remap the columns so that the table compression later on is more
2608        efficient in terms of memory by ensuring the most common value is in the
2609        middle and the least common are on the outside.  IMO this would be better
2610        than a most to least common mapping as theres a decent chance the most
2611        common letter will share a node with the least common, meaning the node
2612        will not be compressible. With a middle is most common approach the worst
2613        case is when we have the least common nodes twice.
2614
2615      */
2616
2617     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2618         regnode *noper = NEXTOPER( cur );
2619         const U8 *uc;
2620         const U8 *e;
2621         int foldlen = 0;
2622         U32 wordlen      = 0;         /* required init */
2623         STRLEN minchars = 0;
2624         STRLEN maxchars = 0;
2625         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2626                                                bitmap?*/
2627
2628         if (OP(noper) == NOTHING) {
2629             /* skip past a NOTHING at the start of an alternation
2630              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2631              */
2632             regnode *noper_next= regnext(noper);
2633             if (noper_next < tail)
2634                 noper= noper_next;
2635         }
2636
2637         if ( noper < tail &&
2638                 (
2639                     OP(noper) == flags ||
2640                     (
2641                         flags == EXACTFU &&
2642                         OP(noper) == EXACTFU_SS
2643                     )
2644                 )
2645         ) {
2646             uc= (U8*)STRING(noper);
2647             e= uc + STR_LEN(noper);
2648         } else {
2649             trie->minlen= 0;
2650             continue;
2651         }
2652
2653
2654         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2655             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2656                                           regardless of encoding */
2657             if (OP( noper ) == EXACTFU_SS) {
2658                 /* false positives are ok, so just set this */
2659                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2660             }
2661         }
2662
2663         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2664                                            branch */
2665             TRIE_CHARCOUNT(trie)++;
2666             TRIE_READ_CHAR;
2667
2668             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2669              * is in effect.  Under /i, this character can match itself, or
2670              * anything that folds to it.  If not under /i, it can match just
2671              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2672              * all fold to k, and all are single characters.   But some folds
2673              * expand to more than one character, so for example LATIN SMALL
2674              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2675              * the string beginning at 'uc' is 'ffi', it could be matched by
2676              * three characters, or just by the one ligature character. (It
2677              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2678              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2679              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2680              * match.)  The trie needs to know the minimum and maximum number
2681              * of characters that could match so that it can use size alone to
2682              * quickly reject many match attempts.  The max is simple: it is
2683              * the number of folded characters in this branch (since a fold is
2684              * never shorter than what folds to it. */
2685
2686             maxchars++;
2687
2688             /* And the min is equal to the max if not under /i (indicated by
2689              * 'folder' being NULL), or there are no multi-character folds.  If
2690              * there is a multi-character fold, the min is incremented just
2691              * once, for the character that folds to the sequence.  Each
2692              * character in the sequence needs to be added to the list below of
2693              * characters in the trie, but we count only the first towards the
2694              * min number of characters needed.  This is done through the
2695              * variable 'foldlen', which is returned by the macros that look
2696              * for these sequences as the number of bytes the sequence
2697              * occupies.  Each time through the loop, we decrement 'foldlen' by
2698              * how many bytes the current char occupies.  Only when it reaches
2699              * 0 do we increment 'minchars' or look for another multi-character
2700              * sequence. */
2701             if (folder == NULL) {
2702                 minchars++;
2703             }
2704             else if (foldlen > 0) {
2705                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2706             }
2707             else {
2708                 minchars++;
2709
2710                 /* See if *uc is the beginning of a multi-character fold.  If
2711                  * so, we decrement the length remaining to look at, to account
2712                  * for the current character this iteration.  (We can use 'uc'
2713                  * instead of the fold returned by TRIE_READ_CHAR because for
2714                  * non-UTF, the latin1_safe macro is smart enough to account
2715                  * for all the unfolded characters, and because for UTF, the
2716                  * string will already have been folded earlier in the
2717                  * compilation process */
2718                 if (UTF) {
2719                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2720                         foldlen -= UTF8SKIP(uc);
2721                     }
2722                 }
2723                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2724                     foldlen--;
2725                 }
2726             }
2727
2728             /* The current character (and any potential folds) should be added
2729              * to the possible matching characters for this position in this
2730              * branch */
2731             if ( uvc < 256 ) {
2732                 if ( folder ) {
2733                     U8 folded= folder[ (U8) uvc ];
2734                     if ( !trie->charmap[ folded ] ) {
2735                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2736                         TRIE_STORE_REVCHAR( folded );
2737                     }
2738                 }
2739                 if ( !trie->charmap[ uvc ] ) {
2740                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2741                     TRIE_STORE_REVCHAR( uvc );
2742                 }
2743                 if ( set_bit ) {
2744                     /* store the codepoint in the bitmap, and its folded
2745                      * equivalent. */
2746                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2747                     set_bit = 0; /* We've done our bit :-) */
2748                 }
2749             } else {
2750
2751                 /* XXX We could come up with the list of code points that fold
2752                  * to this using PL_utf8_foldclosures, except not for
2753                  * multi-char folds, as there may be multiple combinations
2754                  * there that could work, which needs to wait until runtime to
2755                  * resolve (The comment about LIGATURE FFI above is such an
2756                  * example */
2757
2758                 SV** svpp;
2759                 if ( !widecharmap )
2760                     widecharmap = newHV();
2761
2762                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2763
2764                 if ( !svpp )
2765                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2766
2767                 if ( !SvTRUE( *svpp ) ) {
2768                     sv_setiv( *svpp, ++trie->uniquecharcount );
2769                     TRIE_STORE_REVCHAR(uvc);
2770                 }
2771             }
2772         } /* end loop through characters in this branch of the trie */
2773
2774         /* We take the min and max for this branch and combine to find the min
2775          * and max for all branches processed so far */
2776         if( cur == first ) {
2777             trie->minlen = minchars;
2778             trie->maxlen = maxchars;
2779         } else if (minchars < trie->minlen) {
2780             trie->minlen = minchars;
2781         } else if (maxchars > trie->maxlen) {
2782             trie->maxlen = maxchars;
2783         }
2784     } /* end first pass */
2785     DEBUG_TRIE_COMPILE_r(
2786         Perl_re_indentf( aTHX_
2787                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2788                 depth+1,
2789                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2790                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2791                 (int)trie->minlen, (int)trie->maxlen )
2792     );
2793
2794     /*
2795         We now know what we are dealing with in terms of unique chars and
2796         string sizes so we can calculate how much memory a naive
2797         representation using a flat table  will take. If it's over a reasonable
2798         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2799         conservative but potentially much slower representation using an array
2800         of lists.
2801
2802         At the end we convert both representations into the same compressed
2803         form that will be used in regexec.c for matching with. The latter
2804         is a form that cannot be used to construct with but has memory
2805         properties similar to the list form and access properties similar
2806         to the table form making it both suitable for fast searches and
2807         small enough that its feasable to store for the duration of a program.
2808
2809         See the comment in the code where the compressed table is produced
2810         inplace from the flat tabe representation for an explanation of how
2811         the compression works.
2812
2813     */
2814
2815
2816     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2817     prev_states[1] = 0;
2818
2819     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2820                                                     > SvIV(re_trie_maxbuff) )
2821     {
2822         /*
2823             Second Pass -- Array Of Lists Representation
2824
2825             Each state will be represented by a list of charid:state records
2826             (reg_trie_trans_le) the first such element holds the CUR and LEN
2827             points of the allocated array. (See defines above).
2828
2829             We build the initial structure using the lists, and then convert
2830             it into the compressed table form which allows faster lookups
2831             (but cant be modified once converted).
2832         */
2833
2834         STRLEN transcount = 1;
2835
2836         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
2837             depth+1));
2838
2839         trie->states = (reg_trie_state *)
2840             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2841                                   sizeof(reg_trie_state) );
2842         TRIE_LIST_NEW(1);
2843         next_alloc = 2;
2844
2845         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2846
2847             regnode *noper   = NEXTOPER( cur );
2848             U32 state        = 1;         /* required init */
2849             U16 charid       = 0;         /* sanity init */
2850             U32 wordlen      = 0;         /* required init */
2851
2852             if (OP(noper) == NOTHING) {
2853                 regnode *noper_next= regnext(noper);
2854                 if (noper_next < tail)
2855                     noper= noper_next;
2856             }
2857
2858             if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2859                 const U8 *uc= (U8*)STRING(noper);
2860                 const U8 *e= uc + STR_LEN(noper);
2861
2862                 for ( ; uc < e ; uc += len ) {
2863
2864                     TRIE_READ_CHAR;
2865
2866                     if ( uvc < 256 ) {
2867                         charid = trie->charmap[ uvc ];
2868                     } else {
2869                         SV** const svpp = hv_fetch( widecharmap,
2870                                                     (char*)&uvc,
2871                                                     sizeof( UV ),
2872                                                     0);
2873                         if ( !svpp ) {
2874                             charid = 0;
2875                         } else {
2876                             charid=(U16)SvIV( *svpp );
2877                         }
2878                     }
2879                     /* charid is now 0 if we dont know the char read, or
2880                      * nonzero if we do */
2881                     if ( charid ) {
2882
2883                         U16 check;
2884                         U32 newstate = 0;
2885
2886                         charid--;
2887                         if ( !trie->states[ state ].trans.list ) {
2888                             TRIE_LIST_NEW( state );
2889                         }
2890                         for ( check = 1;
2891                               check <= TRIE_LIST_USED( state );
2892                               check++ )
2893                         {
2894                             if ( TRIE_LIST_ITEM( state, check ).forid
2895                                                                     == charid )
2896                             {
2897                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2898                                 break;
2899                             }
2900                         }
2901                         if ( ! newstate ) {
2902                             newstate = next_alloc++;
2903                             prev_states[newstate] = state;
2904                             TRIE_LIST_PUSH( state, charid, newstate );
2905                             transcount++;
2906                         }
2907                         state = newstate;
2908                     } else {
2909                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
2910                     }
2911                 }
2912             }
2913             TRIE_HANDLE_WORD(state);
2914
2915         } /* end second pass */
2916
2917         /* next alloc is the NEXT state to be allocated */
2918         trie->statecount = next_alloc;
2919         trie->states = (reg_trie_state *)
2920             PerlMemShared_realloc( trie->states,
2921                                    next_alloc
2922                                    * sizeof(reg_trie_state) );
2923
2924         /* and now dump it out before we compress it */
2925         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2926                                                          revcharmap, next_alloc,
2927                                                          depth+1)
2928         );
2929
2930         trie->trans = (reg_trie_trans *)
2931             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2932         {
2933             U32 state;
2934             U32 tp = 0;
2935             U32 zp = 0;
2936
2937
2938             for( state=1 ; state < next_alloc ; state ++ ) {
2939                 U32 base=0;
2940
2941                 /*
2942                 DEBUG_TRIE_COMPILE_MORE_r(
2943                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
2944                 );
2945                 */
2946
2947                 if (trie->states[state].trans.list) {
2948                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2949                     U16 maxid=minid;
2950                     U16 idx;
2951
2952                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2953                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2954                         if ( forid < minid ) {
2955                             minid=forid;
2956                         } else if ( forid > maxid ) {
2957                             maxid=forid;
2958                         }
2959                     }
2960                     if ( transcount < tp + maxid - minid + 1) {
2961                         transcount *= 2;
2962                         trie->trans = (reg_trie_trans *)
2963                             PerlMemShared_realloc( trie->trans,
2964                                                      transcount
2965                                                      * sizeof(reg_trie_trans) );
2966                         Zero( trie->trans + (transcount / 2),
2967                               transcount / 2,
2968                               reg_trie_trans );
2969                     }
2970                     base = trie->uniquecharcount + tp - minid;
2971                     if ( maxid == minid ) {
2972                         U32 set = 0;
2973                         for ( ; zp < tp ; zp++ ) {
2974                             if ( ! trie->trans[ zp ].next ) {
2975                                 base = trie->uniquecharcount + zp - minid;
2976                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2977                                                                    1).newstate;
2978                                 trie->trans[ zp ].check = state;
2979                                 set = 1;
2980                                 break;
2981                             }
2982                         }
2983                         if ( !set ) {
2984                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2985                                                                    1).newstate;
2986                             trie->trans[ tp ].check = state;
2987                             tp++;
2988                             zp = tp;
2989                         }
2990                     } else {
2991                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2992                             const U32 tid = base
2993                                            - trie->uniquecharcount
2994                                            + TRIE_LIST_ITEM( state, idx ).forid;
2995                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2996                                                                 idx ).newstate;
2997                             trie->trans[ tid ].check = state;
2998                         }
2999                         tp += ( maxid - minid + 1 );
3000                     }
3001                     Safefree(trie->states[ state ].trans.list);
3002                 }
3003                 /*
3004                 DEBUG_TRIE_COMPILE_MORE_r(
3005                     Perl_re_printf( aTHX_  " base: %d\n",base);
3006                 );
3007                 */
3008                 trie->states[ state ].trans.base=base;
3009             }
3010             trie->lasttrans = tp + 1;
3011         }
3012     } else {
3013         /*
3014            Second Pass -- Flat Table Representation.
3015
3016            we dont use the 0 slot of either trans[] or states[] so we add 1 to
3017            each.  We know that we will need Charcount+1 trans at most to store
3018            the data (one row per char at worst case) So we preallocate both
3019            structures assuming worst case.
3020
3021            We then construct the trie using only the .next slots of the entry
3022            structs.
3023
3024            We use the .check field of the first entry of the node temporarily
3025            to make compression both faster and easier by keeping track of how
3026            many non zero fields are in the node.
3027
3028            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3029            transition.
3030
3031            There are two terms at use here: state as a TRIE_NODEIDX() which is
3032            a number representing the first entry of the node, and state as a
3033            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3034            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3035            if there are 2 entrys per node. eg:
3036
3037              A B       A B
3038           1. 2 4    1. 3 7
3039           2. 0 3    3. 0 5
3040           3. 0 0    5. 0 0
3041           4. 0 0    7. 0 0
3042
3043            The table is internally in the right hand, idx form. However as we
3044            also have to deal with the states array which is indexed by nodenum
3045            we have to use TRIE_NODENUM() to convert.
3046
3047         */
3048         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3049             depth+1));
3050
3051         trie->trans = (reg_trie_trans *)
3052             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3053                                   * trie->uniquecharcount + 1,
3054                                   sizeof(reg_trie_trans) );
3055         trie->states = (reg_trie_state *)
3056             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3057                                   sizeof(reg_trie_state) );
3058         next_alloc = trie->uniquecharcount + 1;
3059
3060
3061         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3062
3063             regnode *noper   = NEXTOPER( cur );
3064
3065             U32 state        = 1;         /* required init */
3066
3067             U16 charid       = 0;         /* sanity init */
3068             U32 accept_state = 0;         /* sanity init */
3069
3070             U32 wordlen      = 0;         /* required init */
3071
3072             if (OP(noper) == NOTHING) {
3073                 regnode *noper_next= regnext(noper);
3074                 if (noper_next < tail)
3075                     noper= noper_next;
3076             }
3077
3078             if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3079                 const U8 *uc= (U8*)STRING(noper);
3080                 const U8 *e= uc + STR_LEN(noper);
3081
3082                 for ( ; uc < e ; uc += len ) {
3083
3084                     TRIE_READ_CHAR;
3085
3086                     if ( uvc < 256 ) {
3087                         charid = trie->charmap[ uvc ];
3088                     } else {
3089                         SV* const * const svpp = hv_fetch( widecharmap,
3090                                                            (char*)&uvc,
3091                                                            sizeof( UV ),
3092                                                            0);
3093                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3094                     }
3095                     if ( charid ) {
3096                         charid--;
3097                         if ( !trie->trans[ state + charid ].next ) {
3098                             trie->trans[ state + charid ].next = next_alloc;
3099                             trie->trans[ state ].check++;
3100                             prev_states[TRIE_NODENUM(next_alloc)]
3101                                     = TRIE_NODENUM(state);
3102                             next_alloc += trie->uniquecharcount;
3103                         }
3104                         state = trie->trans[ state + charid ].next;
3105                     } else {
3106                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3107                     }
3108                     /* charid is now 0 if we dont know the char read, or
3109                      * nonzero if we do */
3110                 }
3111             }
3112             accept_state = TRIE_NODENUM( state );
3113             TRIE_HANDLE_WORD(accept_state);
3114
3115         } /* end second pass */
3116
3117         /* and now dump it out before we compress it */
3118         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3119                                                           revcharmap,
3120                                                           next_alloc, depth+1));
3121
3122         {
3123         /*
3124            * Inplace compress the table.*
3125
3126            For sparse data sets the table constructed by the trie algorithm will
3127            be mostly 0/FAIL transitions or to put it another way mostly empty.
3128            (Note that leaf nodes will not contain any transitions.)
3129
3130            This algorithm compresses the tables by eliminating most such
3131            transitions, at the cost of a modest bit of extra work during lookup:
3132
3133            - Each states[] entry contains a .base field which indicates the
3134            index in the state[] array wheres its transition data is stored.
3135
3136            - If .base is 0 there are no valid transitions from that node.
3137
3138            - If .base is nonzero then charid is added to it to find an entry in
3139            the trans array.
3140
3141            -If trans[states[state].base+charid].check!=state then the
3142            transition is taken to be a 0/Fail transition. Thus if there are fail
3143            transitions at the front of the node then the .base offset will point
3144            somewhere inside the previous nodes data (or maybe even into a node
3145            even earlier), but the .check field determines if the transition is
3146            valid.
3147
3148            XXX - wrong maybe?
3149            The following process inplace converts the table to the compressed
3150            table: We first do not compress the root node 1,and mark all its
3151            .check pointers as 1 and set its .base pointer as 1 as well. This
3152            allows us to do a DFA construction from the compressed table later,
3153            and ensures that any .base pointers we calculate later are greater
3154            than 0.
3155
3156            - We set 'pos' to indicate the first entry of the second node.
3157
3158            - We then iterate over the columns of the node, finding the first and
3159            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3160            and set the .check pointers accordingly, and advance pos
3161            appropriately and repreat for the next node. Note that when we copy
3162            the next pointers we have to convert them from the original
3163            NODEIDX form to NODENUM form as the former is not valid post
3164            compression.
3165
3166            - If a node has no transitions used we mark its base as 0 and do not
3167            advance the pos pointer.
3168
3169            - If a node only has one transition we use a second pointer into the
3170            structure to fill in allocated fail transitions from other states.
3171            This pointer is independent of the main pointer and scans forward
3172            looking for null transitions that are allocated to a state. When it
3173            finds one it writes the single transition into the "hole".  If the
3174            pointer doesnt find one the single transition is appended as normal.
3175
3176            - Once compressed we can Renew/realloc the structures to release the
3177            excess space.
3178
3179            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3180            specifically Fig 3.47 and the associated pseudocode.
3181
3182            demq
3183         */
3184         const U32 laststate = TRIE_NODENUM( next_alloc );
3185         U32 state, charid;
3186         U32 pos = 0, zp=0;
3187         trie->statecount = laststate;
3188
3189         for ( state = 1 ; state < laststate ; state++ ) {
3190             U8 flag = 0;
3191             const U32 stateidx = TRIE_NODEIDX( state );
3192             const U32 o_used = trie->trans[ stateidx ].check;
3193             U32 used = trie->trans[ stateidx ].check;
3194             trie->trans[ stateidx ].check = 0;
3195
3196             for ( charid = 0;
3197                   used && charid < trie->uniquecharcount;
3198                   charid++ )
3199             {
3200                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3201                     if ( trie->trans[ stateidx + charid ].next ) {
3202                         if (o_used == 1) {
3203                             for ( ; zp < pos ; zp++ ) {
3204                                 if ( ! trie->trans[ zp ].next ) {
3205                                     break;
3206                                 }
3207                             }
3208                             trie->states[ state ].trans.base
3209                                                     = zp
3210                                                       + trie->uniquecharcount
3211                                                       - charid ;
3212                             trie->trans[ zp ].next
3213                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3214                                                              + charid ].next );
3215                             trie->trans[ zp ].check = state;
3216                             if ( ++zp > pos ) pos = zp;
3217                             break;
3218                         }
3219                         used--;
3220                     }
3221                     if ( !flag ) {
3222                         flag = 1;
3223                         trie->states[ state ].trans.base
3224                                        = pos + trie->uniquecharcount - charid ;
3225                     }
3226                     trie->trans[ pos ].next
3227                         = SAFE_TRIE_NODENUM(
3228                                        trie->trans[ stateidx + charid ].next );
3229                     trie->trans[ pos ].check = state;
3230                     pos++;
3231                 }
3232             }
3233         }
3234         trie->lasttrans = pos + 1;
3235         trie->states = (reg_trie_state *)
3236             PerlMemShared_realloc( trie->states, laststate
3237                                    * sizeof(reg_trie_state) );
3238         DEBUG_TRIE_COMPILE_MORE_r(
3239             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3240                 depth+1,
3241                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3242                        + 1 ),
3243                 (IV)next_alloc,
3244                 (IV)pos,
3245                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3246             );
3247
3248         } /* end table compress */
3249     }
3250     DEBUG_TRIE_COMPILE_MORE_r(
3251             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3252                 depth+1,
3253                 (UV)trie->statecount,
3254                 (UV)trie->lasttrans)
3255     );
3256     /* resize the trans array to remove unused space */
3257     trie->trans = (reg_trie_trans *)
3258         PerlMemShared_realloc( trie->trans, trie->lasttrans
3259                                * sizeof(reg_trie_trans) );
3260
3261     {   /* Modify the program and insert the new TRIE node */
3262         U8 nodetype =(U8)(flags & 0xFF);
3263         char *str=NULL;
3264
3265 #ifdef DEBUGGING
3266         regnode *optimize = NULL;
3267 #ifdef RE_TRACK_PATTERN_OFFSETS
3268
3269         U32 mjd_offset = 0;
3270         U32 mjd_nodelen = 0;
3271 #endif /* RE_TRACK_PATTERN_OFFSETS */
3272 #endif /* DEBUGGING */
3273         /*
3274            This means we convert either the first branch or the first Exact,
3275            depending on whether the thing following (in 'last') is a branch
3276            or not and whther first is the startbranch (ie is it a sub part of
3277            the alternation or is it the whole thing.)
3278            Assuming its a sub part we convert the EXACT otherwise we convert
3279            the whole branch sequence, including the first.
3280          */
3281         /* Find the node we are going to overwrite */
3282         if ( first != startbranch || OP( last ) == BRANCH ) {
3283             /* branch sub-chain */
3284             NEXT_OFF( first ) = (U16)(last - first);
3285 #ifdef RE_TRACK_PATTERN_OFFSETS
3286             DEBUG_r({
3287                 mjd_offset= Node_Offset((convert));
3288                 mjd_nodelen= Node_Length((convert));
3289             });
3290 #endif
3291             /* whole branch chain */
3292         }
3293 #ifdef RE_TRACK_PATTERN_OFFSETS
3294         else {
3295             DEBUG_r({
3296                 const  regnode *nop = NEXTOPER( convert );
3297                 mjd_offset= Node_Offset((nop));
3298                 mjd_nodelen= Node_Length((nop));
3299             });
3300         }
3301         DEBUG_OPTIMISE_r(
3302             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3303                 depth+1,
3304                 (UV)mjd_offset, (UV)mjd_nodelen)
3305         );
3306 #endif
3307         /* But first we check to see if there is a common prefix we can
3308            split out as an EXACT and put in front of the TRIE node.  */
3309         trie->startstate= 1;
3310         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3311             /* we want to find the first state that has more than
3312              * one transition, if that state is not the first state
3313              * then we have a common prefix which we can remove.
3314              */
3315             U32 state;
3316             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3317                 U32 ofs = 0;
3318                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3319                                        transition, -1 means none */
3320                 U32 count = 0;
3321                 const U32 base = trie->states[ state ].trans.base;
3322
3323                 /* does this state terminate an alternation? */
3324                 if ( trie->states[state].wordnum )
3325                         count = 1;
3326
3327                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3328                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3329                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3330                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3331                     {
3332                         if ( ++count > 1 ) {
3333                             /* we have more than one transition */
3334                             SV **tmp;
3335                             U8 *ch;
3336                             /* if this is the first state there is no common prefix
3337                              * to extract, so we can exit */
3338                             if ( state == 1 ) break;
3339                             tmp = av_fetch( revcharmap, ofs, 0);
3340                             ch = (U8*)SvPV_nolen_const( *tmp );
3341
3342                             /* if we are on count 2 then we need to initialize the
3343                              * bitmap, and store the previous char if there was one
3344                              * in it*/
3345                             if ( count == 2 ) {
3346                                 /* clear the bitmap */
3347                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3348                                 DEBUG_OPTIMISE_r(
3349                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3350                                         depth+1,
3351                                         (UV)state));
3352                                 if (first_ofs >= 0) {
3353                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3354                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3355
3356                                     TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
3357                                     DEBUG_OPTIMISE_r(
3358                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3359                                     );
3360                                 }
3361                             }
3362                             /* store the current firstchar in the bitmap */
3363                             TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
3364                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3365                         }
3366                         first_ofs = ofs;
3367                     }
3368                 }
3369                 if ( count == 1 ) {
3370                     /* This state has only one transition, its transition is part
3371                      * of a common prefix - we need to concatenate the char it
3372                      * represents to what we have so far. */
3373                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3374                     STRLEN len;
3375                     char *ch = SvPV( *tmp, len );
3376                     DEBUG_OPTIMISE_r({
3377                         SV *sv=sv_newmortal();
3378                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3379                             depth+1,
3380                             (UV)state, (UV)first_ofs,
3381                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3382                                 PL_colors[0], PL_colors[1],
3383                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3384                                 PERL_PV_ESCAPE_FIRSTCHAR
3385                             )
3386                         );
3387                     });
3388                     if ( state==1 ) {
3389                         OP( convert ) = nodetype;
3390                         str=STRING(convert);
3391                         STR_LEN(convert)=0;
3392                     }
3393                     STR_LEN(convert) += len;
3394                     while (len--)
3395                         *str++ = *ch++;
3396                 } else {
3397 #ifdef DEBUGGING
3398                     if (state>1)
3399                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3400 #endif
3401                     break;
3402                 }
3403             }
3404             trie->prefixlen = (state-1);
3405             if (str) {
3406                 regnode *n = convert+NODE_SZ_STR(convert);
3407                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3408                 trie->startstate = state;
3409                 trie->minlen -= (state - 1);
3410                 trie->maxlen -= (state - 1);
3411 #ifdef DEBUGGING
3412                /* At least the UNICOS C compiler choked on this
3413                 * being argument to DEBUG_r(), so let's just have
3414                 * it right here. */
3415                if (
3416 #ifdef PERL_EXT_RE_BUILD
3417                    1
3418 #else
3419                    DEBUG_r_TEST
3420 #endif
3421                    ) {
3422                    regnode *fix = convert;
3423                    U32 word = trie->wordcount;
3424                    mjd_nodelen++;
3425                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3426                    while( ++fix < n ) {
3427                        Set_Node_Offset_Length(fix, 0, 0);
3428                    }
3429                    while (word--) {
3430                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3431                        if (tmp) {
3432                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3433                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3434                            else
3435                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3436                        }
3437                    }
3438                }
3439 #endif
3440                 if (trie->maxlen) {
3441                     convert = n;
3442                 } else {
3443                     NEXT_OFF(convert) = (U16)(tail - convert);
3444                     DEBUG_r(optimize= n);
3445                 }
3446             }
3447         }
3448         if (!jumper)
3449             jumper = last;
3450         if ( trie->maxlen ) {
3451             NEXT_OFF( convert ) = (U16)(tail - convert);
3452             ARG_SET( convert, data_slot );
3453             /* Store the offset to the first unabsorbed branch in
3454                jump[0], which is otherwise unused by the jump logic.
3455                We use this when dumping a trie and during optimisation. */
3456             if (trie->jump)
3457                 trie->jump[0] = (U16)(nextbranch - convert);
3458
3459             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3460              *   and there is a bitmap
3461              *   and the first "jump target" node we found leaves enough room
3462              * then convert the TRIE node into a TRIEC node, with the bitmap
3463              * embedded inline in the opcode - this is hypothetically faster.
3464              */
3465             if ( !trie->states[trie->startstate].wordnum
3466                  && trie->bitmap
3467                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3468             {
3469                 OP( convert ) = TRIEC;
3470                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3471                 PerlMemShared_free(trie->bitmap);
3472                 trie->bitmap= NULL;
3473             } else
3474                 OP( convert ) = TRIE;
3475
3476             /* store the type in the flags */
3477             convert->flags = nodetype;
3478             DEBUG_r({
3479             optimize = convert
3480                       + NODE_STEP_REGNODE
3481                       + regarglen[ OP( convert ) ];
3482             });
3483             /* XXX We really should free up the resource in trie now,
3484                    as we won't use them - (which resources?) dmq */
3485         }
3486         /* needed for dumping*/
3487         DEBUG_r(if (optimize) {
3488             regnode *opt = convert;
3489
3490             while ( ++opt < optimize) {
3491                 Set_Node_Offset_Length(opt,0,0);
3492             }
3493             /*
3494                 Try to clean up some of the debris left after the
3495                 optimisation.
3496              */
3497             while( optimize < jumper ) {
3498                 mjd_nodelen += Node_Length((optimize));
3499                 OP( optimize ) = OPTIMIZED;
3500                 Set_Node_Offset_Length(optimize,0,0);
3501                 optimize++;
3502             }
3503             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3504         });
3505     } /* end node insert */
3506
3507     /*  Finish populating the prev field of the wordinfo array.  Walk back
3508      *  from each accept state until we find another accept state, and if
3509      *  so, point the first word's .prev field at the second word. If the
3510      *  second already has a .prev field set, stop now. This will be the
3511      *  case either if we've already processed that word's accept state,
3512      *  or that state had multiple words, and the overspill words were
3513      *  already linked up earlier.
3514      */
3515     {
3516         U16 word;
3517         U32 state;
3518         U16 prev;
3519
3520         for (word=1; word <= trie->wordcount; word++) {
3521             prev = 0;
3522             if (trie->wordinfo[word].prev)
3523                 continue;
3524             state = trie->wordinfo[word].accept;
3525             while (state) {
3526                 state = prev_states[state];
3527                 if (!state)
3528                     break;
3529                 prev = trie->states[state].wordnum;
3530                 if (prev)
3531                     break;
3532             }
3533             trie->wordinfo[word].prev = prev;
3534         }
3535         Safefree(prev_states);
3536     }
3537
3538
3539     /* and now dump out the compressed format */
3540     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3541
3542     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3543 #ifdef DEBUGGING
3544     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3545     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3546 #else
3547     SvREFCNT_dec_NN(revcharmap);
3548 #endif
3549     return trie->jump
3550            ? MADE_JUMP_TRIE
3551            : trie->startstate>1
3552              ? MADE_EXACT_TRIE
3553              : MADE_TRIE;
3554 }
3555
3556 STATIC regnode *
3557 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3558 {
3559 /* The Trie is constructed and compressed now so we can build a fail array if
3560  * it's needed
3561
3562    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3563    3.32 in the
3564    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3565    Ullman 1985/88
3566    ISBN 0-201-10088-6
3567
3568    We find the fail state for each state in the trie, this state is the longest
3569    proper suffix of the current state's 'word' that is also a proper prefix of
3570    another word in our trie. State 1 represents the word '' and is thus the
3571    default fail state. This allows the DFA not to have to restart after its
3572    tried and failed a word at a given point, it simply continues as though it
3573    had been matching the other word in the first place.
3574    Consider
3575       'abcdgu'=~/abcdefg|cdgu/
3576    When we get to 'd' we are still matching the first word, we would encounter
3577    'g' which would fail, which would bring us to the state representing 'd' in
3578    the second word where we would try 'g' and succeed, proceeding to match
3579    'cdgu'.
3580  */
3581  /* add a fail transition */
3582     const U32 trie_offset = ARG(source);
3583     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3584     U32 *q;
3585     const U32 ucharcount = trie->uniquecharcount;
3586     const U32 numstates = trie->statecount;
3587     const U32 ubound = trie->lasttrans + ucharcount;
3588     U32 q_read = 0;
3589     U32 q_write = 0;
3590     U32 charid;
3591     U32 base = trie->states[ 1 ].trans.base;
3592     U32 *fail;
3593     reg_ac_data *aho;
3594     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3595     regnode *stclass;
3596     GET_RE_DEBUG_FLAGS_DECL;
3597
3598     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3599     PERL_UNUSED_CONTEXT;
3600 #ifndef DEBUGGING
3601     PERL_UNUSED_ARG(depth);
3602 #endif
3603
3604     if ( OP(source) == TRIE ) {
3605         struct regnode_1 *op = (struct regnode_1 *)
3606             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3607         StructCopy(source,op,struct regnode_1);
3608         stclass = (regnode *)op;
3609     } else {
3610         struct regnode_charclass *op = (struct regnode_charclass *)
3611             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3612         StructCopy(source,op,struct regnode_charclass);
3613         stclass = (regnode *)op;
3614     }
3615     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3616
3617     ARG_SET( stclass, data_slot );
3618     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3619     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3620     aho->trie=trie_offset;
3621     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3622     Copy( trie->states, aho->states, numstates, reg_trie_state );
3623     Newx( q, numstates, U32);
3624     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3625     aho->refcount = 1;
3626     fail = aho->fail;
3627     /* initialize fail[0..1] to be 1 so that we always have
3628        a valid final fail state */
3629     fail[ 0 ] = fail[ 1 ] = 1;
3630
3631     for ( charid = 0; charid < ucharcount ; charid++ ) {
3632         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3633         if ( newstate ) {
3634             q[ q_write ] = newstate;
3635             /* set to point at the root */
3636             fail[ q[ q_write++ ] ]=1;
3637         }
3638     }
3639     while ( q_read < q_write) {
3640         const U32 cur = q[ q_read++ % numstates ];
3641         base = trie->states[ cur ].trans.base;
3642
3643         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3644             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3645             if (ch_state) {
3646                 U32 fail_state = cur;
3647                 U32 fail_base;
3648                 do {
3649                     fail_state = fail[ fail_state ];
3650                     fail_base = aho->states[ fail_state ].trans.base;
3651                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3652
3653                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3654                 fail[ ch_state ] = fail_state;
3655                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3656                 {
3657                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3658                 }
3659                 q[ q_write++ % numstates] = ch_state;
3660             }
3661         }
3662     }
3663     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3664        when we fail in state 1, this allows us to use the
3665        charclass scan to find a valid start char. This is based on the principle
3666        that theres a good chance the string being searched contains lots of stuff
3667        that cant be a start char.
3668      */
3669     fail[ 0 ] = fail[ 1 ] = 0;
3670     DEBUG_TRIE_COMPILE_r({
3671         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3672                       depth, (UV)numstates
3673         );
3674         for( q_read=1; q_read<numstates; q_read++ ) {
3675             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3676         }
3677         Perl_re_printf( aTHX_  "\n");
3678     });
3679     Safefree(q);
3680     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3681     return stclass;
3682 }
3683
3684
3685 /* The below joins as many adjacent EXACTish nodes as possible into a single
3686  * one.  The regop may be changed if the node(s) contain certain sequences that
3687  * require special handling.  The joining is only done if:
3688  * 1) there is room in the current conglomerated node to entirely contain the
3689  *    next one.
3690  * 2) they are the exact same node type
3691  *
3692  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3693  * these get optimized out
3694  *
3695  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3696  * as possible, even if that means splitting an existing node so that its first
3697  * part is moved to the preceeding node.  This would maximise the efficiency of
3698  * memEQ during matching.  Elsewhere in this file, khw proposes splitting
3699  * EXACTFish nodes into portions that don't change under folding vs those that
3700  * do.  Those portions that don't change may be the only things in the pattern that
3701  * could be used to find fixed and floating strings.
3702  *
3703  * If a node is to match under /i (folded), the number of characters it matches
3704  * can be different than its character length if it contains a multi-character
3705  * fold.  *min_subtract is set to the total delta number of characters of the
3706  * input nodes.
3707  *
3708  * And *unfolded_multi_char is set to indicate whether or not the node contains
3709  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3710  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3711  * SMALL LETTER SHARP S, as only if the target string being matched against
3712  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3713  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3714  * whose components are all above the Latin1 range are not run-time locale
3715  * dependent, and have already been folded by the time this function is
3716  * called.)
3717  *
3718  * This is as good a place as any to discuss the design of handling these
3719  * multi-character fold sequences.  It's been wrong in Perl for a very long
3720  * time.  There are three code points in Unicode whose multi-character folds
3721  * were long ago discovered to mess things up.  The previous designs for
3722  * dealing with these involved assigning a special node for them.  This
3723  * approach doesn't always work, as evidenced by this example:
3724  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3725  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3726  * would match just the \xDF, it won't be able to handle the case where a
3727  * successful match would have to cross the node's boundary.  The new approach
3728  * that hopefully generally solves the problem generates an EXACTFU_SS node
3729  * that is "sss" in this case.
3730  *
3731  * It turns out that there are problems with all multi-character folds, and not
3732  * just these three.  Now the code is general, for all such cases.  The
3733  * approach taken is:
3734  * 1)   This routine examines each EXACTFish node that could contain multi-
3735  *      character folded sequences.  Since a single character can fold into
3736  *      such a sequence, the minimum match length for this node is less than
3737  *      the number of characters in the node.  This routine returns in
3738  *      *min_subtract how many characters to subtract from the the actual
3739  *      length of the string to get a real minimum match length; it is 0 if
3740  *      there are no multi-char foldeds.  This delta is used by the caller to
3741  *      adjust the min length of the match, and the delta between min and max,
3742  *      so that the optimizer doesn't reject these possibilities based on size
3743  *      constraints.
3744  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3745  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3746  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3747  *      there is a possible fold length change.  That means that a regular
3748  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3749  *      with length changes, and so can be processed faster.  regexec.c takes
3750  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3751  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3752  *      known until runtime).  This saves effort in regex matching.  However,
3753  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3754  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3755  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3756  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3757  *      possibilities for the non-UTF8 patterns are quite simple, except for
3758  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3759  *      members of a fold-pair, and arrays are set up for all of them so that
3760  *      the other member of the pair can be found quickly.  Code elsewhere in
3761  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3762  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3763  *      described in the next item.
3764  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3765  *      validity of the fold won't be known until runtime, and so must remain
3766  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3767  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3768  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3769  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3770  *      The reason this is a problem is that the optimizer part of regexec.c
3771  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3772  *      that a character in the pattern corresponds to at most a single
3773  *      character in the target string.  (And I do mean character, and not byte
3774  *      here, unlike other parts of the documentation that have never been
3775  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3776  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3777  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3778  *      nodes, violate the assumption, and they are the only instances where it
3779  *      is violated.  I'm reluctant to try to change the assumption, as the
3780  *      code involved is impenetrable to me (khw), so instead the code here
3781  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3782  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3783  *      boolean indicating whether or not the node contains such a fold.  When
3784  *      it is true, the caller sets a flag that later causes the optimizer in
3785  *      this file to not set values for the floating and fixed string lengths,
3786  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3787  *      assumption.  Thus, there is no optimization based on string lengths for
3788  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3789  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3790  *      assumption is wrong only in these cases is that all other non-UTF-8
3791  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3792  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3793  *      EXACTF nodes because we don't know at compile time if it actually
3794  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3795  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3796  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3797  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3798  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3799  *      string would require the pattern to be forced into UTF-8, the overhead
3800  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3801  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3802  *      locale.)
3803  *
3804  *      Similarly, the code that generates tries doesn't currently handle
3805  *      not-already-folded multi-char folds, and it looks like a pain to change
3806  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3807  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3808  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3809  *      using /iaa matching will be doing so almost entirely with ASCII
3810  *      strings, so this should rarely be encountered in practice */
3811
3812 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3813     if (PL_regkind[OP(scan)] == EXACT) \
3814         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3815
3816 STATIC U32
3817 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3818                    UV *min_subtract, bool *unfolded_multi_char,
3819                    U32 flags,regnode *val, U32 depth)
3820 {
3821     /* Merge several consecutive EXACTish nodes into one. */
3822     regnode *n = regnext(scan);
3823     U32 stringok = 1;
3824     regnode *next = scan + NODE_SZ_STR(scan);
3825     U32 merged = 0;
3826     U32 stopnow = 0;
3827 #ifdef DEBUGGING
3828     regnode *stop = scan;
3829     GET_RE_DEBUG_FLAGS_DECL;
3830 #else
3831     PERL_UNUSED_ARG(depth);
3832 #endif
3833
3834     PERL_ARGS_ASSERT_JOIN_EXACT;
3835 #ifndef EXPERIMENTAL_INPLACESCAN
3836     PERL_UNUSED_ARG(flags);
3837     PERL_UNUSED_ARG(val);
3838 #endif
3839     DEBUG_PEEP("join", scan, depth, 0);
3840
3841     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3842      * EXACT ones that are mergeable to the current one. */
3843     while (n
3844            && (PL_regkind[OP(n)] == NOTHING
3845                || (stringok && OP(n) == OP(scan)))
3846            && NEXT_OFF(n)
3847            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3848     {
3849
3850         if (OP(n) == TAIL || n > next)
3851             stringok = 0;
3852         if (PL_regkind[OP(n)] == NOTHING) {
3853             DEBUG_PEEP("skip:", n, depth, 0);
3854             NEXT_OFF(scan) += NEXT_OFF(n);
3855             next = n + NODE_STEP_REGNODE;
3856 #ifdef DEBUGGING
3857             if (stringok)
3858                 stop = n;
3859 #endif
3860             n = regnext(n);
3861         }
3862         else if (stringok) {
3863             const unsigned int oldl = STR_LEN(scan);
3864             regnode * const nnext = regnext(n);
3865
3866             /* XXX I (khw) kind of doubt that this works on platforms (should
3867              * Perl ever run on one) where U8_MAX is above 255 because of lots
3868              * of other assumptions */
3869             /* Don't join if the sum can't fit into a single node */
3870             if (oldl + STR_LEN(n) > U8_MAX)
3871                 break;
3872
3873             DEBUG_PEEP("merg", n, depth, 0);
3874             merged++;
3875
3876             NEXT_OFF(scan) += NEXT_OFF(n);
3877             STR_LEN(scan) += STR_LEN(n);
3878             next = n + NODE_SZ_STR(n);
3879             /* Now we can overwrite *n : */
3880             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3881 #ifdef DEBUGGING
3882             stop = next - 1;
3883 #endif
3884             n = nnext;
3885             if (stopnow) break;
3886         }
3887
3888 #ifdef EXPERIMENTAL_INPLACESCAN
3889         if (flags && !NEXT_OFF(n)) {
3890             DEBUG_PEEP("atch", val, depth, 0);
3891             if (reg_off_by_arg[OP(n)]) {
3892                 ARG_SET(n, val - n);
3893             }
3894             else {
3895                 NEXT_OFF(n) = val - n;
3896             }
3897             stopnow = 1;
3898         }
3899 #endif
3900     }
3901
3902     *min_subtract = 0;
3903     *unfolded_multi_char = FALSE;
3904
3905     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3906      * can now analyze for sequences of problematic code points.  (Prior to
3907      * this final joining, sequences could have been split over boundaries, and
3908      * hence missed).  The sequences only happen in folding, hence for any
3909      * non-EXACT EXACTish node */
3910     if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3911         U8* s0 = (U8*) STRING(scan);
3912         U8* s = s0;
3913         U8* s_end = s0 + STR_LEN(scan);
3914
3915         int total_count_delta = 0;  /* Total delta number of characters that
3916                                        multi-char folds expand to */
3917
3918         /* One pass is made over the node's string looking for all the
3919          * possibilities.  To avoid some tests in the loop, there are two main
3920          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3921          * non-UTF-8 */
3922         if (UTF) {
3923             U8* folded = NULL;
3924
3925             if (OP(scan) == EXACTFL) {
3926                 U8 *d;
3927
3928                 /* An EXACTFL node would already have been changed to another
3929                  * node type unless there is at least one character in it that
3930                  * is problematic; likely a character whose fold definition
3931                  * won't be known until runtime, and so has yet to be folded.
3932                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3933                  * to handle the UTF-8 case, we need to create a temporary
3934                  * folded copy using UTF-8 locale rules in order to analyze it.
3935                  * This is because our macros that look to see if a sequence is
3936                  * a multi-char fold assume everything is folded (otherwise the
3937                  * tests in those macros would be too complicated and slow).
3938                  * Note that here, the non-problematic folds will have already
3939                  * been done, so we can just copy such characters.  We actually
3940                  * don't completely fold the EXACTFL string.  We skip the
3941                  * unfolded multi-char folds, as that would just create work
3942                  * below to figure out the size they already are */
3943
3944                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3945                 d = folded;
3946                 while (s < s_end) {
3947                     STRLEN s_len = UTF8SKIP(s);
3948                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3949                         Copy(s, d, s_len, U8);
3950                         d += s_len;
3951                     }
3952                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3953                         *unfolded_multi_char = TRUE;
3954                         Copy(s, d, s_len, U8);
3955                         d += s_len;
3956                     }
3957                     else if (isASCII(*s)) {
3958                         *(d++) = toFOLD(*s);
3959                     }
3960                     else {
3961                         STRLEN len;
3962                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
3963                         d += len;
3964                     }
3965                     s += s_len;
3966                 }
3967
3968                 /* Point the remainder of the routine to look at our temporary
3969                  * folded copy */
3970                 s = folded;
3971                 s_end = d;
3972             } /* End of creating folded copy of EXACTFL string */
3973
3974             /* Examine the string for a multi-character fold sequence.  UTF-8
3975              * patterns have all characters pre-folded by the time this code is
3976              * executed */
3977             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3978                                      length sequence we are looking for is 2 */
3979             {
3980                 int count = 0;  /* How many characters in a multi-char fold */
3981                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3982                 if (! len) {    /* Not a multi-char fold: get next char */
3983                     s += UTF8SKIP(s);
3984                     continue;
3985                 }
3986
3987                 /* Nodes with 'ss' require special handling, except for
3988                  * EXACTFA-ish for which there is no multi-char fold to this */
3989                 if (len == 2 && *s == 's' && *(s+1) == 's'
3990                     && OP(scan) != EXACTFA
3991                     && OP(scan) != EXACTFA_NO_TRIE)
3992                 {
3993                     count = 2;
3994                     if (OP(scan) != EXACTFL) {
3995                         OP(scan) = EXACTFU_SS;
3996                     }
3997                     s += 2;
3998                 }
3999                 else { /* Here is a generic multi-char fold. */
4000                     U8* multi_end  = s + len;
4001
4002                     /* Count how many characters are in it.  In the case of
4003                      * /aa, no folds which contain ASCII code points are
4004                      * allowed, so check for those, and skip if found. */
4005                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
4006                         count = utf8_length(s, multi_end);
4007                         s = multi_end;
4008                     }
4009                     else {
4010                         while (s < multi_end) {
4011                             if (isASCII(*s)) {
4012                                 s++;
4013                                 goto next_iteration;
4014                             }
4015                             else {
4016                                 s += UTF8SKIP(s);
4017                             }
4018                             count++;
4019                         }
4020                     }
4021                 }
4022
4023                 /* The delta is how long the sequence is minus 1 (1 is how long
4024                  * the character that folds to the sequence is) */
4025                 total_count_delta += count - 1;
4026               next_iteration: ;
4027             }
4028
4029             /* We created a temporary folded copy of the string in EXACTFL
4030              * nodes.  Therefore we need to be sure it doesn't go below zero,
4031              * as the real string could be shorter */
4032             if (OP(scan) == EXACTFL) {
4033                 int total_chars = utf8_length((U8*) STRING(scan),
4034                                            (U8*) STRING(scan) + STR_LEN(scan));
4035                 if (total_count_delta > total_chars) {
4036                     total_count_delta = total_chars;
4037                 }
4038             }
4039
4040             *min_subtract += total_count_delta;
4041             Safefree(folded);
4042         }
4043         else if (OP(scan) == EXACTFA) {
4044
4045             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
4046              * fold to the ASCII range (and there are no existing ones in the
4047              * upper latin1 range).  But, as outlined in the comments preceding
4048              * this function, we need to flag any occurrences of the sharp s.
4049              * This character forbids trie formation (because of added
4050              * complexity) */
4051 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4052    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4053                                       || UNICODE_DOT_DOT_VERSION > 0)
4054             while (s < s_end) {
4055                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4056                     OP(scan) = EXACTFA_NO_TRIE;
4057                     *unfolded_multi_char = TRUE;
4058                     break;
4059                 }
4060                 s++;
4061             }
4062         }
4063         else {
4064
4065             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
4066              * folds that are all Latin1.  As explained in the comments
4067              * preceding this function, we look also for the sharp s in EXACTF
4068              * and EXACTFL nodes; it can be in the final position.  Otherwise
4069              * we can stop looking 1 byte earlier because have to find at least
4070              * two characters for a multi-fold */
4071             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4072                               ? s_end
4073                               : s_end -1;
4074
4075             while (s < upper) {
4076                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4077                 if (! len) {    /* Not a multi-char fold. */
4078                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4079                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4080                     {
4081                         *unfolded_multi_char = TRUE;
4082                     }
4083                     s++;
4084                     continue;
4085                 }
4086
4087                 if (len == 2
4088                     && isALPHA_FOLD_EQ(*s, 's')
4089                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4090                 {
4091
4092                     /* EXACTF nodes need to know that the minimum length
4093                      * changed so that a sharp s in the string can match this
4094                      * ss in the pattern, but they remain EXACTF nodes, as they
4095                      * won't match this unless the target string is is UTF-8,
4096                      * which we don't know until runtime.  EXACTFL nodes can't
4097                      * transform into EXACTFU nodes */
4098                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4099                         OP(scan) = EXACTFU_SS;
4100                     }
4101                 }
4102
4103                 *min_subtract += len - 1;
4104                 s += len;
4105             }
4106 #endif
4107         }
4108     }
4109
4110 #ifdef DEBUGGING
4111     /* Allow dumping but overwriting the collection of skipped
4112      * ops and/or strings with fake optimized ops */
4113     n = scan + NODE_SZ_STR(scan);
4114     while (n <= stop) {
4115         OP(n) = OPTIMIZED;
4116         FLAGS(n) = 0;
4117         NEXT_OFF(n) = 0;
4118         n++;
4119     }
4120 #endif
4121     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4122     return stopnow;
4123 }
4124
4125 /* REx optimizer.  Converts nodes into quicker variants "in place".
4126    Finds fixed substrings.  */
4127
4128 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4129    to the position after last scanned or to NULL. */
4130
4131 #define INIT_AND_WITHP \
4132     assert(!and_withp); \
4133     Newx(and_withp,1, regnode_ssc); \
4134     SAVEFREEPV(and_withp)
4135
4136
4137 static void
4138 S_unwind_scan_frames(pTHX_ const void *p)
4139 {
4140     scan_frame *f= (scan_frame *)p;
4141     do {
4142         scan_frame *n= f->next_frame;
4143         Safefree(f);
4144         f= n;
4145     } while (f);
4146 }
4147
4148 /* the return from this sub is the minimum length that could possibly match */
4149 STATIC SSize_t
4150 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4151                         SSize_t *minlenp, SSize_t *deltap,
4152                         regnode *last,
4153                         scan_data_t *data,
4154                         I32 stopparen,
4155                         U32 recursed_depth,
4156                         regnode_ssc *and_withp,
4157                         U32 flags, U32 depth)
4158                         /* scanp: Start here (read-write). */
4159                         /* deltap: Write maxlen-minlen here. */
4160                         /* last: Stop before this one. */
4161                         /* data: string data about the pattern */
4162                         /* stopparen: treat close N as END */
4163                         /* recursed: which subroutines have we recursed into */
4164                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4165 {
4166     /* There must be at least this number of characters to match */
4167     SSize_t min = 0;
4168     I32 pars = 0, code;
4169     regnode *scan = *scanp, *next;
4170     SSize_t delta = 0;
4171     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4172     int is_inf_internal = 0;            /* The studied chunk is infinite */
4173     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4174     scan_data_t data_fake;
4175     SV *re_trie_maxbuff = NULL;
4176     regnode *first_non_open = scan;
4177     SSize_t stopmin = SSize_t_MAX;
4178     scan_frame *frame = NULL;
4179     GET_RE_DEBUG_FLAGS_DECL;
4180
4181     PERL_ARGS_ASSERT_STUDY_CHUNK;
4182     RExC_study_started= 1;
4183
4184     Zero(&data_fake, 1, scan_data_t);
4185
4186     if ( depth == 0 ) {
4187         while (first_non_open && OP(first_non_open) == OPEN)
4188             first_non_open=regnext(first_non_open);
4189     }
4190
4191
4192   fake_study_recurse:
4193     DEBUG_r(
4194         RExC_study_chunk_recursed_count++;
4195     );
4196     DEBUG_OPTIMISE_MORE_r(
4197     {
4198         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4199             depth, (long)stopparen,
4200             (unsigned long)RExC_study_chunk_recursed_count,
4201             (unsigned long)depth, (unsigned long)recursed_depth,
4202             scan,
4203             last);
4204         if (recursed_depth) {
4205             U32 i;
4206             U32 j;
4207             for ( j = 0 ; j < recursed_depth ; j++ ) {
4208                 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
4209                     if (
4210                         PAREN_TEST(RExC_study_chunk_recursed +
4211                                    ( j * RExC_study_chunk_recursed_bytes), i )
4212                         && (
4213                             !j ||
4214                             !PAREN_TEST(RExC_study_chunk_recursed +
4215                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4216                         )
4217                     ) {
4218                         Perl_re_printf( aTHX_ " %d",(int)i);
4219                         break;
4220                     }
4221                 }
4222                 if ( j + 1 < recursed_depth ) {
4223                     Perl_re_printf( aTHX_  ",");
4224                 }
4225             }
4226         }
4227         Perl_re_printf( aTHX_ "\n");
4228     }
4229     );
4230     while ( scan && OP(scan) != END && scan < last ){
4231         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4232                                    node length to get a real minimum (because
4233                                    the folded version may be shorter) */
4234         bool unfolded_multi_char = FALSE;
4235         /* Peephole optimizer: */
4236         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4237         DEBUG_PEEP("Peep", scan, depth, flags);
4238
4239
4240         /* The reason we do this here is that we need to deal with things like
4241          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4242          * parsing code, as each (?:..) is handled by a different invocation of
4243          * reg() -- Yves
4244          */
4245         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4246
4247         /* Follow the next-chain of the current node and optimize
4248            away all the NOTHINGs from it.  */
4249         if (OP(scan) != CURLYX) {
4250             const int max = (reg_off_by_arg[OP(scan)]
4251                        ? I32_MAX
4252                        /* I32 may be smaller than U16 on CRAYs! */
4253                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4254             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4255             int noff;
4256             regnode *n = scan;
4257
4258             /* Skip NOTHING and LONGJMP. */
4259             while ((n = regnext(n))
4260                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4261                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4262                    && off + noff < max)
4263                 off += noff;
4264             if (reg_off_by_arg[OP(scan)])
4265                 ARG(scan) = off;
4266             else
4267                 NEXT_OFF(scan) = off;
4268         }
4269
4270         /* The principal pseudo-switch.  Cannot be a switch, since we
4271            look into several different things.  */
4272         if ( OP(scan) == DEFINEP ) {
4273             SSize_t minlen = 0;
4274             SSize_t deltanext = 0;
4275             SSize_t fake_last_close = 0;
4276             I32 f = SCF_IN_DEFINE;
4277
4278             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4279             scan = regnext(scan);
4280             assert( OP(scan) == IFTHEN );
4281             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4282
4283             data_fake.last_closep= &fake_last_close;
4284             minlen = *minlenp;
4285             next = regnext(scan);
4286             scan = NEXTOPER(NEXTOPER(scan));
4287             DEBUG_PEEP("scan", scan, depth, flags);
4288             DEBUG_PEEP("next", next, depth, flags);
4289
4290             /* we suppose the run is continuous, last=next...
4291              * NOTE we dont use the return here! */
4292             /* DEFINEP study_chunk() recursion */
4293             (void)study_chunk(pRExC_state, &scan, &minlen,
4294                               &deltanext, next, &data_fake, stopparen,
4295                               recursed_depth, NULL, f, depth+1);
4296
4297             scan = next;
4298         } else
4299         if (
4300             OP(scan) == BRANCH  ||
4301             OP(scan) == BRANCHJ ||
4302             OP(scan) == IFTHEN
4303         ) {
4304             next = regnext(scan);
4305             code = OP(scan);
4306
4307             /* The op(next)==code check below is to see if we
4308              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4309              * IFTHEN is special as it might not appear in pairs.
4310              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4311              * we dont handle it cleanly. */
4312             if (OP(next) == code || code == IFTHEN) {
4313                 /* NOTE - There is similar code to this block below for
4314                  * handling TRIE nodes on a re-study.  If you change stuff here
4315                  * check there too. */
4316                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4317                 regnode_ssc accum;
4318                 regnode * const startbranch=scan;
4319
4320                 if (flags & SCF_DO_SUBSTR) {
4321                     /* Cannot merge strings after this. */
4322                     scan_commit(pRExC_state, data, minlenp, is_inf);
4323                 }
4324
4325                 if (flags & SCF_DO_STCLASS)
4326                     ssc_init_zero(pRExC_state, &accum);
4327
4328                 while (OP(scan) == code) {
4329                     SSize_t deltanext, minnext, fake;
4330                     I32 f = 0;
4331                     regnode_ssc this_class;
4332
4333                     DEBUG_PEEP("Branch", scan, depth, flags);
4334
4335                     num++;
4336                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4337                     if (data) {
4338                         data_fake.whilem_c = data->whilem_c;
4339                         data_fake.last_closep = data->last_closep;
4340                     }
4341                     else
4342                         data_fake.last_closep = &fake;
4343
4344                     data_fake.pos_delta = delta;
4345                     next = regnext(scan);
4346
4347                     scan = NEXTOPER(scan); /* everything */
4348                     if (code != BRANCH)    /* everything but BRANCH */
4349                         scan = NEXTOPER(scan);
4350
4351                     if (flags & SCF_DO_STCLASS) {
4352                         ssc_init(pRExC_state, &this_class);
4353                         data_fake.start_class = &this_class;
4354                         f = SCF_DO_STCLASS_AND;
4355                     }
4356                     if (flags & SCF_WHILEM_VISITED_POS)
4357                         f |= SCF_WHILEM_VISITED_POS;
4358
4359                     /* we suppose the run is continuous, last=next...*/
4360                     /* recurse study_chunk() for each BRANCH in an alternation */
4361                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4362                                       &deltanext, next, &data_fake, stopparen,
4363                                       recursed_depth, NULL, f,depth+1);
4364
4365                     if (min1 > minnext)
4366                         min1 = minnext;
4367                     if (deltanext == SSize_t_MAX) {
4368                         is_inf = is_inf_internal = 1;
4369                         max1 = SSize_t_MAX;
4370                     } else if (max1 < minnext + deltanext)
4371                         max1 = minnext + deltanext;
4372                     scan = next;
4373                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4374                         pars++;
4375                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4376                         if ( stopmin > minnext)
4377                             stopmin = min + min1;
4378                         flags &= ~SCF_DO_SUBSTR;
4379                         if (data)
4380                             data->flags |= SCF_SEEN_ACCEPT;
4381                     }
4382                     if (data) {
4383                         if (data_fake.flags & SF_HAS_EVAL)
4384                             data->flags |= SF_HAS_EVAL;
4385                         data->whilem_c = data_fake.whilem_c;
4386                     }
4387                     if (flags & SCF_DO_STCLASS)
4388                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4389                 }
4390                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4391                     min1 = 0;
4392                 if (flags & SCF_DO_SUBSTR) {
4393                     data->pos_min += min1;
4394                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4395                         data->pos_delta = SSize_t_MAX;
4396                     else
4397                         data->pos_delta += max1 - min1;
4398                     if (max1 != min1 || is_inf)
4399                         data->cur_is_floating = 1;
4400                 }
4401                 min += min1;
4402                 if (delta == SSize_t_MAX
4403                  || SSize_t_MAX - delta - (max1 - min1) < 0)
4404                     delta = SSize_t_MAX;
4405                 else
4406                     delta += max1 - min1;
4407                 if (flags & SCF_DO_STCLASS_OR) {
4408                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4409                     if (min1) {
4410                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4411                         flags &= ~SCF_DO_STCLASS;
4412                     }
4413                 }
4414                 else if (flags & SCF_DO_STCLASS_AND) {
4415                     if (min1) {
4416                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4417                         flags &= ~SCF_DO_STCLASS;
4418                     }
4419                     else {
4420                         /* Switch to OR mode: cache the old value of
4421                          * data->start_class */
4422                         INIT_AND_WITHP;
4423                         StructCopy(data->start_class, and_withp, regnode_ssc);
4424                         flags &= ~SCF_DO_STCLASS_AND;
4425                         StructCopy(&accum, data->start_class, regnode_ssc);
4426                         flags |= SCF_DO_STCLASS_OR;
4427                     }
4428                 }
4429
4430                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4431                         OP( startbranch ) == BRANCH )
4432                 {
4433                 /* demq.
4434
4435                    Assuming this was/is a branch we are dealing with: 'scan'
4436                    now points at the item that follows the branch sequence,
4437                    whatever it is. We now start at the beginning of the
4438                    sequence and look for subsequences of
4439
4440                    BRANCH->EXACT=>x1
4441                    BRANCH->EXACT=>x2
4442                    tail
4443
4444                    which would be constructed from a pattern like
4445                    /A|LIST|OF|WORDS/
4446
4447                    If we can find such a subsequence we need to turn the first
4448                    element into a trie and then add the subsequent branch exact
4449                    strings to the trie.
4450
4451                    We have two cases
4452
4453                      1. patterns where the whole set of branches can be
4454                         converted.
4455
4456                      2. patterns where only a subset can be converted.
4457
4458                    In case 1 we can replace the whole set with a single regop
4459                    for the trie. In case 2 we need to keep the start and end
4460                    branches so
4461
4462                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4463                      becomes BRANCH TRIE; BRANCH X;
4464
4465                   There is an additional case, that being where there is a
4466                   common prefix, which gets split out into an EXACT like node
4467                   preceding the TRIE node.
4468
4469                   If x(1..n)==tail then we can do a simple trie, if not we make
4470                   a "jump" trie, such that when we match the appropriate word
4471                   we "jump" to the appropriate tail node. Essentially we turn
4472                   a nested if into a case structure of sorts.
4473
4474                 */
4475
4476                     int made=0;
4477                     if (!re_trie_maxbuff) {
4478                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4479                         if (!SvIOK(re_trie_maxbuff))
4480                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4481                     }
4482                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4483                         regnode *cur;
4484                         regnode *first = (regnode *)NULL;
4485                         regnode *last = (regnode *)NULL;
4486                         regnode *tail = scan;
4487                         U8 trietype = 0;
4488                         U32 count=0;
4489
4490                         /* var tail is used because there may be a TAIL
4491                            regop in the way. Ie, the exacts will point to the
4492                            thing following the TAIL, but the last branch will
4493                            point at the TAIL. So we advance tail. If we
4494                            have nested (?:) we may have to move through several
4495                            tails.
4496                          */
4497
4498                         while ( OP( tail ) == TAIL ) {
4499                             /* this is the TAIL generated by (?:) */
4500                             tail = regnext( tail );
4501                         }
4502
4503
4504                         DEBUG_TRIE_COMPILE_r({
4505                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4506                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4507                               depth+1,
4508                               "Looking for TRIE'able sequences. Tail node is ",
4509                               (UV)(tail - RExC_emit_start),
4510                               SvPV_nolen_const( RExC_mysv )
4511                             );
4512                         });
4513
4514                         /*
4515
4516                             Step through the branches
4517                                 cur represents each branch,
4518                                 noper is the first thing to be matched as part
4519                                       of that branch
4520                                 noper_next is the regnext() of that node.
4521
4522                             We normally handle a case like this
4523                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4524                             support building with NOJUMPTRIE, which restricts
4525                             the trie logic to structures like /FOO|BAR/.
4526
4527                             If noper is a trieable nodetype then the branch is
4528                             a possible optimization target. If we are building
4529                             under NOJUMPTRIE then we require that noper_next is
4530                             the same as scan (our current position in the regex
4531                             program).
4532
4533                             Once we have two or more consecutive such branches
4534                             we can create a trie of the EXACT's contents and
4535                             stitch it in place into the program.
4536
4537                             If the sequence represents all of the branches in
4538                             the alternation we replace the entire thing with a
4539                             single TRIE node.
4540
4541                             Otherwise when it is a subsequence we need to
4542                             stitch it in place and replace only the relevant
4543                             branches. This means the first branch has to remain
4544                             as it is used by the alternation logic, and its
4545                             next pointer, and needs to be repointed at the item
4546                             on the branch chain following the last branch we
4547                             have optimized away.
4548
4549                             This could be either a BRANCH, in which case the
4550                             subsequence is internal, or it could be the item
4551                             following the branch sequence in which case the
4552                             subsequence is at the end (which does not
4553                             necessarily mean the first node is the start of the
4554                             alternation).
4555
4556                             TRIE_TYPE(X) is a define which maps the optype to a
4557                             trietype.
4558
4559                                 optype          |  trietype
4560                                 ----------------+-----------
4561                                 NOTHING         | NOTHING
4562                                 EXACT           | EXACT
4563                                 EXACTFU         | EXACTFU
4564                                 EXACTFU_SS      | EXACTFU
4565                                 EXACTFA         | EXACTFA
4566                                 EXACTL          | EXACTL
4567                                 EXACTFLU8       | EXACTFLU8
4568
4569
4570                         */
4571 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4572                        ? NOTHING                                            \
4573                        : ( EXACT == (X) )                                   \
4574                          ? EXACT                                            \
4575                          : ( EXACTFU == (X) || EXACTFU_SS == (X) )          \
4576                            ? EXACTFU                                        \
4577                            : ( EXACTFA == (X) )                             \
4578                              ? EXACTFA                                      \
4579                              : ( EXACTL == (X) )                            \
4580                                ? EXACTL                                     \
4581                                : ( EXACTFLU8 == (X) )                        \
4582                                  ? EXACTFLU8                                 \
4583                                  : 0 )
4584
4585                         /* dont use tail as the end marker for this traverse */
4586                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4587                             regnode * const noper = NEXTOPER( cur );
4588                             U8 noper_type = OP( noper );
4589                             U8 noper_trietype = TRIE_TYPE( noper_type );
4590 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4591                             regnode * const noper_next = regnext( noper );
4592                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4593                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4594 #endif
4595
4596                             DEBUG_TRIE_COMPILE_r({
4597                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4598                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
4599                                    depth+1,
4600                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4601
4602                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4603                                 Perl_re_printf( aTHX_  " -> %d:%s",
4604                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4605
4606                                 if ( noper_next ) {
4607                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4608                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4609                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4610                                 }
4611                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4612                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4613                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4614                                 );
4615                             });
4616
4617                             /* Is noper a trieable nodetype that can be merged
4618                              * with the current trie (if there is one)? */
4619                             if ( noper_trietype
4620                                   &&
4621                                   (
4622                                         ( noper_trietype == NOTHING )
4623                                         || ( trietype == NOTHING )
4624                                         || ( trietype == noper_trietype )
4625                                   )
4626 #ifdef NOJUMPTRIE
4627                                   && noper_next >= tail
4628 #endif
4629                                   && count < U16_MAX)
4630                             {
4631                                 /* Handle mergable triable node Either we are
4632                                  * the first node in a new trieable sequence,
4633                                  * in which case we do some bookkeeping,
4634                                  * otherwise we update the end pointer. */
4635                                 if ( !first ) {
4636                                     first = cur;
4637                                     if ( noper_trietype == NOTHING ) {
4638 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4639                                         regnode * const noper_next = regnext( noper );
4640                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4641                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4642 #endif
4643
4644                                         if ( noper_next_trietype ) {
4645                                             trietype = noper_next_trietype;
4646                                         } else if (noper_next_type)  {
4647                                             /* a NOTHING regop is 1 regop wide.
4648                                              * We need at least two for a trie
4649                                              * so we can't merge this in */
4650                                             first = NULL;
4651                                         }
4652                                     } else {
4653                                         trietype = noper_trietype;
4654                                     }
4655                                 } else {
4656                                     if ( trietype == NOTHING )
4657                                         trietype = noper_trietype;
4658                                     last = cur;
4659                                 }
4660                                 if (first)
4661                                     count++;
4662                             } /* end handle mergable triable node */
4663                             else {
4664                                 /* handle unmergable node -
4665                                  * noper may either be a triable node which can
4666                                  * not be tried together with the current trie,
4667                                  * or a non triable node */
4668                                 if ( last ) {
4669                                     /* If last is set and trietype is not
4670                                      * NOTHING then we have found at least two
4671                                      * triable branch sequences in a row of a
4672                                      * similar trietype so we can turn them
4673                                      * into a trie. If/when we allow NOTHING to
4674                                      * start a trie sequence this condition
4675                                      * will be required, and it isn't expensive
4676                                      * so we leave it in for now. */
4677                                     if ( trietype && trietype != NOTHING )
4678                                         make_trie( pRExC_state,
4679                                                 startbranch, first, cur, tail,
4680                                                 count, trietype, depth+1 );
4681                                     last = NULL; /* note: we clear/update
4682                                                     first, trietype etc below,
4683                                                     so we dont do it here */
4684                                 }
4685                                 if ( noper_trietype
4686 #ifdef NOJUMPTRIE
4687                                      && noper_next >= tail
4688 #endif
4689                                 ){
4690                                     /* noper is triable, so we can start a new
4691                                      * trie sequence */
4692                                     count = 1;
4693                                     first = cur;
4694                                     trietype = noper_trietype;
4695                                 } else if (first) {
4696                                     /* if we already saw a first but the
4697                                      * current node is not triable then we have
4698                                      * to reset the first information. */
4699                                     count = 0;
4700                                     first = NULL;
4701                                     trietype = 0;
4702                                 }
4703                             } /* end handle unmergable node */
4704                         } /* loop over branches */
4705                         DEBUG_TRIE_COMPILE_r({
4706                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4707                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
4708                               depth+1, SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4709                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
4710                                REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4711                                PL_reg_name[trietype]
4712                             );
4713
4714                         });
4715                         if ( last && trietype ) {
4716                             if ( trietype != NOTHING ) {
4717                                 /* the last branch of the sequence was part of
4718                                  * a trie, so we have to construct it here
4719                                  * outside of the loop */
4720                                 made= make_trie( pRExC_state, startbranch,
4721                                                  first, scan, tail, count,
4722                                                  trietype, depth+1 );
4723 #ifdef TRIE_STUDY_OPT
4724                                 if ( ((made == MADE_EXACT_TRIE &&
4725                                      startbranch == first)
4726                                      || ( first_non_open == first )) &&
4727                                      depth==0 ) {
4728                                     flags |= SCF_TRIE_RESTUDY;
4729                                     if ( startbranch == first
4730                                          && scan >= tail )
4731                                     {
4732                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4733                                     }
4734                                 }
4735 #endif
4736                             } else {
4737                                 /* at this point we know whatever we have is a
4738                                  * NOTHING sequence/branch AND if 'startbranch'
4739                                  * is 'first' then we can turn the whole thing
4740                                  * into a NOTHING
4741                                  */
4742                                 if ( startbranch == first ) {
4743                                     regnode *opt;
4744                                     /* the entire thing is a NOTHING sequence,
4745                                      * something like this: (?:|) So we can
4746                                      * turn it into a plain NOTHING op. */
4747                                     DEBUG_TRIE_COMPILE_r({
4748                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4749                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
4750                                           depth+1,
4751                                           SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4752
4753                                     });
4754                                     OP(startbranch)= NOTHING;
4755                                     NEXT_OFF(startbranch)= tail - startbranch;
4756                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4757                                         OP(opt)= OPTIMIZED;
4758                                 }
4759                             }
4760                         } /* end if ( last) */
4761                     } /* TRIE_MAXBUF is non zero */
4762
4763                 } /* do trie */
4764
4765             }
4766             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4767                 scan = NEXTOPER(NEXTOPER(scan));
4768             } else                      /* single branch is optimized. */
4769                 scan = NEXTOPER(scan);
4770             continue;
4771         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
4772             I32 paren = 0;
4773             regnode *start = NULL;
4774             regnode *end = NULL;
4775             U32 my_recursed_depth= recursed_depth;
4776
4777             if (OP(scan) != SUSPEND) { /* GOSUB */
4778                 /* Do setup, note this code has side effects beyond
4779                  * the rest of this block. Specifically setting
4780                  * RExC_recurse[] must happen at least once during
4781                  * study_chunk(). */
4782                 paren = ARG(scan);
4783                 RExC_recurse[ARG2L(scan)] = scan;
4784                 start = RExC_open_parens[paren];
4785                 end   = RExC_close_parens[paren];
4786
4787                 /* NOTE we MUST always execute the above code, even
4788                  * if we do nothing with a GOSUB */
4789                 if (
4790                     ( flags & SCF_IN_DEFINE )
4791                     ||
4792                     (
4793                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4794                         &&
4795                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4796                     )
4797                 ) {
4798                     /* no need to do anything here if we are in a define. */
4799                     /* or we are after some kind of infinite construct
4800                      * so we can skip recursing into this item.
4801                      * Since it is infinite we will not change the maxlen
4802                      * or delta, and if we miss something that might raise
4803                      * the minlen it will merely pessimise a little.
4804                      *
4805                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4806                      * might result in a minlen of 1 and not of 4,
4807                      * but this doesn't make us mismatch, just try a bit
4808                      * harder than we should.
4809                      * */
4810                     scan= regnext(scan);
4811                     continue;
4812                 }
4813
4814                 if (
4815                     !recursed_depth
4816                     ||
4817                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4818                 ) {
4819                     /* it is quite possible that there are more efficient ways
4820                      * to do this. We maintain a bitmap per level of recursion
4821                      * of which patterns we have entered so we can detect if a
4822                      * pattern creates a possible infinite loop. When we
4823                      * recurse down a level we copy the previous levels bitmap
4824                      * down. When we are at recursion level 0 we zero the top
4825                      * level bitmap. It would be nice to implement a different
4826                      * more efficient way of doing this. In particular the top
4827                      * level bitmap may be unnecessary.
4828                      */
4829                     if (!recursed_depth) {
4830                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4831                     } else {
4832                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4833                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4834                              RExC_study_chunk_recursed_bytes, U8);
4835                     }
4836                     /* we havent recursed into this paren yet, so recurse into it */
4837                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
4838                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4839                     my_recursed_depth= recursed_depth + 1;
4840                 } else {
4841                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
4842                     /* some form of infinite recursion, assume infinite length
4843                      * */
4844                     if (flags & SCF_DO_SUBSTR) {
4845                         scan_commit(pRExC_state, data, minlenp, is_inf);
4846                         data->cur_is_floating = 1;
4847                     }
4848                     is_inf = is_inf_internal = 1;
4849                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4850                         ssc_anything(data->start_class);
4851                     flags &= ~SCF_DO_STCLASS;
4852
4853                     start= NULL; /* reset start so we dont recurse later on. */
4854                 }
4855             } else {
4856                 paren = stopparen;
4857                 start = scan + 2;
4858                 end = regnext(scan);
4859             }
4860             if (start) {
4861                 scan_frame *newframe;
4862                 assert(end);
4863                 if (!RExC_frame_last) {
4864                     Newxz(newframe, 1, scan_frame);
4865                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4866                     RExC_frame_head= newframe;
4867                     RExC_frame_count++;
4868                 } else if (!RExC_frame_last->next_frame) {
4869                     Newxz(newframe,1,scan_frame);
4870                     RExC_frame_last->next_frame= newframe;
4871                     newframe->prev_frame= RExC_frame_last;
4872                     RExC_frame_count++;
4873                 } else {
4874                     newframe= RExC_frame_last->next_frame;
4875                 }
4876                 RExC_frame_last= newframe;
4877
4878                 newframe->next_regnode = regnext(scan);
4879                 newframe->last_regnode = last;
4880                 newframe->stopparen = stopparen;
4881                 newframe->prev_recursed_depth = recursed_depth;
4882                 newframe->this_prev_frame= frame;
4883
4884                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
4885                 DEBUG_PEEP("fnew", scan, depth, flags);
4886
4887                 frame = newframe;
4888                 scan =  start;
4889                 stopparen = paren;
4890                 last = end;
4891                 depth = depth + 1;
4892                 recursed_depth= my_recursed_depth;
4893
4894                 continue;
4895             }
4896         }
4897         else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4898             SSize_t l = STR_LEN(scan);
4899             UV uc;
4900             assert(l);
4901             if (UTF) {
4902                 const U8 * const s = (U8*)STRING(scan);
4903                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4904                 l = utf8_length(s, s + l);
4905             } else {
4906                 uc = *((U8*)STRING(scan));
4907             }
4908             min += l;
4909             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4910                 /* The code below prefers earlier match for fixed
4911                    offset, later match for variable offset.  */
4912                 if (data->last_end == -1) { /* Update the start info. */
4913                     data->last_start_min = data->pos_min;
4914                     data->last_start_max = is_inf
4915                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4916                 }
4917                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4918                 if (UTF)
4919                     SvUTF8_on(data->last_found);
4920                 {
4921                     SV * const sv = data->last_found;
4922                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4923                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4924                     if (mg && mg->mg_len >= 0)
4925                         mg->mg_len += utf8_length((U8*)STRING(scan),
4926                                               (U8*)STRING(scan)+STR_LEN(scan));
4927                 }
4928                 data->last_end = data->pos_min + l;
4929                 data->pos_min += l; /* As in the first entry. */
4930                 data->flags &= ~SF_BEFORE_EOL;
4931             }
4932
4933             /* ANDing the code point leaves at most it, and not in locale, and
4934              * can't match null string */
4935             if (flags & SCF_DO_STCLASS_AND) {
4936                 ssc_cp_and(data->start_class, uc);
4937                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4938                 ssc_clear_locale(data->start_class);
4939             }
4940             else if (flags & SCF_DO_STCLASS_OR) {
4941                 ssc_add_cp(data->start_class, uc);
4942                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4943
4944                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4945                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4946             }
4947             flags &= ~SCF_DO_STCLASS;
4948         }
4949         else if (PL_regkind[OP(scan)] == EXACT) {
4950             /* But OP != EXACT!, so is EXACTFish */
4951             SSize_t l = STR_LEN(scan);
4952             const U8 * s = (U8*)STRING(scan);
4953
4954             /* Search for fixed substrings supports EXACT only. */
4955             if (flags & SCF_DO_SUBSTR) {
4956                 assert(data);
4957                 scan_commit(pRExC_state, data, minlenp, is_inf);
4958             }
4959             if (UTF) {
4960                 l = utf8_length(s, s + l);
4961             }
4962             if (unfolded_multi_char) {
4963                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4964             }
4965             min += l - min_subtract;
4966             assert (min >= 0);
4967             delta += min_subtract;
4968             if (flags & SCF_DO_SUBSTR) {
4969                 data->pos_min += l - min_subtract;
4970                 if (data->pos_min < 0) {
4971                     data->pos_min = 0;
4972                 }
4973                 data->pos_delta += min_subtract;
4974                 if (min_subtract) {
4975                     data->cur_is_floating = 1; /* float */
4976                 }
4977             }
4978
4979             if (flags & SCF_DO_STCLASS) {
4980                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4981
4982                 assert(EXACTF_invlist);
4983                 if (flags & SCF_DO_STCLASS_AND) {
4984                     if (OP(scan) != EXACTFL)
4985                         ssc_clear_locale(data->start_class);
4986                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4987                     ANYOF_POSIXL_ZERO(data->start_class);
4988                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4989                 }
4990                 else {  /* SCF_DO_STCLASS_OR */
4991                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4992                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4993
4994                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4995                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4996                 }
4997                 flags &= ~SCF_DO_STCLASS;
4998                 SvREFCNT_dec(EXACTF_invlist);
4999             }
5000         }
5001         else if (REGNODE_VARIES(OP(scan))) {
5002             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5003             I32 fl = 0, f = flags;
5004             regnode * const oscan = scan;
5005             regnode_ssc this_class;
5006             regnode_ssc *oclass = NULL;
5007             I32 next_is_eval = 0;
5008
5009             switch (PL_regkind[OP(scan)]) {
5010             case WHILEM:                /* End of (?:...)* . */
5011                 scan = NEXTOPER(scan);
5012                 goto finish;
5013             case PLUS:
5014                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5015                     next = NEXTOPER(scan);
5016                     if (OP(next) == EXACT
5017                         || OP(next) == EXACTL
5018                         || (flags & SCF_DO_STCLASS))
5019                     {
5020                         mincount = 1;
5021                         maxcount = REG_INFTY;
5022                         next = regnext(scan);
5023                         scan = NEXTOPER(scan);
5024                         goto do_curly;
5025                     }
5026                 }
5027                 if (flags & SCF_DO_SUBSTR)
5028                     data->pos_min++;
5029                 min++;
5030                 /* FALLTHROUGH */
5031             case STAR:
5032                 if (flags & SCF_DO_STCLASS) {
5033                     mincount = 0;
5034                     maxcount = REG_INFTY;
5035                     next = regnext(scan);
5036                     scan = NEXTOPER(scan);
5037                     goto do_curly;
5038                 }
5039                 if (flags & SCF_DO_SUBSTR) {
5040                     scan_commit(pRExC_state, data, minlenp, is_inf);
5041                     /* Cannot extend fixed substrings */
5042                     data->cur_is_floating = 1; /* float */
5043                 }
5044                 is_inf = is_inf_internal = 1;
5045                 scan = regnext(scan);
5046                 goto optimize_curly_tail;
5047             case CURLY:
5048                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5049                     && (scan->flags == stopparen))
5050                 {
5051                     mincount = 1;
5052                     maxcount = 1;
5053                 } else {
5054                     mincount = ARG1(scan);
5055                     maxcount = ARG2(scan);
5056                 }
5057                 next = regnext(scan);
5058                 if (OP(scan) == CURLYX) {
5059                     I32 lp = (data ? *(data->last_closep) : 0);
5060                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5061                 }
5062                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5063                 next_is_eval = (OP(scan) == EVAL);
5064               do_curly:
5065                 if (flags & SCF_DO_SUBSTR) {
5066                     if (mincount == 0)
5067                         scan_commit(pRExC_state, data, minlenp, is_inf);
5068                     /* Cannot extend fixed substrings */
5069                     pos_before = data->pos_min;
5070                 }
5071                 if (data) {
5072                     fl = data->flags;
5073                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5074                     if (is_inf)
5075                         data->flags |= SF_IS_INF;
5076                 }
5077                 if (flags & SCF_DO_STCLASS) {
5078                     ssc_init(pRExC_state, &this_class);
5079                     oclass = data->start_class;
5080                     data->start_class = &this_class;
5081                     f |= SCF_DO_STCLASS_AND;
5082                     f &= ~SCF_DO_STCLASS_OR;
5083                 }
5084                 /* Exclude from super-linear cache processing any {n,m}
5085                    regops for which the combination of input pos and regex
5086                    pos is not enough information to determine if a match
5087                    will be possible.
5088
5089                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
5090                    regex pos at the \s*, the prospects for a match depend not
5091                    only on the input position but also on how many (bar\s*)
5092                    repeats into the {4,8} we are. */
5093                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5094                     f &= ~SCF_WHILEM_VISITED_POS;
5095
5096                 /* This will finish on WHILEM, setting scan, or on NULL: */
5097                 /* recurse study_chunk() on loop bodies */
5098                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5099                                   last, data, stopparen, recursed_depth, NULL,
5100                                   (mincount == 0
5101                                    ? (f & ~SCF_DO_SUBSTR)
5102                                    : f)
5103                                   ,depth+1);
5104
5105                 if (flags & SCF_DO_STCLASS)
5106                     data->start_class = oclass;
5107                 if (mincount == 0 || minnext == 0) {
5108                     if (flags & SCF_DO_STCLASS_OR) {
5109                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5110                     }
5111                     else if (flags & SCF_DO_STCLASS_AND) {
5112                         /* Switch to OR mode: cache the old value of
5113                          * data->start_class */
5114                         INIT_AND_WITHP;
5115                         StructCopy(data->start_class, and_withp, regnode_ssc);
5116                         flags &= ~SCF_DO_STCLASS_AND;
5117                         StructCopy(&this_class, data->start_class, regnode_ssc);
5118                         flags |= SCF_DO_STCLASS_OR;
5119                         ANYOF_FLAGS(data->start_class)
5120                                                 |= SSC_MATCHES_EMPTY_STRING;
5121                     }
5122                 } else {                /* Non-zero len */
5123                     if (flags & SCF_DO_STCLASS_OR) {
5124                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5125                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5126                     }
5127                     else if (flags & SCF_DO_STCLASS_AND)
5128                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5129                     flags &= ~SCF_DO_STCLASS;
5130                 }
5131                 if (!scan)              /* It was not CURLYX, but CURLY. */
5132                     scan = next;
5133                 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5134                     /* ? quantifier ok, except for (?{ ... }) */
5135                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
5136                     && (minnext == 0) && (deltanext == 0)
5137                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5138                     && maxcount <= REG_INFTY/3) /* Complement check for big
5139                                                    count */
5140                 {
5141                     /* Fatal warnings may leak the regexp without this: */
5142                     SAVEFREESV(RExC_rx_sv);
5143                     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5144                         "Quantifier unexpected on zero-length expression "
5145                         "in regex m/%" UTF8f "/",
5146                          UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5147                                   RExC_precomp));
5148                     (void)ReREFCNT_inc(RExC_rx_sv);
5149                 }
5150
5151                 min += minnext * mincount;
5152                 is_inf_internal |= deltanext == SSize_t_MAX
5153                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5154                 is_inf |= is_inf_internal;
5155                 if (is_inf) {
5156                     delta = SSize_t_MAX;
5157                 } else {
5158                     delta += (minnext + deltanext) * maxcount
5159                              - minnext * mincount;
5160                 }
5161                 /* Try powerful optimization CURLYX => CURLYN. */
5162                 if (  OP(oscan) == CURLYX && data
5163                       && data->flags & SF_IN_PAR
5164                       && !(data->flags & SF_HAS_EVAL)
5165                       && !deltanext && minnext == 1 ) {
5166                     /* Try to optimize to CURLYN.  */
5167                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5168                     regnode * const nxt1 = nxt;
5169 #ifdef DEBUGGING
5170                     regnode *nxt2;
5171 #endif
5172
5173                     /* Skip open. */
5174                     nxt = regnext(nxt);
5175                     if (!REGNODE_SIMPLE(OP(nxt))
5176                         && !(PL_regkind[OP(nxt)] == EXACT
5177                              && STR_LEN(nxt) == 1))
5178                         goto nogo;
5179 #ifdef DEBUGGING
5180                     nxt2 = nxt;
5181 #endif
5182                     nxt = regnext(nxt);
5183                     if (OP(nxt) != CLOSE)
5184                         goto nogo;
5185                     if (RExC_open_parens) {
5186                         RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5187                         RExC_close_parens[ARG(nxt1)]=nxt+2; /*close->while*/
5188                     }
5189                     /* Now we know that nxt2 is the only contents: */
5190                     oscan->flags = (U8)ARG(nxt);
5191                     OP(oscan) = CURLYN;
5192                     OP(nxt1) = NOTHING; /* was OPEN. */
5193
5194 #ifdef DEBUGGING
5195                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5196                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5197                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5198                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
5199                     OP(nxt + 1) = OPTIMIZED; /* was count. */
5200                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5201 #endif
5202                 }
5203               nogo:
5204
5205                 /* Try optimization CURLYX => CURLYM. */
5206                 if (  OP(oscan) == CURLYX && data
5207                       && !(data->flags & SF_HAS_PAR)
5208                       && !(data->flags & SF_HAS_EVAL)
5209                       && !deltanext     /* atom is fixed width */
5210                       && minnext != 0   /* CURLYM can't handle zero width */
5211
5212                          /* Nor characters whose fold at run-time may be
5213                           * multi-character */
5214                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5215                 ) {
5216                     /* XXXX How to optimize if data == 0? */
5217                     /* Optimize to a simpler form.  */
5218                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5219                     regnode *nxt2;
5220
5221                     OP(oscan) = CURLYM;
5222                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5223                             && (OP(nxt2) != WHILEM))
5224                         nxt = nxt2;
5225                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5226                     /* Need to optimize away parenths. */
5227                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5228                         /* Set the parenth number.  */
5229                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5230
5231                         oscan->flags = (U8)ARG(nxt);
5232                         if (RExC_open_parens) {
5233                             RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5234                             RExC_close_parens[ARG(nxt1)]=nxt2+1; /*close->NOTHING*/
5235                         }
5236                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
5237                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
5238
5239 #ifdef DEBUGGING
5240                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5241                         OP(nxt + 1) = OPTIMIZED; /* was count. */
5242                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5243                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5244 #endif
5245 #if 0
5246                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
5247                             regnode *nnxt = regnext(nxt1);
5248                             if (nnxt == nxt) {
5249                                 if (reg_off_by_arg[OP(nxt1)])
5250                                     ARG_SET(nxt1, nxt2 - nxt1);
5251                                 else if (nxt2 - nxt1 < U16_MAX)
5252                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
5253                                 else
5254                                     OP(nxt) = NOTHING;  /* Cannot beautify */
5255                             }
5256                             nxt1 = nnxt;
5257                         }
5258 #endif
5259                         /* Optimize again: */
5260                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5261                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5262                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
5263                     }
5264                     else
5265                         oscan->flags = 0;
5266                 }
5267                 else if ((OP(oscan) == CURLYX)
5268                          && (flags & SCF_WHILEM_VISITED_POS)
5269                          /* See the comment on a similar expression above.
5270                             However, this time it's not a subexpression
5271                             we care about, but the expression itself. */
5272                          && (maxcount == REG_INFTY)
5273                          && data) {
5274                     /* This stays as CURLYX, we can put the count/of pair. */
5275                     /* Find WHILEM (as in regexec.c) */
5276                     regnode *nxt = oscan + NEXT_OFF(oscan);
5277
5278                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5279                         nxt += ARG(nxt);
5280                     nxt = PREVOPER(nxt);
5281                     if (nxt->flags & 0xf) {
5282                         /* we've already set whilem count on this node */
5283                     } else if (++data->whilem_c < 16) {
5284                         assert(data->whilem_c <= RExC_whilem_seen);
5285                         nxt->flags = (U8)(data->whilem_c
5286                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5287                     }
5288                 }
5289                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5290                     pars++;
5291                 if (flags & SCF_DO_SUBSTR) {
5292                     SV *last_str = NULL;
5293                     STRLEN last_chrs = 0;
5294                     int counted = mincount != 0;
5295
5296                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5297                                                                   string. */
5298                         SSize_t b = pos_before >= data->last_start_min
5299                             ? pos_before : data->last_start_min;
5300                         STRLEN l;
5301                         const char * const s = SvPV_const(data->last_found, l);
5302                         SSize_t old = b - data->last_start_min;
5303
5304                         if (UTF)
5305                             old = utf8_hop((U8*)s, old) - (U8*)s;
5306                         l -= old;
5307                         /* Get the added string: */
5308                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5309                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5310                                             (U8*)(s + old + l)) : l;
5311                         if (deltanext == 0 && pos_before == b) {
5312                             /* What was added is a constant string */
5313                             if (mincount > 1) {
5314
5315                                 SvGROW(last_str, (mincount * l) + 1);
5316                                 repeatcpy(SvPVX(last_str) + l,
5317                                           SvPVX_const(last_str), l,
5318                                           mincount - 1);
5319                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5320                                 /* Add additional parts. */
5321                                 SvCUR_set(data->last_found,
5322                                           SvCUR(data->last_found) - l);
5323                                 sv_catsv(data->last_found, last_str);
5324                                 {
5325                                     SV * sv = data->last_found;
5326                                     MAGIC *mg =
5327                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5328                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5329                                     if (mg && mg->mg_len >= 0)
5330                                         mg->mg_len += last_chrs * (mincount-1);
5331                                 }
5332                                 last_chrs *= mincount;
5333                                 data->last_end += l * (mincount - 1);
5334                             }
5335                         } else {
5336                             /* start offset must point into the last copy */
5337                             data->last_start_min += minnext * (mincount - 1);
5338                             data->last_start_max =
5339                               is_inf
5340                                ? SSize_t_MAX
5341                                : data->last_start_max +
5342                                  (maxcount - 1) * (minnext + data->pos_delta);
5343                         }
5344                     }
5345                     /* It is counted once already... */
5346                     data->pos_min += minnext * (mincount - counted);
5347 #if 0
5348 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5349                               " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5350                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5351     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5352     (UV)mincount);
5353 if (deltanext != SSize_t_MAX)
5354 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5355     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5356           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5357 #endif
5358                     if (deltanext == SSize_t_MAX
5359                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5360                         data->pos_delta = SSize_t_MAX;
5361                     else
5362                         data->pos_delta += - counted * deltanext +
5363                         (minnext + deltanext) * maxcount - minnext * mincount;
5364                     if (mincount != maxcount) {
5365                          /* Cannot extend fixed substrings found inside
5366                             the group.  */
5367                         scan_commit(pRExC_state, data, minlenp, is_inf);
5368                         if (mincount && last_str) {
5369                             SV * const sv = data->last_found;
5370                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5371                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5372
5373                             if (mg)
5374                                 mg->mg_len = -1;
5375                             sv_setsv(sv, last_str);
5376                             data->last_end = data->pos_min;
5377                             data->last_start_min = data->pos_min - last_chrs;
5378                             data->last_start_max = is_inf
5379                                 ? SSize_t_MAX
5380                                 : data->pos_min + data->pos_delta - last_chrs;
5381                         }
5382                         data->cur_is_floating = 1; /* float */
5383                     }
5384                     SvREFCNT_dec(last_str);
5385                 }
5386                 if (data && (fl & SF_HAS_EVAL))
5387                     data->flags |= SF_HAS_EVAL;
5388               optimize_curly_tail:
5389                 if (OP(oscan) != CURLYX) {
5390                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5391                            && NEXT_OFF(next))
5392                         NEXT_OFF(oscan) += NEXT_OFF(next);
5393                 }
5394                 continue;
5395
5396             default:
5397 #ifdef DEBUGGING
5398                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5399                                                                     OP(scan));
5400 #endif
5401             case REF:
5402             case CLUMP:
5403                 if (flags & SCF_DO_SUBSTR) {
5404                     /* Cannot expect anything... */
5405                     scan_commit(pRExC_state, data, minlenp, is_inf);
5406                     data->cur_is_floating = 1; /* float */
5407                 }
5408                 is_inf = is_inf_internal = 1;
5409                 if (flags & SCF_DO_STCLASS_OR) {
5410                     if (OP(scan) == CLUMP) {
5411                         /* Actually is any start char, but very few code points
5412                          * aren't start characters */
5413                         ssc_match_all_cp(data->start_class);
5414                     }
5415                     else {
5416                         ssc_anything(data->start_class);
5417                     }
5418                 }
5419                 flags &= ~SCF_DO_STCLASS;
5420                 break;
5421             }
5422         }
5423         else if (OP(scan) == LNBREAK) {
5424             if (flags & SCF_DO_STCLASS) {
5425                 if (flags & SCF_DO_STCLASS_AND) {
5426                     ssc_intersection(data->start_class,
5427                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5428                     ssc_clear_locale(data->start_class);
5429                     ANYOF_FLAGS(data->start_class)
5430                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5431                 }
5432                 else if (flags & SCF_DO_STCLASS_OR) {
5433                     ssc_union(data->start_class,
5434                               PL_XPosix_ptrs[_CC_VERTSPACE],
5435                               FALSE);
5436                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5437
5438                     /* See commit msg for
5439                      * 749e076fceedeb708a624933726e7989f2302f6a */
5440                     ANYOF_FLAGS(data->start_class)
5441                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5442                 }
5443                 flags &= ~SCF_DO_STCLASS;
5444             }
5445             min++;
5446             if (delta != SSize_t_MAX)
5447                 delta++;    /* Because of the 2 char string cr-lf */
5448             if (flags & SCF_DO_SUBSTR) {
5449                 /* Cannot expect anything... */
5450                 scan_commit(pRExC_state, data, minlenp, is_inf);
5451                 data->pos_min += 1;
5452                 data->pos_delta += 1;
5453                 data->cur_is_floating = 1; /* float */
5454             }
5455         }
5456         else if (REGNODE_SIMPLE(OP(scan))) {
5457
5458             if (flags & SCF_DO_SUBSTR) {
5459                 scan_commit(pRExC_state, data, minlenp, is_inf);
5460                 data->pos_min++;
5461             }
5462             min++;
5463             if (flags & SCF_DO_STCLASS) {
5464                 bool invert = 0;
5465                 SV* my_invlist = NULL;
5466                 U8 namedclass;
5467
5468                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5469                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5470
5471                 /* Some of the logic below assumes that switching
5472                    locale on will only add false positives. */
5473                 switch (OP(scan)) {
5474
5475                 default:
5476 #ifdef DEBUGGING
5477                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5478                                                                      OP(scan));
5479 #endif
5480                 case SANY:
5481                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5482                         ssc_match_all_cp(data->start_class);
5483                     break;
5484
5485                 case REG_ANY:
5486                     {
5487                         SV* REG_ANY_invlist = _new_invlist(2);
5488                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5489                                                             '\n');
5490                         if (flags & SCF_DO_STCLASS_OR) {
5491                             ssc_union(data->start_class,
5492                                       REG_ANY_invlist,
5493                                       TRUE /* TRUE => invert, hence all but \n
5494                                             */
5495                                       );
5496                         }
5497                         else if (flags & SCF_DO_STCLASS_AND) {
5498                             ssc_intersection(data->start_class,
5499                                              REG_ANY_invlist,
5500                                              TRUE  /* TRUE => invert */
5501                                              );
5502                             ssc_clear_locale(data->start_class);
5503                         }
5504                         SvREFCNT_dec_NN(REG_ANY_invlist);
5505                     }
5506                     break;
5507
5508                 case ANYOFD:
5509                 case ANYOFL:
5510                 case ANYOF:
5511                     if (flags & SCF_DO_STCLASS_AND)
5512                         ssc_and(pRExC_state, data->start_class,
5513                                 (regnode_charclass *) scan);
5514                     else
5515                         ssc_or(pRExC_state, data->start_class,
5516                                                           (regnode_charclass *) scan);
5517                     break;
5518
5519                 case NPOSIXL:
5520                     invert = 1;
5521                     /* FALLTHROUGH */
5522
5523                 case POSIXL:
5524                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5525                     if (flags & SCF_DO_STCLASS_AND) {
5526                         bool was_there = cBOOL(
5527                                           ANYOF_POSIXL_TEST(data->start_class,
5528                                                                  namedclass));
5529                         ANYOF_POSIXL_ZERO(data->start_class);
5530                         if (was_there) {    /* Do an AND */
5531                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5532                         }
5533                         /* No individual code points can now match */
5534                         data->start_class->invlist
5535                                                 = sv_2mortal(_new_invlist(0));
5536                     }
5537                     else {
5538                         int complement = namedclass + ((invert) ? -1 : 1);
5539
5540                         assert(flags & SCF_DO_STCLASS_OR);
5541
5542                         /* If the complement of this class was already there,
5543                          * the result is that they match all code points,
5544                          * (\d + \D == everything).  Remove the classes from
5545                          * future consideration.  Locale is not relevant in
5546                          * this case */
5547                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5548                             ssc_match_all_cp(data->start_class);
5549                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5550                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5551                         }
5552                         else {  /* The usual case; just add this class to the
5553                                    existing set */
5554                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5555                         }
5556                     }
5557                     break;
5558
5559                 case NASCII:
5560                     invert = 1;
5561                     /* FALLTHROUGH */
5562                 case ASCII:
5563                     my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5564
5565                     /* This can be handled as a Posix class */
5566                     goto join_posix_and_ascii;
5567
5568                 case NPOSIXA:   /* For these, we always know the exact set of
5569                                    what's matched */
5570                     invert = 1;
5571                     /* FALLTHROUGH */
5572                 case POSIXA:
5573                     assert(FLAGS(scan) != _CC_ASCII);
5574                     _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5575                                           PL_XPosix_ptrs[_CC_ASCII],
5576                                           &my_invlist);
5577                     goto join_posix_and_ascii;
5578
5579                 case NPOSIXD:
5580                 case NPOSIXU:
5581                     invert = 1;
5582                     /* FALLTHROUGH */
5583                 case POSIXD:
5584                 case POSIXU:
5585                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5586
5587                     /* NPOSIXD matches all upper Latin1 code points unless the
5588                      * target string being matched is UTF-8, which is
5589                      * unknowable until match time.  Since we are going to
5590                      * invert, we want to get rid of all of them so that the
5591                      * inversion will match all */
5592                     if (OP(scan) == NPOSIXD) {
5593                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5594                                           &my_invlist);
5595                     }
5596
5597                   join_posix_and_ascii:
5598
5599                     if (flags & SCF_DO_STCLASS_AND) {
5600                         ssc_intersection(data->start_class, my_invlist, invert);
5601                         ssc_clear_locale(data->start_class);
5602                     }
5603                     else {
5604                         assert(flags & SCF_DO_STCLASS_OR);
5605                         ssc_union(data->start_class, my_invlist, invert);
5606                     }
5607                     SvREFCNT_dec(my_invlist);
5608                 }
5609                 if (flags & SCF_DO_STCLASS_OR)
5610                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5611                 flags &= ~SCF_DO_STCLASS;
5612             }
5613         }
5614         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5615             data->flags |= (OP(scan) == MEOL
5616                             ? SF_BEFORE_MEOL
5617                             : SF_BEFORE_SEOL);
5618             scan_commit(pRExC_state, data, minlenp, is_inf);
5619
5620         }
5621         else if (  PL_regkind[OP(scan)] == BRANCHJ
5622                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5623                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5624                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5625         {
5626             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5627                 || OP(scan) == UNLESSM )
5628             {
5629                 /* Negative Lookahead/lookbehind
5630                    In this case we can't do fixed string optimisation.
5631                 */
5632
5633                 SSize_t deltanext, minnext, fake = 0;
5634                 regnode *nscan;
5635                 regnode_ssc intrnl;
5636                 int f = 0;
5637
5638                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5639                 if (data) {
5640                     data_fake.whilem_c = data->whilem_c;
5641                     data_fake.last_closep = data->last_closep;
5642                 }
5643                 else
5644                     data_fake.last_closep = &fake;
5645                 data_fake.pos_delta = delta;
5646                 if ( flags & SCF_DO_STCLASS && !scan->flags
5647                      && OP(scan) == IFMATCH ) { /* Lookahead */
5648                     ssc_init(pRExC_state, &intrnl);
5649                     data_fake.start_class = &intrnl;
5650                     f |= SCF_DO_STCLASS_AND;
5651                 }
5652                 if (flags & SCF_WHILEM_VISITED_POS)
5653                     f |= SCF_WHILEM_VISITED_POS;
5654                 next = regnext(scan);
5655                 nscan = NEXTOPER(NEXTOPER(scan));
5656
5657                 /* recurse study_chunk() for lookahead body */
5658                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5659                                       last, &data_fake, stopparen,
5660                                       recursed_depth, NULL, f, depth+1);
5661                 if (scan->flags) {
5662                     if (deltanext) {
5663                         FAIL("Variable length lookbehind not implemented");
5664                     }
5665                     else if (minnext > (I32)U8_MAX) {
5666                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
5667                               (UV)U8_MAX);
5668                     }
5669                     scan->flags = (U8)minnext;
5670                 }
5671                 if (data) {
5672                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5673                         pars++;
5674                     if (data_fake.flags & SF_HAS_EVAL)
5675                         data->flags |= SF_HAS_EVAL;
5676                     data->whilem_c = data_fake.whilem_c;
5677                 }
5678                 if (f & SCF_DO_STCLASS_AND) {
5679                     if (flags & SCF_DO_STCLASS_OR) {
5680                         /* OR before, AND after: ideally we would recurse with
5681                          * data_fake to get the AND applied by study of the
5682                          * remainder of the pattern, and then derecurse;
5683                          * *** HACK *** for now just treat as "no information".
5684                          * See [perl #56690].
5685                          */
5686                         ssc_init(pRExC_state, data->start_class);
5687                     }  else {
5688                         /* AND before and after: combine and continue.  These
5689                          * assertions are zero-length, so can match an EMPTY
5690                          * string */
5691                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5692                         ANYOF_FLAGS(data->start_class)
5693                                                    |= SSC_MATCHES_EMPTY_STRING;
5694                     }
5695                 }
5696             }
5697 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5698             else {
5699                 /* Positive Lookahead/lookbehind
5700                    In this case we can do fixed string optimisation,
5701                    but we must be careful about it. Note in the case of
5702                    lookbehind the positions will be offset by the minimum
5703                    length of the pattern, something we won't know about
5704                    until after the recurse.
5705                 */
5706                 SSize_t deltanext, fake = 0;
5707                 regnode *nscan;
5708                 regnode_ssc intrnl;
5709                 int f = 0;
5710                 /* We use SAVEFREEPV so that when the full compile
5711                     is finished perl will clean up the allocated
5712                     minlens when it's all done. This way we don't
5713                     have to worry about freeing them when we know
5714                     they wont be used, which would be a pain.
5715                  */
5716                 SSize_t *minnextp;
5717                 Newx( minnextp, 1, SSize_t );
5718                 SAVEFREEPV(minnextp);
5719
5720                 if (data) {
5721                     StructCopy(data, &data_fake, scan_data_t);
5722                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5723                         f |= SCF_DO_SUBSTR;
5724                         if (scan->flags)
5725                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5726                         data_fake.last_found=newSVsv(data->last_found);
5727                     }
5728                 }
5729                 else
5730                     data_fake.last_closep = &fake;
5731                 data_fake.flags = 0;
5732                 data_fake.substrs[0].flags = 0;
5733                 data_fake.substrs[1].flags = 0;
5734                 data_fake.pos_delta = delta;
5735                 if (is_inf)
5736                     data_fake.flags |= SF_IS_INF;
5737                 if ( flags & SCF_DO_STCLASS && !scan->flags
5738                      && OP(scan) == IFMATCH ) { /* Lookahead */
5739                     ssc_init(pRExC_state, &intrnl);
5740                     data_fake.start_class = &intrnl;
5741                     f |= SCF_DO_STCLASS_AND;
5742                 }
5743                 if (flags & SCF_WHILEM_VISITED_POS)
5744                     f |= SCF_WHILEM_VISITED_POS;
5745                 next = regnext(scan);
5746                 nscan = NEXTOPER(NEXTOPER(scan));
5747
5748                 /* positive lookahead study_chunk() recursion */
5749                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5750                                         &deltanext, last, &data_fake,
5751                                         stopparen, recursed_depth, NULL,
5752                                         f,depth+1);
5753                 if (scan->flags) {
5754                     if (deltanext) {
5755                         FAIL("Variable length lookbehind not implemented");
5756                     }
5757                     else if (*minnextp > (I32)U8_MAX) {
5758                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
5759                               (UV)U8_MAX);
5760                     }
5761                     scan->flags = (U8)*minnextp;
5762                 }
5763
5764                 *minnextp += min;
5765
5766                 if (f & SCF_DO_STCLASS_AND) {
5767                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5768                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5769                 }
5770                 if (data) {
5771                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5772                         pars++;
5773                     if (data_fake.flags & SF_HAS_EVAL)
5774                         data->flags |= SF_HAS_EVAL;
5775                     data->whilem_c = data_fake.whilem_c;
5776                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5777                         int i;
5778                         if (RExC_rx->minlen<*minnextp)
5779                             RExC_rx->minlen=*minnextp;
5780                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5781                         SvREFCNT_dec_NN(data_fake.last_found);
5782
5783                         for (i = 0; i < 2; i++) {
5784                             if (data_fake.substrs[i].minlenp != minlenp) {
5785                                 data->substrs[i].min_offset =
5786                                             data_fake.substrs[i].min_offset;
5787                                 data->substrs[i].max_offset =
5788                                             data_fake.substrs[i].max_offset;
5789                                 data->substrs[i].minlenp =
5790                                             data_fake.substrs[i].minlenp;
5791                                 data->substrs[i].lookbehind += scan->flags;
5792                             }
5793                         }
5794                     }
5795                 }
5796             }
5797 #endif
5798         }
5799
5800         else if (OP(scan) == OPEN) {
5801             if (stopparen != (I32)ARG(scan))
5802                 pars++;
5803         }
5804         else if (OP(scan) == CLOSE) {
5805             if (stopparen == (I32)ARG(scan)) {
5806                 break;
5807             }
5808             if ((I32)ARG(scan) == is_par) {
5809                 next = regnext(scan);
5810
5811                 if ( next && (OP(next) != WHILEM) && next < last)
5812                     is_par = 0;         /* Disable optimization */
5813             }
5814             if (data)
5815                 *(data->last_closep) = ARG(scan);
5816         }
5817         else if (OP(scan) == EVAL) {
5818                 if (data)
5819                     data->flags |= SF_HAS_EVAL;
5820         }
5821         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5822             if (flags & SCF_DO_SUBSTR) {
5823                 scan_commit(pRExC_state, data, minlenp, is_inf);
5824                 flags &= ~SCF_DO_SUBSTR;
5825             }
5826             if (data && OP(scan)==ACCEPT) {
5827                 data->flags |= SCF_SEEN_ACCEPT;
5828                 if (stopmin > min)
5829                     stopmin = min;
5830             }
5831         }
5832         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5833         {
5834                 if (flags & SCF_DO_SUBSTR) {
5835                     scan_commit(pRExC_state, data, minlenp, is_inf);
5836                     data->cur_is_floating = 1; /* float */
5837                 }
5838                 is_inf = is_inf_internal = 1;
5839                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5840                     ssc_anything(data->start_class);
5841                 flags &= ~SCF_DO_STCLASS;
5842         }
5843         else if (OP(scan) == GPOS) {
5844             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5845                 !(delta || is_inf || (data && data->pos_delta)))
5846             {
5847                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5848                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5849                 if (RExC_rx->gofs < (STRLEN)min)
5850                     RExC_rx->gofs = min;
5851             } else {
5852                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5853                 RExC_rx->gofs = 0;
5854             }
5855         }
5856 #ifdef TRIE_STUDY_OPT
5857 #ifdef FULL_TRIE_STUDY
5858         else if (PL_regkind[OP(scan)] == TRIE) {
5859             /* NOTE - There is similar code to this block above for handling
5860                BRANCH nodes on the initial study.  If you change stuff here
5861                check there too. */
5862             regnode *trie_node= scan;
5863             regnode *tail= regnext(scan);
5864             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5865             SSize_t max1 = 0, min1 = SSize_t_MAX;
5866             regnode_ssc accum;
5867
5868             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5869                 /* Cannot merge strings after this. */
5870                 scan_commit(pRExC_state, data, minlenp, is_inf);
5871             }
5872             if (flags & SCF_DO_STCLASS)
5873                 ssc_init_zero(pRExC_state, &accum);
5874
5875             if (!trie->jump) {
5876                 min1= trie->minlen;
5877                 max1= trie->maxlen;
5878             } else {
5879                 const regnode *nextbranch= NULL;
5880                 U32 word;
5881
5882                 for ( word=1 ; word <= trie->wordcount ; word++)
5883                 {
5884                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5885                     regnode_ssc this_class;
5886
5887                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5888                     if (data) {
5889                         data_fake.whilem_c = data->whilem_c;
5890                         data_fake.last_closep = data->last_closep;
5891                     }
5892                     else
5893                         data_fake.last_closep = &fake;
5894                     data_fake.pos_delta = delta;
5895                     if (flags & SCF_DO_STCLASS) {
5896                         ssc_init(pRExC_state, &this_class);
5897                         data_fake.start_class = &this_class;
5898                         f = SCF_DO_STCLASS_AND;
5899                     }
5900                     if (flags & SCF_WHILEM_VISITED_POS)
5901                         f |= SCF_WHILEM_VISITED_POS;
5902
5903                     if (trie->jump[word]) {
5904                         if (!nextbranch)
5905                             nextbranch = trie_node + trie->jump[0];
5906                         scan= trie_node + trie->jump[word];
5907                         /* We go from the jump point to the branch that follows
5908                            it. Note this means we need the vestigal unused
5909                            branches even though they arent otherwise used. */
5910                         /* optimise study_chunk() for TRIE */
5911                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5912                             &deltanext, (regnode *)nextbranch, &data_fake,
5913                             stopparen, recursed_depth, NULL, f,depth+1);
5914                     }
5915                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5916                         nextbranch= regnext((regnode*)nextbranch);
5917
5918                     if (min1 > (SSize_t)(minnext + trie->minlen))
5919                         min1 = minnext + trie->minlen;
5920                     if (deltanext == SSize_t_MAX) {
5921                         is_inf = is_inf_internal = 1;
5922                         max1 = SSize_t_MAX;
5923                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5924                         max1 = minnext + deltanext + trie->maxlen;
5925
5926                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5927                         pars++;
5928                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5929                         if ( stopmin > min + min1)
5930                             stopmin = min + min1;
5931                         flags &= ~SCF_DO_SUBSTR;
5932                         if (data)
5933                             data->flags |= SCF_SEEN_ACCEPT;
5934                     }
5935                     if (data) {
5936                         if (data_fake.flags & SF_HAS_EVAL)
5937                             data->flags |= SF_HAS_EVAL;
5938                         data->whilem_c = data_fake.whilem_c;
5939                     }
5940                     if (flags & SCF_DO_STCLASS)
5941                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5942                 }
5943             }
5944             if (flags & SCF_DO_SUBSTR) {
5945                 data->pos_min += min1;
5946                 data->pos_delta += max1 - min1;
5947                 if (max1 != min1 || is_inf)
5948                     data->cur_is_floating = 1; /* float */
5949             }
5950             min += min1;
5951             if (delta != SSize_t_MAX) {
5952                 if (SSize_t_MAX - (max1 - min1) >= delta)
5953                     delta += max1 - min1;
5954                 else
5955                     delta = SSize_t_MAX;
5956             }
5957             if (flags & SCF_DO_STCLASS_OR) {
5958                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5959                 if (min1) {
5960                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5961                     flags &= ~SCF_DO_STCLASS;
5962                 }
5963             }
5964             else if (flags & SCF_DO_STCLASS_AND) {
5965                 if (min1) {
5966                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5967                     flags &= ~SCF_DO_STCLASS;
5968                 }
5969                 else {
5970                     /* Switch to OR mode: cache the old value of
5971                      * data->start_class */
5972                     INIT_AND_WITHP;
5973                     StructCopy(data->start_class, and_withp, regnode_ssc);
5974                     flags &= ~SCF_DO_STCLASS_AND;
5975                     StructCopy(&accum, data->start_class, regnode_ssc);
5976                     flags |= SCF_DO_STCLASS_OR;
5977                 }
5978             }
5979             scan= tail;
5980             continue;
5981         }
5982 #else
5983         else if (PL_regkind[OP(scan)] == TRIE) {
5984             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5985             U8*bang=NULL;
5986
5987             min += trie->minlen;
5988             delta += (trie->maxlen - trie->minlen);
5989             flags &= ~SCF_DO_STCLASS; /* xxx */
5990             if (flags & SCF_DO_SUBSTR) {
5991                 /* Cannot expect anything... */
5992                 scan_commit(pRExC_state, data, minlenp, is_inf);
5993                 data->pos_min += trie->minlen;
5994                 data->pos_delta += (trie->maxlen - trie->minlen);
5995                 if (trie->maxlen != trie->minlen)
5996                     data->cur_is_floating = 1; /* float */
5997             }
5998             if (trie->jump) /* no more substrings -- for now /grr*/
5999                flags &= ~SCF_DO_SUBSTR;
6000         }
6001 #endif /* old or new */
6002 #endif /* TRIE_STUDY_OPT */
6003
6004         /* Else: zero-length, ignore. */
6005         scan = regnext(scan);
6006     }
6007
6008   finish:
6009     if (frame) {
6010         /* we need to unwind recursion. */
6011         depth = depth - 1;
6012
6013         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6014         DEBUG_PEEP("fend", scan, depth, flags);
6015
6016         /* restore previous context */
6017         last = frame->last_regnode;
6018         scan = frame->next_regnode;
6019         stopparen = frame->stopparen;
6020         recursed_depth = frame->prev_recursed_depth;
6021
6022         RExC_frame_last = frame->prev_frame;
6023         frame = frame->this_prev_frame;
6024         goto fake_study_recurse;
6025     }
6026
6027     assert(!frame);
6028     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6029
6030     *scanp = scan;
6031     *deltap = is_inf_internal ? SSize_t_MAX : delta;
6032
6033     if (flags & SCF_DO_SUBSTR && is_inf)
6034         data->pos_delta = SSize_t_MAX - data->pos_min;
6035     if (is_par > (I32)U8_MAX)
6036         is_par = 0;
6037     if (is_par && pars==1 && data) {
6038         data->flags |= SF_IN_PAR;
6039         data->flags &= ~SF_HAS_PAR;
6040     }
6041     else if (pars && data) {
6042         data->flags |= SF_HAS_PAR;
6043         data->flags &= ~SF_IN_PAR;
6044     }
6045     if (flags & SCF_DO_STCLASS_OR)
6046         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6047     if (flags & SCF_TRIE_RESTUDY)
6048         data->flags |=  SCF_TRIE_RESTUDY;
6049
6050     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6051
6052     {
6053         SSize_t final_minlen= min < stopmin ? min : stopmin;
6054
6055         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6056             if (final_minlen > SSize_t_MAX - delta)
6057                 RExC_maxlen = SSize_t_MAX;
6058             else if (RExC_maxlen < final_minlen + delta)
6059                 RExC_maxlen = final_minlen + delta;
6060         }
6061         return final_minlen;
6062     }
6063     NOT_REACHED; /* NOTREACHED */
6064 }
6065
6066 STATIC U32
6067 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6068 {
6069     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6070
6071     PERL_ARGS_ASSERT_ADD_DATA;
6072
6073     Renewc(RExC_rxi->data,
6074            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6075            char, struct reg_data);
6076     if(count)
6077         Renew(RExC_rxi->data->what, count + n, U8);
6078     else
6079         Newx(RExC_rxi->data->what, n, U8);
6080     RExC_rxi->data->count = count + n;
6081     Copy(s, RExC_rxi->data->what + count, n, U8);
6082     return count;
6083 }
6084
6085 /*XXX: todo make this not included in a non debugging perl, but appears to be
6086  * used anyway there, in 'use re' */
6087 #ifndef PERL_IN_XSUB_RE
6088 void
6089 Perl_reginitcolors(pTHX)
6090 {
6091     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6092     if (s) {
6093         char *t = savepv(s);
6094         int i = 0;
6095         PL_colors[0] = t;
6096         while (++i < 6) {
6097             t = strchr(t, '\t');
6098             if (t) {
6099                 *t = '\0';
6100                 PL_colors[i] = ++t;
6101             }
6102             else
6103                 PL_colors[i] = t = (char *)"";
6104         }
6105     } else {
6106         int i = 0;
6107         while (i < 6)
6108             PL_colors[i++] = (char *)"";
6109     }
6110     PL_colorset = 1;
6111 }
6112 #endif
6113
6114
6115 #ifdef TRIE_STUDY_OPT
6116 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6117     STMT_START {                                            \
6118         if (                                                \
6119               (data.flags & SCF_TRIE_RESTUDY)               \
6120               && ! restudied++                              \
6121         ) {                                                 \
6122             dOsomething;                                    \
6123             goto reStudy;                                   \
6124         }                                                   \
6125     } STMT_END
6126 #else
6127 #define CHECK_RESTUDY_GOTO_butfirst
6128 #endif
6129
6130 /*
6131  * pregcomp - compile a regular expression into internal code
6132  *
6133  * Decides which engine's compiler to call based on the hint currently in
6134  * scope
6135  */
6136
6137 #ifndef PERL_IN_XSUB_RE
6138
6139 /* return the currently in-scope regex engine (or the default if none)  */
6140
6141 regexp_engine const *
6142 Perl_current_re_engine(pTHX)
6143 {
6144     if (IN_PERL_COMPILETIME) {
6145         HV * const table = GvHV(PL_hintgv);
6146         SV **ptr;
6147
6148         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6149             return &PL_core_reg_engine;
6150         ptr = hv_fetchs(table, "regcomp", FALSE);
6151         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6152             return &PL_core_reg_engine;
6153         return INT2PTR(regexp_engine*,SvIV(*ptr));
6154     }
6155     else {
6156         SV *ptr;
6157         if (!PL_curcop->cop_hints_hash)
6158             return &PL_core_reg_engine;
6159         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6160         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6161             return &PL_core_reg_engine;
6162         return INT2PTR(regexp_engine*,SvIV(ptr));
6163     }
6164 }
6165
6166
6167 REGEXP *
6168 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6169 {
6170     regexp_engine const *eng = current_re_engine();
6171     GET_RE_DEBUG_FLAGS_DECL;
6172
6173     PERL_ARGS_ASSERT_PREGCOMP;
6174
6175     /* Dispatch a request to compile a regexp to correct regexp engine. */
6176     DEBUG_COMPILE_r({
6177         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6178                         PTR2UV(eng));
6179     });
6180     return CALLREGCOMP_ENG(eng, pattern, flags);
6181 }
6182 #endif
6183
6184 /* public(ish) entry point for the perl core's own regex compiling code.
6185  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6186  * pattern rather than a list of OPs, and uses the internal engine rather
6187  * than the current one */
6188
6189 REGEXP *
6190 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6191 {
6192     SV *pat = pattern; /* defeat constness! */
6193     PERL_ARGS_ASSERT_RE_COMPILE;
6194     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6195 #ifdef PERL_IN_XSUB_RE
6196                                 &my_reg_engine,
6197 #else
6198                                 &PL_core_reg_engine,
6199 #endif
6200                                 NULL, NULL, rx_flags, 0);
6201 }
6202
6203
6204 static void
6205 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6206 {
6207     int n;
6208
6209     if (--cbs->refcnt > 0)
6210         return;
6211     for (n = 0; n < cbs->count; n++) {
6212         REGEXP *rx = cbs->cb[n].src_regex;
6213         cbs->cb[n].src_regex = NULL;
6214         SvREFCNT_dec(rx);
6215     }
6216     Safefree(cbs->cb);
6217     Safefree(cbs);
6218 }
6219
6220
6221 static struct reg_code_blocks *
6222 S_alloc_code_blocks(pTHX_  int ncode)
6223 {
6224      struct reg_code_blocks *cbs;
6225     Newx(cbs, 1, struct reg_code_blocks);
6226     cbs->count = ncode;
6227     cbs->refcnt = 1;
6228     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6229     if (ncode)
6230         Newx(cbs->cb, ncode, struct reg_code_block);
6231     else
6232         cbs->cb = NULL;
6233     return cbs;
6234 }
6235
6236
6237 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6238  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6239  * point to the realloced string and length.
6240  *
6241  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6242  * stuff added */
6243
6244 static void
6245 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6246                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
6247 {
6248     U8 *const src = (U8*)*pat_p;
6249     U8 *dst, *d;
6250     int n=0;
6251     STRLEN s = 0;
6252     bool do_end = 0;
6253     GET_RE_DEBUG_FLAGS_DECL;
6254
6255     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6256         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6257
6258     Newx(dst, *plen_p * 2 + 1, U8);
6259     d = dst;
6260
6261     while (s < *plen_p) {
6262         append_utf8_from_native_byte(src[s], &d);
6263
6264         if (n < num_code_blocks) {
6265             assert(pRExC_state->code_blocks);
6266             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6267                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6268                 assert(*(d - 1) == '(');
6269                 do_end = 1;
6270             }
6271             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6272                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6273                 assert(*(d - 1) == ')');
6274                 do_end = 0;
6275                 n++;
6276             }
6277         }
6278         s++;
6279     }
6280     *d = '\0';
6281     *plen_p = d - dst;
6282     *pat_p = (char*) dst;
6283     SAVEFREEPV(*pat_p);
6284     RExC_orig_utf8 = RExC_utf8 = 1;
6285 }
6286
6287
6288
6289 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6290  * while recording any code block indices, and handling overloading,
6291  * nested qr// objects etc.  If pat is null, it will allocate a new
6292  * string, or just return the first arg, if there's only one.
6293  *
6294  * Returns the malloced/updated pat.
6295  * patternp and pat_count is the array of SVs to be concatted;
6296  * oplist is the optional list of ops that generated the SVs;
6297  * recompile_p is a pointer to a boolean that will be set if
6298  *   the regex will need to be recompiled.
6299  * delim, if non-null is an SV that will be inserted between each element
6300  */
6301
6302 static SV*
6303 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6304                 SV *pat, SV ** const patternp, int pat_count,
6305                 OP *oplist, bool *recompile_p, SV *delim)
6306 {
6307     SV **svp;
6308     int n = 0;
6309     bool use_delim = FALSE;
6310     bool alloced = FALSE;
6311
6312     /* if we know we have at least two args, create an empty string,
6313      * then concatenate args to that. For no args, return an empty string */
6314     if (!pat && pat_count != 1) {
6315         pat = newSVpvs("");
6316         SAVEFREESV(pat);
6317         alloced = TRUE;
6318     }
6319
6320     for (svp = patternp; svp < patternp + pat_count; svp++) {
6321         SV *sv;
6322         SV *rx  = NULL;
6323         STRLEN orig_patlen = 0;
6324         bool code = 0;
6325         SV *msv = use_delim ? delim : *svp;
6326         if (!msv) msv = &PL_sv_undef;
6327
6328         /* if we've got a delimiter, we go round the loop twice for each
6329          * svp slot (except the last), using the delimiter the second
6330          * time round */
6331         if (use_delim) {
6332             svp--;
6333             use_delim = FALSE;
6334         }
6335         else if (delim)
6336             use_delim = TRUE;
6337
6338         if (SvTYPE(msv) == SVt_PVAV) {
6339             /* we've encountered an interpolated array within
6340              * the pattern, e.g. /...@a..../. Expand the list of elements,
6341              * then recursively append elements.
6342              * The code in this block is based on S_pushav() */
6343
6344             AV *const av = (AV*)msv;
6345             const SSize_t maxarg = AvFILL(av) + 1;
6346             SV **array;
6347
6348             if (oplist) {
6349                 assert(oplist->op_type == OP_PADAV
6350                     || oplist->op_type == OP_RV2AV);
6351                 oplist = OpSIBLING(oplist);
6352             }
6353
6354             if (SvRMAGICAL(av)) {
6355                 SSize_t i;
6356
6357                 Newx(array, maxarg, SV*);
6358                 SAVEFREEPV(array);
6359                 for (i=0; i < maxarg; i++) {
6360                     SV ** const svp = av_fetch(av, i, FALSE);
6361                     array[i] = svp ? *svp : &PL_sv_undef;
6362                 }
6363             }
6364             else
6365                 array = AvARRAY(av);
6366
6367             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6368                                 array, maxarg, NULL, recompile_p,
6369                                 /* $" */
6370                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6371
6372             continue;
6373         }
6374
6375
6376         /* we make the assumption here that each op in the list of
6377          * op_siblings maps to one SV pushed onto the stack,
6378          * except for code blocks, with have both an OP_NULL and
6379          * and OP_CONST.
6380          * This allows us to match up the list of SVs against the
6381          * list of OPs to find the next code block.
6382          *
6383          * Note that       PUSHMARK PADSV PADSV ..
6384          * is optimised to
6385          *                 PADRANGE PADSV  PADSV  ..
6386          * so the alignment still works. */
6387
6388         if (oplist) {
6389             if (oplist->op_type == OP_NULL
6390                 && (oplist->op_flags & OPf_SPECIAL))
6391             {
6392                 assert(n < pRExC_state->code_blocks->count);
6393                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6394                 pRExC_state->code_blocks->cb[n].block = oplist;
6395                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6396                 n++;
6397                 code = 1;
6398                 oplist = OpSIBLING(oplist); /* skip CONST */
6399                 assert(oplist);
6400             }
6401             oplist = OpSIBLING(oplist);;
6402         }
6403
6404         /* apply magic and QR overloading to arg */
6405
6406         SvGETMAGIC(msv);
6407         if (SvROK(msv) && SvAMAGIC(msv)) {
6408             SV *sv = AMG_CALLunary(msv, regexp_amg);
6409             if (sv) {
6410                 if (SvROK(sv))
6411                     sv = SvRV(sv);
6412                 if (SvTYPE(sv) != SVt_REGEXP)
6413                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6414                 msv = sv;
6415             }
6416         }
6417
6418         /* try concatenation overload ... */
6419         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6420                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6421         {
6422             sv_setsv(pat, sv);
6423             /* overloading involved: all bets are off over literal
6424              * code. Pretend we haven't seen it */
6425             if (n)
6426                 pRExC_state->code_blocks->count -= n;
6427             n = 0;
6428         }
6429         else  {
6430             /* ... or failing that, try "" overload */
6431             while (SvAMAGIC(msv)
6432                     && (sv = AMG_CALLunary(msv, string_amg))
6433                     && sv != msv
6434                     &&  !(   SvROK(msv)
6435                           && SvROK(sv)
6436                           && SvRV(msv) == SvRV(sv))
6437             ) {
6438                 msv = sv;
6439                 SvGETMAGIC(msv);
6440             }
6441             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6442                 msv = SvRV(msv);
6443
6444             if (pat) {
6445                 /* this is a partially unrolled
6446                  *     sv_catsv_nomg(pat, msv);
6447                  * that allows us to adjust code block indices if
6448                  * needed */
6449                 STRLEN dlen;
6450                 char *dst = SvPV_force_nomg(pat, dlen);
6451                 orig_patlen = dlen;
6452                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6453                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6454                     sv_setpvn(pat, dst, dlen);
6455                     SvUTF8_on(pat);
6456                 }
6457                 sv_catsv_nomg(pat, msv);
6458                 rx = msv;
6459             }
6460             else {
6461                 /* We have only one SV to process, but we need to verify
6462                  * it is properly null terminated or we will fail asserts
6463                  * later. In theory we probably shouldn't get such SV's,
6464                  * but if we do we should handle it gracefully. */
6465                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) ) {
6466                     /* not a string, or a string with a trailing null */
6467                     pat = msv;
6468                 } else {
6469                     /* a string with no trailing null, we need to copy it
6470                      * so it we have a trailing null */
6471                     pat = newSVsv(msv);
6472                 }
6473             }
6474
6475             if (code)
6476                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6477         }
6478
6479         /* extract any code blocks within any embedded qr//'s */
6480         if (rx && SvTYPE(rx) == SVt_REGEXP
6481             && RX_ENGINE((REGEXP*)rx)->op_comp)
6482         {
6483
6484             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6485             if (ri->code_blocks && ri->code_blocks->count) {
6486                 int i;
6487                 /* the presence of an embedded qr// with code means
6488                  * we should always recompile: the text of the
6489                  * qr// may not have changed, but it may be a
6490                  * different closure than last time */
6491                 *recompile_p = 1;
6492                 if (pRExC_state->code_blocks) {
6493                     int new_count = pRExC_state->code_blocks->count
6494                             + ri->code_blocks->count;
6495                     Renew(pRExC_state->code_blocks->cb,
6496                             new_count, struct reg_code_block);
6497                     pRExC_state->code_blocks->count = new_count;
6498                 }
6499                 else
6500                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6501                                                     ri->code_blocks->count);
6502
6503                 for (i=0; i < ri->code_blocks->count; i++) {
6504                     struct reg_code_block *src, *dst;
6505                     STRLEN offset =  orig_patlen
6506                         + ReANY((REGEXP *)rx)->pre_prefix;
6507                     assert(n < pRExC_state->code_blocks->count);
6508                     src = &ri->code_blocks->cb[i];
6509                     dst = &pRExC_state->code_blocks->cb[n];
6510                     dst->start      = src->start + offset;
6511                     dst->end        = src->end   + offset;
6512                     dst->block      = src->block;
6513                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6514                                             src->src_regex
6515                                                 ? src->src_regex
6516                                                 : (REGEXP*)rx);
6517                     n++;
6518                 }
6519             }
6520         }
6521     }
6522     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6523     if (alloced)
6524         SvSETMAGIC(pat);
6525
6526     return pat;
6527 }
6528
6529
6530
6531 /* see if there are any run-time code blocks in the pattern.
6532  * False positives are allowed */
6533
6534 static bool
6535 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6536                     char *pat, STRLEN plen)
6537 {
6538     int n = 0;
6539     STRLEN s;
6540
6541     PERL_UNUSED_CONTEXT;
6542
6543     for (s = 0; s < plen; s++) {
6544         if (   pRExC_state->code_blocks
6545             && n < pRExC_state->code_blocks->count
6546             && s == pRExC_state->code_blocks->cb[n].start)
6547         {
6548             s = pRExC_state->code_blocks->cb[n].end;
6549             n++;
6550             continue;
6551         }
6552         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6553          * positives here */
6554         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6555             (pat[s+2] == '{'
6556                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6557         )
6558             return 1;
6559     }
6560     return 0;
6561 }
6562
6563 /* Handle run-time code blocks. We will already have compiled any direct
6564  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6565  * copy of it, but with any literal code blocks blanked out and
6566  * appropriate chars escaped; then feed it into
6567  *
6568  *    eval "qr'modified_pattern'"
6569  *
6570  * For example,
6571  *
6572  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6573  *
6574  * becomes
6575  *
6576  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6577  *
6578  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6579  * and merge them with any code blocks of the original regexp.
6580  *
6581  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6582  * instead, just save the qr and return FALSE; this tells our caller that
6583  * the original pattern needs upgrading to utf8.
6584  */
6585
6586 static bool
6587 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6588     char *pat, STRLEN plen)
6589 {
6590     SV *qr;
6591
6592     GET_RE_DEBUG_FLAGS_DECL;
6593
6594     if (pRExC_state->runtime_code_qr) {
6595         /* this is the second time we've been called; this should
6596          * only happen if the main pattern got upgraded to utf8
6597          * during compilation; re-use the qr we compiled first time
6598          * round (which should be utf8 too)
6599          */
6600         qr = pRExC_state->runtime_code_qr;
6601         pRExC_state->runtime_code_qr = NULL;
6602         assert(RExC_utf8 && SvUTF8(qr));
6603     }
6604     else {
6605         int n = 0;
6606         STRLEN s;
6607         char *p, *newpat;
6608         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
6609         SV *sv, *qr_ref;
6610         dSP;
6611
6612         /* determine how many extra chars we need for ' and \ escaping */
6613         for (s = 0; s < plen; s++) {
6614             if (pat[s] == '\'' || pat[s] == '\\')
6615                 newlen++;
6616         }
6617
6618         Newx(newpat, newlen, char);
6619         p = newpat;
6620         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6621
6622         for (s = 0; s < plen; s++) {
6623             if (   pRExC_state->code_blocks
6624                 && n < pRExC_state->code_blocks->count
6625                 && s == pRExC_state->code_blocks->cb[n].start)
6626             {
6627                 /* blank out literal code block */
6628                 assert(pat[s] == '(');
6629                 while (s <= pRExC_state->code_blocks->cb[n].end) {
6630                     *p++ = '_';
6631                     s++;
6632                 }
6633                 s--;
6634                 n++;
6635                 continue;
6636             }
6637             if (pat[s] == '\'' || pat[s] == '\\')
6638                 *p++ = '\\';
6639             *p++ = pat[s];
6640         }
6641         *p++ = '\'';
6642         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
6643             *p++ = 'x';
6644             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
6645                 *p++ = 'x';
6646             }
6647         }
6648         *p++ = '\0';
6649         DEBUG_COMPILE_r({
6650             Perl_re_printf( aTHX_
6651                 "%sre-parsing pattern for runtime code:%s %s\n",
6652                 PL_colors[4],PL_colors[5],newpat);
6653         });
6654
6655         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6656         Safefree(newpat);
6657
6658         ENTER;
6659         SAVETMPS;
6660         save_re_context();
6661         PUSHSTACKi(PERLSI_REQUIRE);
6662         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6663          * parsing qr''; normally only q'' does this. It also alters
6664          * hints handling */
6665         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6666         SvREFCNT_dec_NN(sv);
6667         SPAGAIN;
6668         qr_ref = POPs;
6669         PUTBACK;
6670         {
6671             SV * const errsv = ERRSV;
6672             if (SvTRUE_NN(errsv))
6673                 /* use croak_sv ? */
6674                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
6675         }
6676         assert(SvROK(qr_ref));
6677         qr = SvRV(qr_ref);
6678         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6679         /* the leaving below frees the tmp qr_ref.
6680          * Give qr a life of its own */
6681         SvREFCNT_inc(qr);
6682         POPSTACK;
6683         FREETMPS;
6684         LEAVE;
6685
6686     }
6687
6688     if (!RExC_utf8 && SvUTF8(qr)) {
6689         /* first time through; the pattern got upgraded; save the
6690          * qr for the next time through */
6691         assert(!pRExC_state->runtime_code_qr);
6692         pRExC_state->runtime_code_qr = qr;
6693         return 0;
6694     }
6695
6696
6697     /* extract any code blocks within the returned qr//  */
6698
6699
6700     /* merge the main (r1) and run-time (r2) code blocks into one */
6701     {
6702         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6703         struct reg_code_block *new_block, *dst;
6704         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6705         int i1 = 0, i2 = 0;
6706         int r1c, r2c;
6707
6708         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
6709         {
6710             SvREFCNT_dec_NN(qr);
6711             return 1;
6712         }
6713
6714         if (!r1->code_blocks)
6715             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
6716
6717         r1c = r1->code_blocks->count;
6718         r2c = r2->code_blocks->count;
6719
6720         Newx(new_block, r1c + r2c, struct reg_code_block);
6721
6722         dst = new_block;
6723
6724         while (i1 < r1c || i2 < r2c) {
6725             struct reg_code_block *src;
6726             bool is_qr = 0;
6727
6728             if (i1 == r1c) {
6729                 src = &r2->code_blocks->cb[i2++];
6730                 is_qr = 1;
6731             }
6732             else if (i2 == r2c)
6733                 src = &r1->code_blocks->cb[i1++];
6734             else if (  r1->code_blocks->cb[i1].start
6735                      < r2->code_blocks->cb[i2].start)
6736             {
6737                 src = &r1->code_blocks->cb[i1++];
6738                 assert(src->end < r2->code_blocks->cb[i2].start);
6739             }
6740             else {
6741                 assert(  r1->code_blocks->cb[i1].start
6742                        > r2->code_blocks->cb[i2].start);
6743                 src = &r2->code_blocks->cb[i2++];
6744                 is_qr = 1;
6745                 assert(src->end < r1->code_blocks->cb[i1].start);
6746             }
6747
6748             assert(pat[src->start] == '(');
6749             assert(pat[src->end]   == ')');
6750             dst->start      = src->start;
6751             dst->end        = src->end;
6752             dst->block      = src->block;
6753             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6754                                     : src->src_regex;
6755             dst++;
6756         }
6757         r1->code_blocks->count += r2c;
6758         Safefree(r1->code_blocks->cb);
6759         r1->code_blocks->cb = new_block;
6760     }
6761
6762     SvREFCNT_dec_NN(qr);
6763     return 1;
6764 }
6765
6766
6767 STATIC bool
6768 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
6769                       struct reg_substr_datum  *rsd,
6770                       struct scan_data_substrs *sub,
6771                       STRLEN longest_length)
6772 {
6773     /* This is the common code for setting up the floating and fixed length
6774      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6775      * as to whether succeeded or not */
6776
6777     I32 t;
6778     SSize_t ml;
6779     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
6780     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
6781
6782     if (! (longest_length
6783            || (eol /* Can't have SEOL and MULTI */
6784                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6785           )
6786             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6787         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6788     {
6789         return FALSE;
6790     }
6791
6792     /* copy the information about the longest from the reg_scan_data
6793         over to the program. */
6794     if (SvUTF8(sub->str)) {
6795         rsd->substr      = NULL;
6796         rsd->utf8_substr = sub->str;
6797     } else {
6798         rsd->substr      = sub->str;
6799         rsd->utf8_substr = NULL;
6800     }
6801     /* end_shift is how many chars that must be matched that
6802         follow this item. We calculate it ahead of time as once the
6803         lookbehind offset is added in we lose the ability to correctly
6804         calculate it.*/
6805     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
6806     rsd->end_shift = ml - sub->min_offset
6807         - longest_length
6808             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
6809              * intead? - DAPM
6810             + (SvTAIL(sub->str) != 0)
6811             */
6812         + sub->lookbehind;
6813
6814     t = (eol/* Can't have SEOL and MULTI */
6815          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6816     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
6817
6818     return TRUE;
6819 }
6820
6821 /*
6822  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6823  * regular expression into internal code.
6824  * The pattern may be passed either as:
6825  *    a list of SVs (patternp plus pat_count)
6826  *    a list of OPs (expr)
6827  * If both are passed, the SV list is used, but the OP list indicates
6828  * which SVs are actually pre-compiled code blocks
6829  *
6830  * The SVs in the list have magic and qr overloading applied to them (and
6831  * the list may be modified in-place with replacement SVs in the latter
6832  * case).
6833  *
6834  * If the pattern hasn't changed from old_re, then old_re will be
6835  * returned.
6836  *
6837  * eng is the current engine. If that engine has an op_comp method, then
6838  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6839  * do the initial concatenation of arguments and pass on to the external
6840  * engine.
6841  *
6842  * If is_bare_re is not null, set it to a boolean indicating whether the
6843  * arg list reduced (after overloading) to a single bare regex which has
6844  * been returned (i.e. /$qr/).
6845  *
6846  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6847  *
6848  * pm_flags contains the PMf_* flags, typically based on those from the
6849  * pm_flags field of the related PMOP. Currently we're only interested in
6850  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6851  *
6852  * We can't allocate space until we know how big the compiled form will be,
6853  * but we can't compile it (and thus know how big it is) until we've got a
6854  * place to put the code.  So we cheat:  we compile it twice, once with code
6855  * generation turned off and size counting turned on, and once "for real".
6856  * This also means that we don't allocate space until we are sure that the
6857  * thing really will compile successfully, and we never have to move the
6858  * code and thus invalidate pointers into it.  (Note that it has to be in
6859  * one piece because free() must be able to free it all.) [NB: not true in perl]
6860  *
6861  * Beware that the optimization-preparation code in here knows about some
6862  * of the structure of the compiled regexp.  [I'll say.]
6863  */
6864
6865 REGEXP *
6866 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6867                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6868                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6869 {
6870     REGEXP *rx;
6871     struct regexp *r;
6872     regexp_internal *ri;
6873     STRLEN plen;
6874     char *exp;
6875     regnode *scan;
6876     I32 flags;
6877     SSize_t minlen = 0;
6878     U32 rx_flags;
6879     SV *pat;
6880     SV** new_patternp = patternp;
6881
6882     /* these are all flags - maybe they should be turned
6883      * into a single int with different bit masks */
6884     I32 sawlookahead = 0;
6885     I32 sawplus = 0;
6886     I32 sawopen = 0;
6887     I32 sawminmod = 0;
6888
6889     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6890     bool recompile = 0;
6891     bool runtime_code = 0;
6892     scan_data_t data;
6893     RExC_state_t RExC_state;
6894     RExC_state_t * const pRExC_state = &RExC_state;
6895 #ifdef TRIE_STUDY_OPT
6896     int restudied = 0;
6897     RExC_state_t copyRExC_state;
6898 #endif
6899     GET_RE_DEBUG_FLAGS_DECL;
6900
6901     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6902
6903     DEBUG_r(if (!PL_colorset) reginitcolors());
6904
6905     /* Initialize these here instead of as-needed, as is quick and avoids
6906      * having to test them each time otherwise */
6907     if (! PL_AboveLatin1) {
6908 #ifdef DEBUGGING
6909         char * dump_len_string;
6910 #endif
6911
6912         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6913         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6914         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6915         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6916         PL_HasMultiCharFold =
6917                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6918
6919         /* This is calculated here, because the Perl program that generates the
6920          * static global ones doesn't currently have access to
6921          * NUM_ANYOF_CODE_POINTS */
6922         PL_InBitmap = _new_invlist(2);
6923         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6924                                                     NUM_ANYOF_CODE_POINTS - 1);
6925 #ifdef DEBUGGING
6926         dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
6927         if (   ! dump_len_string
6928             || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
6929         {
6930             PL_dump_re_max_len = 60;    /* A reasonable default */
6931         }
6932 #endif
6933     }
6934
6935     pRExC_state->warn_text = NULL;
6936     pRExC_state->code_blocks = NULL;
6937
6938     if (is_bare_re)
6939         *is_bare_re = FALSE;
6940
6941     if (expr && (expr->op_type == OP_LIST ||
6942                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6943         /* allocate code_blocks if needed */
6944         OP *o;
6945         int ncode = 0;
6946
6947         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6948             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6949                 ncode++; /* count of DO blocks */
6950
6951         if (ncode)
6952             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
6953     }
6954
6955     if (!pat_count) {
6956         /* compile-time pattern with just OP_CONSTs and DO blocks */
6957
6958         int n;
6959         OP *o;
6960
6961         /* find how many CONSTs there are */
6962         assert(expr);
6963         n = 0;
6964         if (expr->op_type == OP_CONST)
6965             n = 1;
6966         else
6967             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6968                 if (o->op_type == OP_CONST)
6969                     n++;
6970             }
6971
6972         /* fake up an SV array */
6973
6974         assert(!new_patternp);
6975         Newx(new_patternp, n, SV*);
6976         SAVEFREEPV(new_patternp);
6977         pat_count = n;
6978
6979         n = 0;
6980         if (expr->op_type == OP_CONST)
6981             new_patternp[n] = cSVOPx_sv(expr);
6982         else
6983             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6984                 if (o->op_type == OP_CONST)
6985                     new_patternp[n++] = cSVOPo_sv;
6986             }
6987
6988     }
6989
6990     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6991         "Assembling pattern from %d elements%s\n", pat_count,
6992             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6993
6994     /* set expr to the first arg op */
6995
6996     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
6997          && expr->op_type != OP_CONST)
6998     {
6999             expr = cLISTOPx(expr)->op_first;
7000             assert(   expr->op_type == OP_PUSHMARK
7001                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7002                    || expr->op_type == OP_PADRANGE);
7003             expr = OpSIBLING(expr);
7004     }
7005
7006     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7007                         expr, &recompile, NULL);
7008
7009     /* handle bare (possibly after overloading) regex: foo =~ $re */
7010     {
7011         SV *re = pat;
7012         if (SvROK(re))
7013             re = SvRV(re);
7014         if (SvTYPE(re) == SVt_REGEXP) {
7015             if (is_bare_re)
7016                 *is_bare_re = TRUE;
7017             SvREFCNT_inc(re);
7018             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7019                 "Precompiled pattern%s\n",
7020                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7021
7022             return (REGEXP*)re;
7023         }
7024     }
7025
7026     exp = SvPV_nomg(pat, plen);
7027
7028     if (!eng->op_comp) {
7029         if ((SvUTF8(pat) && IN_BYTES)
7030                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7031         {
7032             /* make a temporary copy; either to convert to bytes,
7033              * or to avoid repeating get-magic / overloaded stringify */
7034             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7035                                         (IN_BYTES ? 0 : SvUTF8(pat)));
7036         }
7037         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7038     }
7039
7040     /* ignore the utf8ness if the pattern is 0 length */
7041     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7042
7043     RExC_uni_semantics = 0;
7044     RExC_seen_unfolded_sharp_s = 0;
7045     RExC_contains_locale = 0;
7046     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7047     RExC_in_script_run = 0;
7048     RExC_study_started = 0;
7049     pRExC_state->runtime_code_qr = NULL;
7050     RExC_frame_head= NULL;
7051     RExC_frame_last= NULL;
7052     RExC_frame_count= 0;
7053
7054     DEBUG_r({
7055         RExC_mysv1= sv_newmortal();
7056         RExC_mysv2= sv_newmortal();
7057     });
7058     DEBUG_COMPILE_r({
7059             SV *dsv= sv_newmortal();
7060             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7061             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7062                           PL_colors[4],PL_colors[5],s);
7063         });
7064
7065   redo_first_pass:
7066     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7067      * to utf8 */
7068
7069     if ((pm_flags & PMf_USE_RE_EVAL)
7070                 /* this second condition covers the non-regex literal case,
7071                  * i.e.  $foo =~ '(?{})'. */
7072                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7073     )
7074         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7075
7076     /* return old regex if pattern hasn't changed */
7077     /* XXX: note in the below we have to check the flags as well as the
7078      * pattern.
7079      *
7080      * Things get a touch tricky as we have to compare the utf8 flag
7081      * independently from the compile flags.  */
7082
7083     if (   old_re
7084         && !recompile
7085         && !!RX_UTF8(old_re) == !!RExC_utf8
7086         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7087         && RX_PRECOMP(old_re)
7088         && RX_PRELEN(old_re) == plen
7089         && memEQ(RX_PRECOMP(old_re), exp, plen)
7090         && !runtime_code /* with runtime code, always recompile */ )
7091     {
7092         return old_re;
7093     }
7094
7095     rx_flags = orig_rx_flags;
7096
7097     if (   initial_charset == REGEX_DEPENDS_CHARSET
7098         && (RExC_utf8 ||RExC_uni_semantics))
7099     {
7100
7101         /* Set to use unicode semantics if the pattern is in utf8 and has the
7102          * 'depends' charset specified, as it means unicode when utf8  */
7103         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7104     }
7105
7106     RExC_precomp = exp;
7107     RExC_precomp_adj = 0;
7108     RExC_flags = rx_flags;
7109     RExC_pm_flags = pm_flags;
7110
7111     if (runtime_code) {
7112         assert(TAINTING_get || !TAINT_get);
7113         if (TAINT_get)
7114             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7115
7116         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7117             /* whoops, we have a non-utf8 pattern, whilst run-time code
7118              * got compiled as utf8. Try again with a utf8 pattern */
7119             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7120                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7121             goto redo_first_pass;
7122         }
7123     }
7124     assert(!pRExC_state->runtime_code_qr);
7125
7126     RExC_sawback = 0;
7127
7128     RExC_seen = 0;
7129     RExC_maxlen = 0;
7130     RExC_in_lookbehind = 0;
7131     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7132     RExC_extralen = 0;
7133 #ifdef EBCDIC
7134     RExC_recode_x_to_native = 0;
7135 #endif
7136     RExC_in_multi_char_class = 0;
7137
7138     /* First pass: determine size, legality. */
7139     RExC_parse = exp;
7140     RExC_start = RExC_adjusted_start = exp;
7141     RExC_end = exp + plen;
7142     RExC_precomp_end = RExC_end;
7143     RExC_naughty = 0;
7144     RExC_npar = 1;
7145     RExC_nestroot = 0;
7146     RExC_size = 0L;
7147     RExC_emit = (regnode *) &RExC_emit_dummy;
7148     RExC_whilem_seen = 0;
7149     RExC_open_parens = NULL;
7150     RExC_close_parens = NULL;
7151     RExC_end_op = NULL;
7152     RExC_paren_names = NULL;
7153 #ifdef DEBUGGING
7154     RExC_paren_name_list = NULL;
7155 #endif
7156     RExC_recurse = NULL;
7157     RExC_study_chunk_recursed = NULL;
7158     RExC_study_chunk_recursed_bytes= 0;
7159     RExC_recurse_count = 0;
7160     pRExC_state->code_index = 0;
7161
7162     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7163      * code makes sure the final byte is an uncounted NUL.  But should this
7164      * ever not be the case, lots of things could read beyond the end of the
7165      * buffer: loops like
7166      *      while(isFOO(*RExC_parse)) RExC_parse++;
7167      *      strchr(RExC_parse, "foo");
7168      * etc.  So it is worth noting. */
7169     assert(*RExC_end == '\0');
7170
7171     DEBUG_PARSE_r(
7172         Perl_re_printf( aTHX_  "Starting first pass (sizing)\n");
7173         RExC_lastnum=0;
7174         RExC_lastparse=NULL;
7175     );
7176
7177     if (reg(pRExC_state, 0, &flags,1) == NULL) {
7178         /* It's possible to write a regexp in ascii that represents Unicode
7179         codepoints outside of the byte range, such as via \x{100}. If we
7180         detect such a sequence we have to convert the entire pattern to utf8
7181         and then recompile, as our sizing calculation will have been based
7182         on 1 byte == 1 character, but we will need to use utf8 to encode
7183         at least some part of the pattern, and therefore must convert the whole
7184         thing.
7185         -- dmq */
7186         if (flags & RESTART_PASS1) {
7187             if (flags & NEED_UTF8) {
7188                 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7189                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7190             }
7191             else {
7192                 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7193                 "Need to redo pass 1\n"));
7194             }
7195
7196             goto redo_first_pass;
7197         }
7198         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#" UVxf, (UV) flags);
7199     }
7200
7201     DEBUG_PARSE_r({
7202         Perl_re_printf( aTHX_
7203             "Required size %" IVdf " nodes\n"
7204             "Starting second pass (creation)\n",
7205             (IV)RExC_size);
7206         RExC_lastnum=0;
7207         RExC_lastparse=NULL;
7208     });
7209
7210     /* The first pass could have found things that force Unicode semantics */
7211     if ((RExC_utf8 || RExC_uni_semantics)
7212          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
7213     {
7214         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7215     }
7216
7217     /* Small enough for pointer-storage convention?
7218        If extralen==0, this means that we will not need long jumps. */
7219     if (RExC_size >= 0x10000L && RExC_extralen)
7220         RExC_size += RExC_extralen;
7221     else
7222         RExC_extralen = 0;
7223     if (RExC_whilem_seen > 15)
7224         RExC_whilem_seen = 15;
7225
7226     /* Allocate space and zero-initialize. Note, the two step process
7227        of zeroing when in debug mode, thus anything assigned has to
7228        happen after that */
7229     rx = (REGEXP*) newSV_type(SVt_REGEXP);
7230     r = ReANY(rx);
7231     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7232          char, regexp_internal);
7233     if ( r == NULL || ri == NULL )
7234         FAIL("Regexp out of space");
7235 #ifdef DEBUGGING
7236     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
7237     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7238          char);
7239 #else
7240     /* bulk initialize base fields with 0. */
7241     Zero(ri, sizeof(regexp_internal), char);
7242 #endif
7243
7244     /* non-zero initialization begins here */
7245     RXi_SET( r, ri );
7246     r->engine= eng;
7247     r->extflags = rx_flags;
7248     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7249
7250     if (pm_flags & PMf_IS_QR) {
7251         ri->code_blocks = pRExC_state->code_blocks;
7252         if (ri->code_blocks)
7253             ri->code_blocks->refcnt++;
7254     }
7255
7256     {
7257         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7258         bool has_charset = (get_regex_charset(r->extflags)
7259                                                     != REGEX_DEPENDS_CHARSET);
7260
7261         /* The caret is output if there are any defaults: if not all the STD
7262          * flags are set, or if no character set specifier is needed */
7263         bool has_default =
7264                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7265                     || ! has_charset);
7266         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7267                                                    == REG_RUN_ON_COMMENT_SEEN);
7268         U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
7269                             >> RXf_PMf_STD_PMMOD_SHIFT);
7270         const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7271         char *p;
7272
7273         /* We output all the necessary flags; we never output a minus, as all
7274          * those are defaults, so are
7275          * covered by the caret */
7276         const STRLEN wraplen = plen + has_p + has_runon
7277             + has_default       /* If needs a caret */
7278             + PL_bitcount[reganch] /* 1 char for each set standard flag */
7279
7280                 /* If needs a character set specifier */
7281             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7282             + (sizeof("(?:)") - 1);
7283
7284         /* make sure PL_bitcount bounds not exceeded */
7285         assert(sizeof(STD_PAT_MODS) <= 8);
7286
7287         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
7288         SvPOK_on(rx);
7289         if (RExC_utf8)
7290             SvFLAGS(rx) |= SVf_UTF8;
7291         *p++='('; *p++='?';
7292
7293         /* If a default, cover it using the caret */
7294         if (has_default) {
7295             *p++= DEFAULT_PAT_MOD;
7296         }
7297         if (has_charset) {
7298             STRLEN len;
7299             const char* const name = get_regex_charset_name(r->extflags, &len);
7300             Copy(name, p, len, char);
7301             p += len;
7302         }
7303         if (has_p)
7304             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7305         {
7306             char ch;
7307             while((ch = *fptr++)) {
7308                 if(reganch & 1)
7309                     *p++ = ch;
7310                 reganch >>= 1;
7311             }
7312         }
7313
7314         *p++ = ':';
7315         Copy(RExC_precomp, p, plen, char);
7316         assert ((RX_WRAPPED(rx) - p) < 16);
7317         r->pre_prefix = p - RX_WRAPPED(rx);
7318         p += plen;
7319         if (has_runon)
7320             *p++ = '\n';
7321         *p++ = ')';
7322         *p = 0;
7323         SvCUR_set(rx, p - RX_WRAPPED(rx));
7324     }
7325
7326     r->intflags = 0;
7327     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
7328
7329     /* Useful during FAIL. */
7330 #ifdef RE_TRACK_PATTERN_OFFSETS
7331     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
7332     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7333                           "%s %" UVuf " bytes for offset annotations.\n",
7334                           ri->u.offsets ? "Got" : "Couldn't get",
7335                           (UV)((2*RExC_size+1) * sizeof(U32))));
7336 #endif
7337     SetProgLen(ri,RExC_size);
7338     RExC_rx_sv = rx;
7339     RExC_rx = r;
7340     RExC_rxi = ri;
7341
7342     /* Second pass: emit code. */
7343     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7344     RExC_pm_flags = pm_flags;
7345     RExC_parse = exp;
7346     RExC_end = exp + plen;
7347     RExC_naughty = 0;
7348     RExC_emit_start = ri->program;
7349     RExC_emit = ri->program;
7350     RExC_emit_bound = ri->program + RExC_size + 1;
7351     pRExC_state->code_index = 0;
7352
7353     *((char*) RExC_emit++) = (char) REG_MAGIC;
7354     /* setup various meta data about recursion, this all requires
7355      * RExC_npar to be correctly set, and a bit later on we clear it */
7356     if (RExC_seen & REG_RECURSE_SEEN) {
7357         DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
7358             "%*s%*s Setting up open/close parens\n",
7359                   22, "|    |", (int)(0 * 2 + 1), ""));
7360
7361         /* setup RExC_open_parens, which holds the address of each
7362          * OPEN tag, and to make things simpler for the 0 index
7363          * the start of the program - this is used later for offsets */
7364         Newxz(RExC_open_parens, RExC_npar,regnode *);
7365         SAVEFREEPV(RExC_open_parens);
7366         RExC_open_parens[0] = RExC_emit;
7367
7368         /* setup RExC_close_parens, which holds the address of each
7369          * CLOSE tag, and to make things simpler for the 0 index
7370          * the end of the program - this is used later for offsets */
7371         Newxz(RExC_close_parens, RExC_npar,regnode *);
7372         SAVEFREEPV(RExC_close_parens);
7373         /* we dont know where end op starts yet, so we dont
7374          * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */
7375
7376         /* Note, RExC_npar is 1 + the number of parens in a pattern.
7377          * So its 1 if there are no parens. */
7378         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
7379                                          ((RExC_npar & 0x07) != 0);
7380         Newx(RExC_study_chunk_recursed,
7381              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7382         SAVEFREEPV(RExC_study_chunk_recursed);
7383     }
7384     RExC_npar = 1;
7385     if (reg(pRExC_state, 0, &flags,1) == NULL) {
7386         ReREFCNT_dec(rx);
7387         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#" UVxf, (UV) flags);
7388     }
7389     DEBUG_OPTIMISE_r(
7390         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7391     );
7392
7393     /* XXXX To minimize changes to RE engine we always allocate
7394        3-units-long substrs field. */
7395     Newx(r->substrs, 1, struct reg_substr_data);
7396     if (RExC_recurse_count) {
7397         Newx(RExC_recurse,RExC_recurse_count,regnode *);
7398         SAVEFREEPV(RExC_recurse);
7399     }
7400
7401   reStudy:
7402     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7403     DEBUG_r(
7404         RExC_study_chunk_recursed_count= 0;
7405     );
7406     Zero(r->substrs, 1, struct reg_substr_data);
7407     if (RExC_study_chunk_recursed) {
7408         Zero(RExC_study_chunk_recursed,
7409              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7410     }
7411
7412
7413 #ifdef TRIE_STUDY_OPT
7414     if (!restudied) {
7415         StructCopy(&zero_scan_data, &data, scan_data_t);
7416         copyRExC_state = RExC_state;
7417     } else {
7418         U32 seen=RExC_seen;
7419         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7420
7421         RExC_state = copyRExC_state;
7422         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7423             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7424         else
7425             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7426         StructCopy(&zero_scan_data, &data, scan_data_t);
7427     }
7428 #else
7429     StructCopy(&zero_scan_data, &data, scan_data_t);
7430 #endif
7431
7432     /* Dig out information for optimizations. */
7433     r->extflags = RExC_flags; /* was pm_op */
7434     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7435
7436     if (UTF)
7437         SvUTF8_on(rx);  /* Unicode in it? */
7438     ri->regstclass = NULL;
7439     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
7440         r->intflags |= PREGf_NAUGHTY;
7441     scan = ri->program + 1;             /* First BRANCH. */
7442
7443     /* testing for BRANCH here tells us whether there is "must appear"
7444        data in the pattern. If there is then we can use it for optimisations */
7445     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7446                                                   */
7447         SSize_t fake;
7448         STRLEN longest_length[2];
7449         regnode_ssc ch_class; /* pointed to by data */
7450         int stclass_flag;
7451         SSize_t last_close = 0; /* pointed to by data */
7452         regnode *first= scan;
7453         regnode *first_next= regnext(first);
7454         int i;
7455
7456         /*
7457          * Skip introductions and multiplicators >= 1
7458          * so that we can extract the 'meat' of the pattern that must
7459          * match in the large if() sequence following.
7460          * NOTE that EXACT is NOT covered here, as it is normally
7461          * picked up by the optimiser separately.
7462          *
7463          * This is unfortunate as the optimiser isnt handling lookahead
7464          * properly currently.
7465          *
7466          */
7467         while ((OP(first) == OPEN && (sawopen = 1)) ||
7468                /* An OR of *one* alternative - should not happen now. */
7469             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7470             /* for now we can't handle lookbehind IFMATCH*/
7471             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7472             (OP(first) == PLUS) ||
7473             (OP(first) == MINMOD) ||
7474                /* An {n,m} with n>0 */
7475             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7476             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7477         {
7478                 /*
7479                  * the only op that could be a regnode is PLUS, all the rest
7480                  * will be regnode_1 or regnode_2.
7481                  *
7482                  * (yves doesn't think this is true)
7483                  */
7484                 if (OP(first) == PLUS)
7485                     sawplus = 1;
7486                 else {
7487                     if (OP(first) == MINMOD)
7488                         sawminmod = 1;
7489                     first += regarglen[OP(first)];
7490                 }
7491                 first = NEXTOPER(first);
7492                 first_next= regnext(first);
7493         }
7494
7495         /* Starting-point info. */
7496       again:
7497         DEBUG_PEEP("first:", first, 0, 0);
7498         /* Ignore EXACT as we deal with it later. */
7499         if (PL_regkind[OP(first)] == EXACT) {
7500             if (OP(first) == EXACT || OP(first) == EXACTL)
7501                 NOOP;   /* Empty, get anchored substr later. */
7502             else
7503                 ri->regstclass = first;
7504         }
7505 #ifdef TRIE_STCLASS
7506         else if (PL_regkind[OP(first)] == TRIE &&
7507                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7508         {
7509             /* this can happen only on restudy */
7510             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7511         }
7512 #endif
7513         else if (REGNODE_SIMPLE(OP(first)))
7514             ri->regstclass = first;
7515         else if (PL_regkind[OP(first)] == BOUND ||
7516                  PL_regkind[OP(first)] == NBOUND)
7517             ri->regstclass = first;
7518         else if (PL_regkind[OP(first)] == BOL) {
7519             r->intflags |= (OP(first) == MBOL
7520                            ? PREGf_ANCH_MBOL
7521                            : PREGf_ANCH_SBOL);
7522             first = NEXTOPER(first);
7523             goto again;
7524         }
7525         else if (OP(first) == GPOS) {
7526             r->intflags |= PREGf_ANCH_GPOS;
7527             first = NEXTOPER(first);
7528             goto again;
7529         }
7530         else if ((!sawopen || !RExC_sawback) &&
7531             !sawlookahead &&
7532             (OP(first) == STAR &&
7533             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7534             !(r->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
7535         {
7536             /* turn .* into ^.* with an implied $*=1 */
7537             const int type =
7538                 (OP(NEXTOPER(first)) == REG_ANY)
7539                     ? PREGf_ANCH_MBOL
7540                     : PREGf_ANCH_SBOL;
7541             r->intflags |= (type | PREGf_IMPLICIT);
7542             first = NEXTOPER(first);
7543             goto again;
7544         }
7545         if (sawplus && !sawminmod && !sawlookahead
7546             && (!sawopen || !RExC_sawback)
7547             && !pRExC_state->code_blocks) /* May examine pos and $& */
7548             /* x+ must match at the 1st pos of run of x's */
7549             r->intflags |= PREGf_SKIP;
7550
7551         /* Scan is after the zeroth branch, first is atomic matcher. */
7552 #ifdef TRIE_STUDY_OPT
7553         DEBUG_PARSE_r(
7554             if (!restudied)
7555                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
7556                               (IV)(first - scan + 1))
7557         );
7558 #else
7559         DEBUG_PARSE_r(
7560             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
7561                 (IV)(first - scan + 1))
7562         );
7563 #endif
7564
7565
7566         /*
7567         * If there's something expensive in the r.e., find the
7568         * longest literal string that must appear and make it the
7569         * regmust.  Resolve ties in favor of later strings, since
7570         * the regstart check works with the beginning of the r.e.
7571         * and avoiding duplication strengthens checking.  Not a
7572         * strong reason, but sufficient in the absence of others.
7573         * [Now we resolve ties in favor of the earlier string if
7574         * it happens that c_offset_min has been invalidated, since the
7575         * earlier string may buy us something the later one won't.]
7576         */
7577
7578         data.substrs[0].str = newSVpvs("");
7579         data.substrs[1].str = newSVpvs("");
7580         data.last_found = newSVpvs("");
7581         data.cur_is_floating = 0; /* initially any found substring is fixed */
7582         ENTER_with_name("study_chunk");
7583         SAVEFREESV(data.substrs[0].str);
7584         SAVEFREESV(data.substrs[1].str);
7585         SAVEFREESV(data.last_found);
7586         first = scan;
7587         if (!ri->regstclass) {
7588             ssc_init(pRExC_state, &ch_class);
7589             data.start_class = &ch_class;
7590             stclass_flag = SCF_DO_STCLASS_AND;
7591         } else                          /* XXXX Check for BOUND? */
7592             stclass_flag = 0;
7593         data.last_closep = &last_close;
7594
7595         DEBUG_RExC_seen();
7596         /*
7597          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
7598          * (NO top level branches)
7599          */
7600         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7601                              scan + RExC_size, /* Up to end */
7602             &data, -1, 0, NULL,
7603             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7604                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7605             0);
7606
7607
7608         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7609
7610
7611         if ( RExC_npar == 1 && !data.cur_is_floating
7612              && data.last_start_min == 0 && data.last_end > 0
7613              && !RExC_seen_zerolen
7614              && !(RExC_seen & REG_VERBARG_SEEN)
7615              && !(RExC_seen & REG_GPOS_SEEN)
7616         ){
7617             r->extflags |= RXf_CHECK_ALL;
7618         }
7619         scan_commit(pRExC_state, &data,&minlen,0);
7620
7621
7622         /* XXX this is done in reverse order because that's the way the
7623          * code was before it was parameterised. Don't know whether it
7624          * actually needs doing in reverse order. DAPM */
7625         for (i = 1; i >= 0; i--) {
7626             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
7627
7628             if (   !(   i
7629                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
7630                      &&    data.substrs[0].min_offset
7631                         == data.substrs[1].min_offset
7632                      &&    SvCUR(data.substrs[0].str)
7633                         == SvCUR(data.substrs[1].str)
7634                     )
7635                 && S_setup_longest (aTHX_ pRExC_state,
7636                                         &(r->substrs->data[i]),
7637                                         &(data.substrs[i]),
7638                                         longest_length[i]))
7639             {
7640                 r->substrs->data[i].min_offset =
7641                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
7642
7643                 r->substrs->data[i].max_offset = data.substrs[i].max_offset;
7644                 /* Don't offset infinity */
7645                 if (data.substrs[i].max_offset < SSize_t_MAX)
7646                     r->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
7647                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
7648             }
7649             else {
7650                 r->substrs->data[i].substr      = NULL;
7651                 r->substrs->data[i].utf8_substr = NULL;
7652                 longest_length[i] = 0;
7653             }
7654         }
7655
7656         LEAVE_with_name("study_chunk");
7657
7658         if (ri->regstclass
7659             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7660             ri->regstclass = NULL;
7661
7662         if ((!(r->substrs->data[0].substr || r->substrs->data[0].utf8_substr)
7663               || r->substrs->data[0].min_offset)
7664             && stclass_flag
7665             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7666             && is_ssc_worth_it(pRExC_state, data.start_class))
7667         {
7668             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7669
7670             ssc_finalize(pRExC_state, data.start_class);
7671
7672             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7673             StructCopy(data.start_class,
7674                        (regnode_ssc*)RExC_rxi->data->data[n],
7675                        regnode_ssc);
7676             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7677             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7678             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7679                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7680                       Perl_re_printf( aTHX_
7681                                     "synthetic stclass \"%s\".\n",
7682                                     SvPVX_const(sv));});
7683             data.start_class = NULL;
7684         }
7685
7686         /* A temporary algorithm prefers floated substr to fixed one of
7687          * same length to dig more info. */
7688         i = (longest_length[0] <= longest_length[1]);
7689         r->substrs->check_ix = i;
7690         r->check_end_shift  = r->substrs->data[i].end_shift;
7691         r->check_substr     = r->substrs->data[i].substr;
7692         r->check_utf8       = r->substrs->data[i].utf8_substr;
7693         r->check_offset_min = r->substrs->data[i].min_offset;
7694         r->check_offset_max = r->substrs->data[i].max_offset;
7695         if (!i && (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
7696             r->intflags |= PREGf_NOSCAN;
7697
7698         if ((r->check_substr || r->check_utf8) ) {
7699             r->extflags |= RXf_USE_INTUIT;
7700             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7701                 r->extflags |= RXf_INTUIT_TAIL;
7702         }
7703
7704         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7705         if ( (STRLEN)minlen < longest_length[1] )
7706             minlen= longest_length[1];
7707         if ( (STRLEN)minlen < longest_length[0] )
7708             minlen= longest_length[0];
7709         */
7710     }
7711     else {
7712         /* Several toplevels. Best we can is to set minlen. */
7713         SSize_t fake;
7714         regnode_ssc ch_class;
7715         SSize_t last_close = 0;
7716
7717         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
7718
7719         scan = ri->program + 1;
7720         ssc_init(pRExC_state, &ch_class);
7721         data.start_class = &ch_class;
7722         data.last_closep = &last_close;
7723
7724         DEBUG_RExC_seen();
7725         /*
7726          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
7727          * (patterns WITH top level branches)
7728          */
7729         minlen = study_chunk(pRExC_state,
7730             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7731             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7732                                                       ? SCF_TRIE_DOING_RESTUDY
7733                                                       : 0),
7734             0);
7735
7736         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7737
7738         r->check_substr = NULL;
7739         r->check_utf8 = NULL;
7740         r->substrs->data[0].substr      = NULL;
7741         r->substrs->data[0].utf8_substr = NULL;
7742         r->substrs->data[1].substr      = NULL;
7743         r->substrs->data[1].utf8_substr = NULL;
7744
7745         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7746             && is_ssc_worth_it(pRExC_state, data.start_class))
7747         {
7748             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7749
7750             ssc_finalize(pRExC_state, data.start_class);
7751
7752             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7753             StructCopy(data.start_class,
7754                        (regnode_ssc*)RExC_rxi->data->data[n],
7755                        regnode_ssc);
7756             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7757             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7758             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7759                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7760                       Perl_re_printf( aTHX_
7761                                     "synthetic stclass \"%s\".\n",
7762                                     SvPVX_const(sv));});
7763             data.start_class = NULL;
7764         }
7765     }
7766
7767     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7768         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7769         r->maxlen = REG_INFTY;
7770     }
7771     else {
7772         r->maxlen = RExC_maxlen;
7773     }
7774
7775     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7776        the "real" pattern. */
7777     DEBUG_OPTIMISE_r({
7778         Perl_re_printf( aTHX_ "minlen: %" IVdf " r->minlen:%" IVdf " maxlen:%" IVdf "\n",
7779                       (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7780     });
7781     r->minlenret = minlen;
7782     if (r->minlen < minlen)
7783         r->minlen = minlen;
7784
7785     if (RExC_seen & REG_RECURSE_SEEN ) {
7786         r->intflags |= PREGf_RECURSE_SEEN;
7787         Newx(r->recurse_locinput, r->nparens + 1, char *);
7788     }
7789     if (RExC_seen & REG_GPOS_SEEN)
7790         r->intflags |= PREGf_GPOS_SEEN;
7791     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7792         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7793                                                 lookbehind */
7794     if (pRExC_state->code_blocks)
7795         r->extflags |= RXf_EVAL_SEEN;
7796     if (RExC_seen & REG_VERBARG_SEEN)
7797     {
7798         r->intflags |= PREGf_VERBARG_SEEN;
7799         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7800     }
7801     if (RExC_seen & REG_CUTGROUP_SEEN)
7802         r->intflags |= PREGf_CUTGROUP_SEEN;
7803     if (pm_flags & PMf_USE_RE_EVAL)
7804         r->intflags |= PREGf_USE_RE_EVAL;
7805     if (RExC_paren_names)
7806         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7807     else
7808         RXp_PAREN_NAMES(r) = NULL;
7809
7810     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7811      * so it can be used in pp.c */
7812     if (r->intflags & PREGf_ANCH)
7813         r->extflags |= RXf_IS_ANCHORED;
7814
7815
7816     {
7817         /* this is used to identify "special" patterns that might result
7818          * in Perl NOT calling the regex engine and instead doing the match "itself",
7819          * particularly special cases in split//. By having the regex compiler
7820          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7821          * we avoid weird issues with equivalent patterns resulting in different behavior,
7822          * AND we allow non Perl engines to get the same optimizations by the setting the
7823          * flags appropriately - Yves */
7824         regnode *first = ri->program + 1;
7825         U8 fop = OP(first);
7826         regnode *next = regnext(first);
7827         U8 nop = OP(next);
7828
7829         if (PL_regkind[fop] == NOTHING && nop == END)
7830             r->extflags |= RXf_NULL;
7831         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7832             /* when fop is SBOL first->flags will be true only when it was
7833              * produced by parsing /\A/, and not when parsing /^/. This is
7834              * very important for the split code as there we want to
7835              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7836              * See rt #122761 for more details. -- Yves */
7837             r->extflags |= RXf_START_ONLY;
7838         else if (fop == PLUS
7839                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7840                  && nop == END)
7841             r->extflags |= RXf_WHITE;
7842         else if ( r->extflags & RXf_SPLIT
7843                   && (fop == EXACT || fop == EXACTL)
7844                   && STR_LEN(first) == 1
7845                   && *(STRING(first)) == ' '
7846                   && nop == END )
7847             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7848
7849     }
7850
7851     if (RExC_contains_locale) {
7852         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7853     }
7854
7855 #ifdef DEBUGGING
7856     if (RExC_paren_names) {
7857         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7858         ri->data->data[ri->name_list_idx]
7859                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7860     } else
7861 #endif
7862     ri->name_list_idx = 0;
7863
7864     while ( RExC_recurse_count > 0 ) {
7865         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
7866         /*
7867          * This data structure is set up in study_chunk() and is used
7868          * to calculate the distance between a GOSUB regopcode and
7869          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
7870          * it refers to.
7871          *
7872          * If for some reason someone writes code that optimises
7873          * away a GOSUB opcode then the assert should be changed to
7874          * an if(scan) to guard the ARG2L_SET() - Yves
7875          *
7876          */
7877         assert(scan && OP(scan) == GOSUB);
7878         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan );
7879     }
7880
7881     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7882     /* assume we don't need to swap parens around before we match */
7883     DEBUG_TEST_r({
7884         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
7885             (unsigned long)RExC_study_chunk_recursed_count);
7886     });
7887     DEBUG_DUMP_r({
7888         DEBUG_RExC_seen();
7889         Perl_re_printf( aTHX_ "Final program:\n");
7890         regdump(r);
7891     });
7892 #ifdef RE_TRACK_PATTERN_OFFSETS
7893     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7894         const STRLEN len = ri->u.offsets[0];
7895         STRLEN i;
7896         GET_RE_DEBUG_FLAGS_DECL;
7897         Perl_re_printf( aTHX_
7898                       "Offsets: [%" UVuf "]\n\t", (UV)ri->u.offsets[0]);
7899         for (i = 1; i <= len; i++) {
7900             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7901                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7902                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7903             }
7904         Perl_re_printf( aTHX_  "\n");
7905     });
7906 #endif
7907
7908 #ifdef USE_ITHREADS
7909     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7910      * by setting the regexp SV to readonly-only instead. If the
7911      * pattern's been recompiled, the USEDness should remain. */
7912     if (old_re && SvREADONLY(old_re))
7913         SvREADONLY_on(rx);
7914 #endif
7915     return rx;
7916 }
7917
7918
7919 SV*
7920 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7921                     const U32 flags)
7922 {
7923     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7924
7925     PERL_UNUSED_ARG(value);
7926
7927     if (flags & RXapif_FETCH) {
7928         return reg_named_buff_fetch(rx, key, flags);
7929     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7930         Perl_croak_no_modify();
7931         return NULL;
7932     } else if (flags & RXapif_EXISTS) {
7933         return reg_named_buff_exists(rx, key, flags)
7934             ? &PL_sv_yes
7935             : &PL_sv_no;
7936     } else if (flags & RXapif_REGNAMES) {
7937         return reg_named_buff_all(rx, flags);
7938     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7939         return reg_named_buff_scalar(rx, flags);
7940     } else {
7941         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7942         return NULL;
7943     }
7944 }
7945
7946 SV*
7947 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7948                          const U32 flags)
7949 {
7950     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7951     PERL_UNUSED_ARG(lastkey);
7952
7953     if (flags & RXapif_FIRSTKEY)
7954         return reg_named_buff_firstkey(rx, flags);
7955     else if (flags & RXapif_NEXTKEY)
7956         return reg_named_buff_nextkey(rx, flags);
7957     else {
7958         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7959                                             (int)flags);
7960         return NULL;
7961     }
7962 }
7963
7964 SV*
7965 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7966                           const U32 flags)
7967 {
7968     SV *ret;
7969     struct regexp *const rx = ReANY(r);
7970
7971     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7972
7973     if (rx && RXp_PAREN_NAMES(rx)) {
7974         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7975         if (he_str) {
7976             IV i;
7977             SV* sv_dat=HeVAL(he_str);
7978             I32 *nums=(I32*)SvPVX(sv_dat);
7979             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
7980             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7981                 if ((I32)(rx->nparens) >= nums[i]
7982                     && rx->offs[nums[i]].start != -1
7983                     && rx->offs[nums[i]].end != -1)
7984                 {
7985                     ret = newSVpvs("");
7986                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7987                     if (!retarray)
7988                         return ret;
7989                 } else {
7990                     if (retarray)
7991                         ret = newSVsv(&PL_sv_undef);
7992                 }
7993                 if (retarray)
7994                     av_push(retarray, ret);
7995             }
7996             if (retarray)
7997                 return newRV_noinc(MUTABLE_SV(retarray));
7998         }
7999     }
8000     return NULL;
8001 }
8002
8003 bool
8004 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8005                            const U32 flags)
8006 {
8007     struct regexp *const rx = ReANY(r);
8008
8009     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8010
8011     if (rx && RXp_PAREN_NAMES(rx)) {
8012         if (flags & RXapif_ALL) {
8013             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8014         } else {
8015             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8016             if (sv) {
8017                 SvREFCNT_dec_NN(sv);
8018                 return TRUE;
8019             } else {
8020                 return FALSE;
8021             }
8022         }
8023     } else {
8024         return FALSE;
8025     }
8026 }
8027
8028 SV*
8029 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8030 {
8031     struct regexp *const rx = ReANY(r);
8032
8033     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8034
8035     if ( rx && RXp_PAREN_NAMES(rx) ) {
8036         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8037
8038         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8039     } else {
8040         return FALSE;
8041     }
8042 }
8043
8044 SV*
8045 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8046 {
8047     struct regexp *const rx = ReANY(r);
8048     GET_RE_DEBUG_FLAGS_DECL;
8049
8050     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8051
8052     if (rx && RXp_PAREN_NAMES(rx)) {
8053         HV *hv = RXp_PAREN_NAMES(rx);
8054         HE *temphe;
8055         while ( (temphe = hv_iternext_flags(hv,0)) ) {
8056             IV i;
8057             IV parno = 0;
8058             SV* sv_dat = HeVAL(temphe);
8059             I32 *nums = (I32*)SvPVX(sv_dat);
8060             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8061                 if ((I32)(rx->lastparen) >= nums[i] &&
8062                     rx->offs[nums[i]].start != -1 &&
8063                     rx->offs[nums[i]].end != -1)
8064                 {
8065                     parno = nums[i];
8066                     break;
8067                 }
8068             }
8069             if (parno || flags & RXapif_ALL) {
8070                 return newSVhek(HeKEY_hek(temphe));
8071             }
8072         }
8073     }
8074     return NULL;
8075 }
8076
8077 SV*
8078 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8079 {
8080     SV *ret;
8081     AV *av;
8082     SSize_t length;
8083     struct regexp *const rx = ReANY(r);
8084
8085     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8086
8087     if (rx && RXp_PAREN_NAMES(rx)) {
8088         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8089             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8090         } else if (flags & RXapif_ONE) {
8091             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8092             av = MUTABLE_AV(SvRV(ret));
8093             length = av_tindex(av);
8094             SvREFCNT_dec_NN(ret);
8095             return newSViv(length + 1);
8096         } else {
8097             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8098                                                 (int)flags);
8099             return NULL;
8100         }
8101     }
8102     return &PL_sv_undef;
8103 }
8104
8105 SV*
8106 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8107 {
8108     struct regexp *const rx = ReANY(r);
8109     AV *av = newAV();
8110
8111     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8112
8113     if (rx && RXp_PAREN_NAMES(rx)) {
8114         HV *hv= RXp_PAREN_NAMES(rx);
8115         HE *temphe;
8116         (void)hv_iterinit(hv);
8117         while ( (temphe = hv_iternext_flags(hv,0)) ) {
8118             IV i;
8119             IV parno = 0;
8120             SV* sv_dat = HeVAL(temphe);
8121             I32 *nums = (I32*)SvPVX(sv_dat);
8122             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8123                 if ((I32)(rx->lastparen) >= nums[i] &&
8124                     rx->offs[nums[i]].start != -1 &&
8125                     rx->offs[nums[i]].end != -1)
8126                 {
8127                     parno = nums[i];
8128                     break;
8129                 }
8130             }
8131             if (parno || flags & RXapif_ALL) {
8132                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8133             }
8134         }
8135     }
8136
8137     return newRV_noinc(MUTABLE_SV(av));
8138 }
8139
8140 void
8141 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8142                              SV * const sv)
8143 {
8144     struct regexp *const rx = ReANY(r);
8145     char *s = NULL;
8146     SSize_t i = 0;
8147     SSize_t s1, t1;
8148     I32 n = paren;
8149
8150     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8151
8152     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8153            || n == RX_BUFF_IDX_CARET_FULLMATCH
8154            || n == RX_BUFF_IDX_CARET_POSTMATCH
8155        )
8156     {
8157         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8158         if (!keepcopy) {
8159             /* on something like
8160              *    $r = qr/.../;
8161              *    /$qr/p;
8162              * the KEEPCOPY is set on the PMOP rather than the regex */
8163             if (PL_curpm && r == PM_GETRE(PL_curpm))
8164                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8165         }
8166         if (!keepcopy)
8167             goto ret_undef;
8168     }
8169
8170     if (!rx->subbeg)
8171         goto ret_undef;
8172
8173     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8174         /* no need to distinguish between them any more */
8175         n = RX_BUFF_IDX_FULLMATCH;
8176
8177     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8178         && rx->offs[0].start != -1)
8179     {
8180         /* $`, ${^PREMATCH} */
8181         i = rx->offs[0].start;
8182         s = rx->subbeg;
8183     }
8184     else
8185     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8186         && rx->offs[0].end != -1)
8187     {
8188         /* $', ${^POSTMATCH} */
8189         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8190         i = rx->sublen + rx->suboffset - rx->offs[0].end;
8191     }
8192     else
8193     if ( 0 <= n && n <= (I32)rx->nparens &&
8194         (s1 = rx->offs[n].start) != -1 &&
8195         (t1 = rx->offs[n].end) != -1)
8196     {
8197         /* $&, ${^MATCH},  $1 ... */
8198         i = t1 - s1;
8199         s = rx->subbeg + s1 - rx->suboffset;
8200     } else {
8201         goto ret_undef;
8202     }
8203
8204     assert(s >= rx->subbeg);
8205     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8206     if (i >= 0) {
8207 #ifdef NO_TAINT_SUPPORT
8208         sv_setpvn(sv, s, i);
8209 #else
8210         const int oldtainted = TAINT_get;
8211         TAINT_NOT;
8212         sv_setpvn(sv, s, i);
8213         TAINT_set(oldtainted);
8214 #endif
8215         if (RXp_MATCH_UTF8(rx))
8216             SvUTF8_on(sv);
8217         else
8218             SvUTF8_off(sv);
8219         if (TAINTING_get) {
8220             if (RXp_MATCH_TAINTED(rx)) {
8221                 if (SvTYPE(sv) >= SVt_PVMG) {
8222                     MAGIC* const mg = SvMAGIC(sv);
8223                     MAGIC* mgt;
8224                     TAINT;
8225                     SvMAGIC_set(sv, mg->mg_moremagic);
8226                     SvTAINT(sv);
8227                     if ((mgt = SvMAGIC(sv))) {
8228                         mg->mg_moremagic = mgt;
8229                         SvMAGIC_set(sv, mg);
8230                     }
8231                 } else {
8232                     TAINT;
8233                     SvTAINT(sv);
8234                 }
8235             } else
8236                 SvTAINTED_off(sv);
8237         }
8238     } else {
8239       ret_undef:
8240         sv_set_undef(sv);
8241         return;
8242     }
8243 }
8244
8245 void
8246 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8247                                                          SV const * const value)
8248 {
8249     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8250
8251     PERL_UNUSED_ARG(rx);
8252     PERL_UNUSED_ARG(paren);
8253     PERL_UNUSED_ARG(value);
8254
8255     if (!PL_localizing)
8256         Perl_croak_no_modify();
8257 }
8258
8259 I32
8260 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8261                               const I32 paren)
8262 {
8263     struct regexp *const rx = ReANY(r);
8264     I32 i;
8265     I32 s1, t1;
8266
8267     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8268
8269     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8270         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8271         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8272     )
8273     {
8274         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8275         if (!keepcopy) {
8276             /* on something like
8277              *    $r = qr/.../;
8278              *    /$qr/p;
8279              * the KEEPCOPY is set on the PMOP rather than the regex */
8280             if (PL_curpm && r == PM_GETRE(PL_curpm))
8281                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8282         }
8283         if (!keepcopy)
8284             goto warn_undef;
8285     }
8286
8287     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8288     switch (paren) {
8289       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8290       case RX_BUFF_IDX_PREMATCH:       /* $` */
8291         if (rx->offs[0].start != -1) {
8292                         i = rx->offs[0].start;
8293                         if (i > 0) {
8294                                 s1 = 0;
8295                                 t1 = i;
8296                                 goto getlen;
8297                         }
8298             }
8299         return 0;
8300
8301       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8302       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8303             if (rx->offs[0].end != -1) {
8304                         i = rx->sublen - rx->offs[0].end;
8305                         if (i > 0) {
8306                                 s1 = rx->offs[0].end;
8307                                 t1 = rx->sublen;
8308                                 goto getlen;
8309                         }
8310             }
8311         return 0;
8312
8313       default: /* $& / ${^MATCH}, $1, $2, ... */
8314             if (paren <= (I32)rx->nparens &&
8315             (s1 = rx->offs[paren].start) != -1 &&
8316             (t1 = rx->offs[paren].end) != -1)
8317             {
8318             i = t1 - s1;
8319             goto getlen;
8320         } else {
8321           warn_undef:
8322             if (ckWARN(WARN_UNINITIALIZED))
8323                 report_uninit((const SV *)sv);
8324             return 0;
8325         }
8326     }
8327   getlen:
8328     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8329         const char * const s = rx->subbeg - rx->suboffset + s1;
8330         const U8 *ep;
8331         STRLEN el;
8332
8333         i = t1 - s1;
8334         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8335                         i = el;
8336     }
8337     return i;
8338 }
8339
8340 SV*
8341 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8342 {
8343     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8344         PERL_UNUSED_ARG(rx);
8345         if (0)
8346             return NULL;
8347         else
8348             return newSVpvs("Regexp");
8349 }
8350
8351 /* Scans the name of a named buffer from the pattern.
8352  * If flags is REG_RSN_RETURN_NULL returns null.
8353  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8354  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8355  * to the parsed name as looked up in the RExC_paren_names hash.
8356  * If there is an error throws a vFAIL().. type exception.
8357  */
8358
8359 #define REG_RSN_RETURN_NULL    0
8360 #define REG_RSN_RETURN_NAME    1
8361 #define REG_RSN_RETURN_DATA    2
8362
8363 STATIC SV*
8364 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8365 {
8366     char *name_start = RExC_parse;
8367
8368     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8369
8370     assert (RExC_parse <= RExC_end);
8371     if (RExC_parse == RExC_end) NOOP;
8372     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8373          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8374           * using do...while */
8375         if (UTF)
8376             do {
8377                 RExC_parse += UTF8SKIP(RExC_parse);
8378             } while (   RExC_parse < RExC_end
8379                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8380         else
8381             do {
8382                 RExC_parse++;
8383             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8384     } else {
8385         RExC_parse++; /* so the <- from the vFAIL is after the offending
8386                          character */
8387         vFAIL("Group name must start with a non-digit word character");
8388     }
8389     if ( flags ) {
8390         SV* sv_name
8391             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8392                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8393         if ( flags == REG_RSN_RETURN_NAME)
8394             return sv_name;
8395         else if (flags==REG_RSN_RETURN_DATA) {
8396             HE *he_str = NULL;
8397             SV *sv_dat = NULL;
8398             if ( ! sv_name )      /* should not happen*/
8399                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8400             if (RExC_paren_names)
8401                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8402             if ( he_str )
8403                 sv_dat = HeVAL(he_str);
8404             if ( ! sv_dat )
8405                 vFAIL("Reference to nonexistent named group");
8406             return sv_dat;
8407         }
8408         else {
8409             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8410                        (unsigned long) flags);
8411         }
8412         NOT_REACHED; /* NOTREACHED */
8413     }
8414     return NULL;
8415 }
8416
8417 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8418     int num;                                                    \
8419     if (RExC_lastparse!=RExC_parse) {                           \
8420         Perl_re_printf( aTHX_  "%s",                                        \
8421             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8422                 RExC_end - RExC_parse, 16,                      \
8423                 "", "",                                         \
8424                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8425                 PERL_PV_PRETTY_ELLIPSES   |                     \
8426                 PERL_PV_PRETTY_LTGT       |                     \
8427                 PERL_PV_ESCAPE_RE         |                     \
8428                 PERL_PV_PRETTY_EXACTSIZE                        \
8429             )                                                   \
8430         );                                                      \
8431     } else                                                      \
8432         Perl_re_printf( aTHX_ "%16s","");                                   \
8433                                                                 \
8434     if (SIZE_ONLY)                                              \
8435        num = RExC_size + 1;                                     \
8436     else                                                        \
8437        num=REG_NODE_NUM(RExC_emit);                             \
8438     if (RExC_lastnum!=num)                                      \
8439        Perl_re_printf( aTHX_ "|%4d",num);                                   \
8440     else                                                        \
8441        Perl_re_printf( aTHX_ "|%4s","");                                    \
8442     Perl_re_printf( aTHX_ "|%*s%-4s",                                       \
8443         (int)((depth*2)), "",                                   \
8444         (funcname)                                              \
8445     );                                                          \
8446     RExC_lastnum=num;                                           \
8447     RExC_lastparse=RExC_parse;                                  \
8448 })
8449
8450
8451
8452 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8453     DEBUG_PARSE_MSG((funcname));                            \
8454     Perl_re_printf( aTHX_ "%4s","\n");                                  \
8455 })
8456 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
8457     DEBUG_PARSE_MSG((funcname));                            \
8458     Perl_re_printf( aTHX_ fmt "\n",args);                               \
8459 })
8460
8461 /* This section of code defines the inversion list object and its methods.  The
8462  * interfaces are highly subject to change, so as much as possible is static to
8463  * this file.  An inversion list is here implemented as a malloc'd C UV array
8464  * as an SVt_INVLIST scalar.
8465  *
8466  * An inversion list for Unicode is an array of code points, sorted by ordinal
8467  * number.  Each element gives the code point that begins a range that extends
8468  * up-to but not including the code point given by the next element.  The final
8469  * element gives the first code point of a range that extends to the platform's
8470  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
8471  * ...) give ranges whose code points are all in the inversion list.  We say
8472  * that those ranges are in the set.  The odd-numbered elements give ranges
8473  * whose code points are not in the inversion list, and hence not in the set.
8474  * Thus, element [0] is the first code point in the list.  Element [1]
8475  * is the first code point beyond that not in the list; and element [2] is the
8476  * first code point beyond that that is in the list.  In other words, the first
8477  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8478  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
8479  * all code points in that range are not in the inversion list.  The third
8480  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8481  * list, and so forth.  Thus every element whose index is divisible by two
8482  * gives the beginning of a range that is in the list, and every element whose
8483  * index is not divisible by two gives the beginning of a range not in the
8484  * list.  If the final element's index is divisible by two, the inversion list
8485  * extends to the platform's infinity; otherwise the highest code point in the
8486  * inversion list is the contents of that element minus 1.
8487  *
8488  * A range that contains just a single code point N will look like
8489  *  invlist[i]   == N
8490  *  invlist[i+1] == N+1
8491  *
8492  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8493  * impossible to represent, so element [i+1] is omitted.  The single element
8494  * inversion list
8495  *  invlist[0] == UV_MAX
8496  * contains just UV_MAX, but is interpreted as matching to infinity.
8497  *
8498  * Taking the complement (inverting) an inversion list is quite simple, if the
8499  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8500  * This implementation reserves an element at the beginning of each inversion
8501  * list to always contain 0; there is an additional flag in the header which
8502  * indicates if the list begins at the 0, or is offset to begin at the next
8503  * element.  This means that the inversion list can be inverted without any
8504  * copying; just flip the flag.
8505  *
8506  * More about inversion lists can be found in "Unicode Demystified"
8507  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8508  *
8509  * The inversion list data structure is currently implemented as an SV pointing
8510  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8511  * array of UV whose memory management is automatically handled by the existing
8512  * facilities for SV's.
8513  *
8514  * Some of the methods should always be private to the implementation, and some
8515  * should eventually be made public */
8516
8517 /* The header definitions are in F<invlist_inline.h> */
8518
8519 #ifndef PERL_IN_XSUB_RE
8520
8521 PERL_STATIC_INLINE UV*
8522 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8523 {
8524     /* Returns a pointer to the first element in the inversion list's array.
8525      * This is called upon initialization of an inversion list.  Where the
8526      * array begins depends on whether the list has the code point U+0000 in it
8527      * or not.  The other parameter tells it whether the code that follows this
8528      * call is about to put a 0 in the inversion list or not.  The first
8529      * element is either the element reserved for 0, if TRUE, or the element
8530      * after it, if FALSE */
8531
8532     bool* offset = get_invlist_offset_addr(invlist);
8533     UV* zero_addr = (UV *) SvPVX(invlist);
8534
8535     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8536
8537     /* Must be empty */
8538     assert(! _invlist_len(invlist));
8539
8540     *zero_addr = 0;
8541
8542     /* 1^1 = 0; 1^0 = 1 */
8543     *offset = 1 ^ will_have_0;
8544     return zero_addr + *offset;
8545 }
8546
8547 #endif
8548
8549 PERL_STATIC_INLINE void
8550 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8551 {
8552     /* Sets the current number of elements stored in the inversion list.
8553      * Updates SvCUR correspondingly */
8554     PERL_UNUSED_CONTEXT;
8555     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8556
8557     assert(SvTYPE(invlist) == SVt_INVLIST);
8558
8559     SvCUR_set(invlist,
8560               (len == 0)
8561                ? 0
8562                : TO_INTERNAL_SIZE(len + offset));
8563     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8564 }
8565
8566 #ifndef PERL_IN_XSUB_RE
8567
8568 STATIC void
8569 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
8570 {
8571     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
8572      * steals the list from 'src', so 'src' is made to have a NULL list.  This
8573      * is similar to what SvSetMagicSV() would do, if it were implemented on
8574      * inversion lists, though this routine avoids a copy */
8575
8576     const UV src_len          = _invlist_len(src);
8577     const bool src_offset     = *get_invlist_offset_addr(src);
8578     const STRLEN src_byte_len = SvLEN(src);
8579     char * array              = SvPVX(src);
8580
8581     const int oldtainted = TAINT_get;
8582
8583     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
8584
8585     assert(SvTYPE(src) == SVt_INVLIST);
8586     assert(SvTYPE(dest) == SVt_INVLIST);
8587     assert(! invlist_is_iterating(src));
8588     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
8589
8590     /* Make sure it ends in the right place with a NUL, as our inversion list
8591      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
8592      * asserts it */
8593     array[src_byte_len - 1] = '\0';
8594
8595     TAINT_NOT;      /* Otherwise it breaks */
8596     sv_usepvn_flags(dest,
8597                     (char *) array,
8598                     src_byte_len - 1,
8599
8600                     /* This flag is documented to cause a copy to be avoided */
8601                     SV_HAS_TRAILING_NUL);
8602     TAINT_set(oldtainted);
8603     SvPV_set(src, 0);
8604     SvLEN_set(src, 0);
8605     SvCUR_set(src, 0);
8606
8607     /* Finish up copying over the other fields in an inversion list */
8608     *get_invlist_offset_addr(dest) = src_offset;
8609     invlist_set_len(dest, src_len, src_offset);
8610     *get_invlist_previous_index_addr(dest) = 0;
8611     invlist_iterfinish(dest);
8612 }
8613
8614 PERL_STATIC_INLINE IV*
8615 S_get_invlist_previous_index_addr(SV* invlist)
8616 {
8617     /* Return the address of the IV that is reserved to hold the cached index
8618      * */
8619     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8620
8621     assert(SvTYPE(invlist) == SVt_INVLIST);
8622
8623     return &(((XINVLIST*) SvANY(invlist))->prev_index);
8624 }
8625
8626 PERL_STATIC_INLINE IV
8627 S_invlist_previous_index(SV* const invlist)
8628 {
8629     /* Returns cached index of previous search */
8630
8631     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8632
8633     return *get_invlist_previous_index_addr(invlist);
8634 }
8635
8636 PERL_STATIC_INLINE void
8637 S_invlist_set_previous_index(SV* const invlist, const IV index)
8638 {
8639     /* Caches <index> for later retrieval */
8640
8641     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8642
8643     assert(index == 0 || index < (int) _invlist_len(invlist));
8644
8645     *get_invlist_previous_index_addr(invlist) = index;
8646 }
8647
8648 PERL_STATIC_INLINE void
8649 S_invlist_trim(SV* invlist)
8650 {
8651     /* Free the not currently-being-used space in an inversion list */
8652
8653     /* But don't free up the space needed for the 0 UV that is always at the
8654      * beginning of the list, nor the trailing NUL */
8655     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
8656
8657     PERL_ARGS_ASSERT_INVLIST_TRIM;
8658
8659     assert(SvTYPE(invlist) == SVt_INVLIST);
8660
8661     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
8662 }
8663
8664 PERL_STATIC_INLINE void
8665 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
8666 {
8667     PERL_ARGS_ASSERT_INVLIST_CLEAR;
8668
8669     assert(SvTYPE(invlist) == SVt_INVLIST);
8670
8671     invlist_set_len(invlist, 0, 0);
8672     invlist_trim(invlist);
8673 }
8674
8675 #endif /* ifndef PERL_IN_XSUB_RE */
8676
8677 PERL_STATIC_INLINE bool
8678 S_invlist_is_iterating(SV* const invlist)
8679 {
8680     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8681
8682     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8683 }
8684
8685 #ifndef PERL_IN_XSUB_RE
8686
8687 PERL_STATIC_INLINE UV
8688 S_invlist_max(SV* const invlist)
8689 {
8690     /* Returns the maximum number of elements storable in the inversion list's
8691      * array, without having to realloc() */
8692
8693     PERL_ARGS_ASSERT_INVLIST_MAX;
8694
8695     assert(SvTYPE(invlist) == SVt_INVLIST);
8696
8697     /* Assumes worst case, in which the 0 element is not counted in the
8698      * inversion list, so subtracts 1 for that */
8699     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8700            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8701            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8702 }
8703 SV*
8704 Perl__new_invlist(pTHX_ IV initial_size)
8705 {
8706
8707     /* Return a pointer to a newly constructed inversion list, with enough
8708      * space to store 'initial_size' elements.  If that number is negative, a
8709      * system default is used instead */
8710
8711     SV* new_list;
8712
8713     if (initial_size < 0) {
8714         initial_size = 10;
8715     }
8716
8717     /* Allocate the initial space */
8718     new_list = newSV_type(SVt_INVLIST);
8719
8720     /* First 1 is in case the zero element isn't in the list; second 1 is for
8721      * trailing NUL */
8722     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8723     invlist_set_len(new_list, 0, 0);
8724
8725     /* Force iterinit() to be used to get iteration to work */
8726     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8727
8728     *get_invlist_previous_index_addr(new_list) = 0;
8729
8730     return new_list;
8731 }
8732
8733 SV*
8734 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8735 {
8736     /* Return a pointer to a newly constructed inversion list, initialized to
8737      * point to <list>, which has to be in the exact correct inversion list
8738      * form, including internal fields.  Thus this is a dangerous routine that
8739      * should not be used in the wrong hands.  The passed in 'list' contains
8740      * several header fields at the beginning that are not part of the
8741      * inversion list body proper */
8742
8743     const STRLEN length = (STRLEN) list[0];
8744     const UV version_id =          list[1];
8745     const bool offset   =    cBOOL(list[2]);
8746 #define HEADER_LENGTH 3
8747     /* If any of the above changes in any way, you must change HEADER_LENGTH
8748      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8749      *      perl -E 'say int(rand 2**31-1)'
8750      */
8751 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8752                                         data structure type, so that one being
8753                                         passed in can be validated to be an
8754                                         inversion list of the correct vintage.
8755                                        */
8756
8757     SV* invlist = newSV_type(SVt_INVLIST);
8758
8759     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8760
8761     if (version_id != INVLIST_VERSION_ID) {
8762         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8763     }
8764
8765     /* The generated array passed in includes header elements that aren't part
8766      * of the list proper, so start it just after them */
8767     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8768
8769     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8770                                shouldn't touch it */
8771
8772     *(get_invlist_offset_addr(invlist)) = offset;
8773
8774     /* The 'length' passed to us is the physical number of elements in the
8775      * inversion list.  But if there is an offset the logical number is one
8776      * less than that */
8777     invlist_set_len(invlist, length  - offset, offset);
8778
8779     invlist_set_previous_index(invlist, 0);
8780
8781     /* Initialize the iteration pointer. */
8782     invlist_iterfinish(invlist);
8783
8784     SvREADONLY_on(invlist);
8785
8786     return invlist;
8787 }
8788
8789 STATIC void
8790 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8791 {
8792     /* Grow the maximum size of an inversion list */
8793
8794     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8795
8796     assert(SvTYPE(invlist) == SVt_INVLIST);
8797
8798     /* Add one to account for the zero element at the beginning which may not
8799      * be counted by the calling parameters */
8800     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8801 }
8802
8803 STATIC void
8804 S__append_range_to_invlist(pTHX_ SV* const invlist,
8805                                  const UV start, const UV end)
8806 {
8807    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8808     * the end of the inversion list.  The range must be above any existing
8809     * ones. */
8810
8811     UV* array;
8812     UV max = invlist_max(invlist);
8813     UV len = _invlist_len(invlist);
8814     bool offset;
8815
8816     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8817
8818     if (len == 0) { /* Empty lists must be initialized */
8819         offset = start != 0;
8820         array = _invlist_array_init(invlist, ! offset);
8821     }
8822     else {
8823         /* Here, the existing list is non-empty. The current max entry in the
8824          * list is generally the first value not in the set, except when the
8825          * set extends to the end of permissible values, in which case it is
8826          * the first entry in that final set, and so this call is an attempt to
8827          * append out-of-order */
8828
8829         UV final_element = len - 1;
8830         array = invlist_array(invlist);
8831         if (   array[final_element] > start
8832             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8833         {
8834             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
8835                      array[final_element], start,
8836                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8837         }
8838
8839         /* Here, it is a legal append.  If the new range begins 1 above the end
8840          * of the range below it, it is extending the range below it, so the
8841          * new first value not in the set is one greater than the newly
8842          * extended range.  */
8843         offset = *get_invlist_offset_addr(invlist);
8844         if (array[final_element] == start) {
8845             if (end != UV_MAX) {
8846                 array[final_element] = end + 1;
8847             }
8848             else {
8849                 /* But if the end is the maximum representable on the machine,
8850                  * assume that infinity was actually what was meant.  Just let
8851                  * the range that this would extend to have no end */
8852                 invlist_set_len(invlist, len - 1, offset);
8853             }
8854             return;
8855         }
8856     }
8857
8858     /* Here the new range doesn't extend any existing set.  Add it */
8859
8860     len += 2;   /* Includes an element each for the start and end of range */
8861
8862     /* If wll overflow the existing space, extend, which may cause the array to
8863      * be moved */
8864     if (max < len) {
8865         invlist_extend(invlist, len);
8866
8867         /* Have to set len here to avoid assert failure in invlist_array() */
8868         invlist_set_len(invlist, len, offset);
8869
8870         array = invlist_array(invlist);
8871     }
8872     else {
8873         invlist_set_len(invlist, len, offset);
8874     }
8875
8876     /* The next item on the list starts the range, the one after that is
8877      * one past the new range.  */
8878     array[len - 2] = start;
8879     if (end != UV_MAX) {
8880         array[len - 1] = end + 1;
8881     }
8882     else {
8883         /* But if the end is the maximum representable on the machine, just let
8884          * the range have no end */
8885         invlist_set_len(invlist, len - 1, offset);
8886     }
8887 }
8888
8889 SSize_t
8890 Perl__invlist_search(SV* const invlist, const UV cp)
8891 {
8892     /* Searches the inversion list for the entry that contains the input code
8893      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8894      * return value is the index into the list's array of the range that
8895      * contains <cp>, that is, 'i' such that
8896      *  array[i] <= cp < array[i+1]
8897      */
8898
8899     IV low = 0;
8900     IV mid;
8901     IV high = _invlist_len(invlist);
8902     const IV highest_element = high - 1;
8903     const UV* array;
8904
8905     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8906
8907     /* If list is empty, return failure. */
8908     if (high == 0) {
8909         return -1;
8910     }
8911
8912     /* (We can't get the array unless we know the list is non-empty) */
8913     array = invlist_array(invlist);
8914
8915     mid = invlist_previous_index(invlist);
8916     assert(mid >=0);
8917     if (mid > highest_element) {
8918         mid = highest_element;
8919     }
8920
8921     /* <mid> contains the cache of the result of the previous call to this
8922      * function (0 the first time).  See if this call is for the same result,
8923      * or if it is for mid-1.  This is under the theory that calls to this
8924      * function will often be for related code points that are near each other.
8925      * And benchmarks show that caching gives better results.  We also test
8926      * here if the code point is within the bounds of the list.  These tests
8927      * replace others that would have had to be made anyway to make sure that
8928      * the array bounds were not exceeded, and these give us extra information
8929      * at the same time */
8930     if (cp >= array[mid]) {
8931         if (cp >= array[highest_element]) {
8932             return highest_element;
8933         }
8934
8935         /* Here, array[mid] <= cp < array[highest_element].  This means that
8936          * the final element is not the answer, so can exclude it; it also
8937          * means that <mid> is not the final element, so can refer to 'mid + 1'
8938          * safely */
8939         if (cp < array[mid + 1]) {
8940             return mid;
8941         }
8942         high--;
8943         low = mid + 1;
8944     }
8945     else { /* cp < aray[mid] */
8946         if (cp < array[0]) { /* Fail if outside the array */
8947             return -1;
8948         }
8949         high = mid;
8950         if (cp >= array[mid - 1]) {
8951             goto found_entry;
8952         }
8953     }
8954
8955     /* Binary search.  What we are looking for is <i> such that
8956      *  array[i] <= cp < array[i+1]
8957      * The loop below converges on the i+1.  Note that there may not be an
8958      * (i+1)th element in the array, and things work nonetheless */
8959     while (low < high) {
8960         mid = (low + high) / 2;
8961         assert(mid <= highest_element);
8962         if (array[mid] <= cp) { /* cp >= array[mid] */
8963             low = mid + 1;
8964
8965             /* We could do this extra test to exit the loop early.
8966             if (cp < array[low]) {
8967                 return mid;
8968             }
8969             */
8970         }
8971         else { /* cp < array[mid] */
8972             high = mid;
8973         }
8974     }
8975
8976   found_entry:
8977     high--;
8978     invlist_set_previous_index(invlist, high);
8979     return high;
8980 }
8981
8982 void
8983 Perl__invlist_populate_swatch(SV* const invlist,
8984                               const UV start, const UV end, U8* swatch)
8985 {
8986     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8987      * but is used when the swash has an inversion list.  This makes this much
8988      * faster, as it uses a binary search instead of a linear one.  This is
8989      * intimately tied to that function, and perhaps should be in utf8.c,
8990      * except it is intimately tied to inversion lists as well.  It assumes
8991      * that <swatch> is all 0's on input */
8992
8993     UV current = start;
8994     const IV len = _invlist_len(invlist);
8995     IV i;
8996     const UV * array;
8997
8998     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8999
9000     if (len == 0) { /* Empty inversion list */
9001         return;
9002     }
9003
9004     array = invlist_array(invlist);
9005
9006     /* Find which element it is */
9007     i = _invlist_search(invlist, start);
9008
9009     /* We populate from <start> to <end> */
9010     while (current < end) {
9011         UV upper;
9012
9013         /* The inversion list gives the results for every possible code point
9014          * after the first one in the list.  Only those ranges whose index is
9015          * even are ones that the inversion list matches.  For the odd ones,
9016          * and if the initial code point is not in the list, we have to skip
9017          * forward to the next element */
9018         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
9019             i++;
9020             if (i >= len) { /* Finished if beyond the end of the array */
9021                 return;
9022             }
9023             current = array[i];
9024             if (current >= end) {   /* Finished if beyond the end of what we
9025                                        are populating */
9026                 if (LIKELY(end < UV_MAX)) {
9027                     return;
9028                 }
9029
9030                 /* We get here when the upper bound is the maximum
9031                  * representable on the machine, and we are looking for just
9032                  * that code point.  Have to special case it */
9033                 i = len;
9034                 goto join_end_of_list;
9035             }
9036         }
9037         assert(current >= start);
9038
9039         /* The current range ends one below the next one, except don't go past
9040          * <end> */
9041         i++;
9042         upper = (i < len && array[i] < end) ? array[i] : end;
9043
9044         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
9045          * for each code point in it */
9046         for (; current < upper; current++) {
9047             const STRLEN offset = (STRLEN)(current - start);
9048             swatch[offset >> 3] |= 1 << (offset & 7);
9049         }
9050
9051       join_end_of_list:
9052
9053         /* Quit if at the end of the list */
9054         if (i >= len) {
9055
9056             /* But first, have to deal with the highest possible code point on
9057              * the platform.  The previous code assumes that <end> is one
9058              * beyond where we want to populate, but that is impossible at the
9059              * platform's infinity, so have to handle it specially */
9060             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
9061             {
9062                 const STRLEN offset = (STRLEN)(end - start);
9063                 swatch[offset >> 3] |= 1 << (offset & 7);
9064             }
9065             return;
9066         }
9067
9068         /* Advance to the next range, which will be for code points not in the
9069          * inversion list */
9070         current = array[i];
9071     }
9072
9073     return;
9074 }
9075
9076 void
9077 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9078                                          const bool complement_b, SV** output)
9079 {
9080     /* Take the union of two inversion lists and point '*output' to it.  On
9081      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9082      * even 'a' or 'b').  If to an inversion list, the contents of the original
9083      * list will be replaced by the union.  The first list, 'a', may be
9084      * NULL, in which case a copy of the second list is placed in '*output'.
9085      * If 'complement_b' is TRUE, the union is taken of the complement
9086      * (inversion) of 'b' instead of b itself.
9087      *
9088      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9089      * Richard Gillam, published by Addison-Wesley, and explained at some
9090      * length there.  The preface says to incorporate its examples into your
9091      * code at your own risk.
9092      *
9093      * The algorithm is like a merge sort. */
9094
9095     const UV* array_a;    /* a's array */
9096     const UV* array_b;
9097     UV len_a;       /* length of a's array */
9098     UV len_b;
9099
9100     SV* u;                      /* the resulting union */
9101     UV* array_u;
9102     UV len_u = 0;
9103
9104     UV i_a = 0;             /* current index into a's array */
9105     UV i_b = 0;
9106     UV i_u = 0;
9107
9108     /* running count, as explained in the algorithm source book; items are
9109      * stopped accumulating and are output when the count changes to/from 0.
9110      * The count is incremented when we start a range that's in an input's set,
9111      * and decremented when we start a range that's not in a set.  So this
9112      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9113      * and hence nothing goes into the union; 1, just one of the inputs is in
9114      * its set (and its current range gets added to the union); and 2 when both
9115      * inputs are in their sets.  */
9116     UV count = 0;
9117
9118     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9119     assert(a != b);
9120     assert(*output == NULL || SvTYPE(*output) == SVt_INVLIST);
9121
9122     len_b = _invlist_len(b);
9123     if (len_b == 0) {
9124
9125         /* Here, 'b' is empty, hence it's complement is all possible code
9126          * points.  So if the union includes the complement of 'b', it includes
9127          * everything, and we need not even look at 'a'.  It's easiest to
9128          * create a new inversion list that matches everything.  */
9129         if (complement_b) {
9130             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9131
9132             if (*output == NULL) { /* If the output didn't exist, just point it
9133                                       at the new list */
9134                 *output = everything;
9135             }
9136             else { /* Otherwise, replace its contents with the new list */
9137                 invlist_replace_list_destroys_src(*output, everything);
9138                 SvREFCNT_dec_NN(everything);
9139             }
9140
9141             return;
9142         }
9143
9144         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9145          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9146          * output will be empty */
9147
9148         if (a == NULL || _invlist_len(a) == 0) {
9149             if (*output == NULL) {
9150                 *output = _new_invlist(0);
9151             }
9152             else {
9153                 invlist_clear(*output);
9154             }
9155             return;
9156         }
9157
9158         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9159          * union.  We can just return a copy of 'a' if '*output' doesn't point
9160          * to an existing list */
9161         if (*output == NULL) {
9162             *output = invlist_clone(a);
9163             return;
9164         }
9165
9166         /* If the output is to overwrite 'a', we have a no-op, as it's
9167          * already in 'a' */
9168         if (*output == a) {
9169             return;
9170         }
9171
9172         /* Here, '*output' is to be overwritten by 'a' */
9173         u = invlist_clone(a);
9174         invlist_replace_list_destroys_src(*output, u);
9175         SvREFCNT_dec_NN(u);
9176
9177         return;
9178     }
9179
9180     /* Here 'b' is not empty.  See about 'a' */
9181
9182     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9183
9184         /* Here, 'a' is empty (and b is not).  That means the union will come
9185          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9186          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9187          * the clone */
9188
9189         SV ** dest = (*output == NULL) ? output : &u;
9190         *dest = invlist_clone(b);
9191         if (complement_b) {
9192             _invlist_invert(*dest);
9193         }
9194
9195         if (dest == &u) {
9196             invlist_replace_list_destroys_src(*output, u);
9197             SvREFCNT_dec_NN(u);
9198         }
9199
9200         return;
9201     }
9202
9203     /* Here both lists exist and are non-empty */
9204     array_a = invlist_array(a);
9205     array_b = invlist_array(b);
9206
9207     /* If are to take the union of 'a' with the complement of b, set it
9208      * up so are looking at b's complement. */
9209     if (complement_b) {
9210
9211         /* To complement, we invert: if the first element is 0, remove it.  To
9212          * do this, we just pretend the array starts one later */
9213         if (array_b[0] == 0) {
9214             array_b++;
9215             len_b--;
9216         }
9217         else {
9218
9219             /* But if the first element is not zero, we pretend the list starts
9220              * at the 0 that is always stored immediately before the array. */
9221             array_b--;
9222             len_b++;
9223         }
9224     }
9225
9226     /* Size the union for the worst case: that the sets are completely
9227      * disjoint */
9228     u = _new_invlist(len_a + len_b);
9229
9230     /* Will contain U+0000 if either component does */
9231     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9232                                       || (len_b > 0 && array_b[0] == 0));
9233
9234     /* Go through each input list item by item, stopping when have exhausted
9235      * one of them */
9236     while (i_a < len_a && i_b < len_b) {
9237         UV cp;      /* The element to potentially add to the union's array */
9238         bool cp_in_set;   /* is it in the the input list's set or not */
9239
9240         /* We need to take one or the other of the two inputs for the union.
9241          * Since we are merging two sorted lists, we take the smaller of the
9242          * next items.  In case of a tie, we take first the one that is in its
9243          * set.  If we first took the one not in its set, it would decrement
9244          * the count, possibly to 0 which would cause it to be output as ending
9245          * the range, and the next time through we would take the same number,
9246          * and output it again as beginning the next range.  By doing it the
9247          * opposite way, there is no possibility that the count will be
9248          * momentarily decremented to 0, and thus the two adjoining ranges will
9249          * be seamlessly merged.  (In a tie and both are in the set or both not
9250          * in the set, it doesn't matter which we take first.) */
9251         if (       array_a[i_a] < array_b[i_b]
9252             || (   array_a[i_a] == array_b[i_b]
9253                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9254         {
9255             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9256             cp = array_a[i_a++];
9257         }
9258         else {
9259             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9260             cp = array_b[i_b++];
9261         }
9262
9263         /* Here, have chosen which of the two inputs to look at.  Only output
9264          * if the running count changes to/from 0, which marks the
9265          * beginning/end of a range that's in the set */
9266         if (cp_in_set) {
9267             if (count == 0) {
9268                 array_u[i_u++] = cp;
9269             }
9270             count++;
9271         }
9272         else {
9273             count--;
9274             if (count == 0) {
9275                 array_u[i_u++] = cp;
9276             }
9277         }
9278     }
9279
9280
9281     /* The loop above increments the index into exactly one of the input lists
9282      * each iteration, and ends when either index gets to its list end.  That
9283      * means the other index is lower than its end, and so something is
9284      * remaining in that one.  We decrement 'count', as explained below, if
9285      * that list is in its set.  (i_a and i_b each currently index the element
9286      * beyond the one we care about.) */
9287     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9288         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9289     {
9290         count--;
9291     }
9292
9293     /* Above we decremented 'count' if the list that had unexamined elements in
9294      * it was in its set.  This has made it so that 'count' being non-zero
9295      * means there isn't anything left to output; and 'count' equal to 0 means
9296      * that what is left to output is precisely that which is left in the
9297      * non-exhausted input list.
9298      *
9299      * To see why, note first that the exhausted input obviously has nothing
9300      * left to add to the union.  If it was in its set at its end, that means
9301      * the set extends from here to the platform's infinity, and hence so does
9302      * the union and the non-exhausted set is irrelevant.  The exhausted set
9303      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9304      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9305      * 'count' remains at 1.  This is consistent with the decremented 'count'
9306      * != 0 meaning there's nothing left to add to the union.
9307      *
9308      * But if the exhausted input wasn't in its set, it contributed 0 to
9309      * 'count', and the rest of the union will be whatever the other input is.
9310      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9311      * otherwise it gets decremented to 0.  This is consistent with 'count'
9312      * == 0 meaning the remainder of the union is whatever is left in the
9313      * non-exhausted list. */
9314     if (count != 0) {
9315         len_u = i_u;
9316     }
9317     else {
9318         IV copy_count = len_a - i_a;
9319         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9320             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9321         }
9322         else { /* The non-exhausted input is b */
9323             copy_count = len_b - i_b;
9324             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9325         }
9326         len_u = i_u + copy_count;
9327     }
9328
9329     /* Set the result to the final length, which can change the pointer to
9330      * array_u, so re-find it.  (Note that it is unlikely that this will
9331      * change, as we are shrinking the space, not enlarging it) */
9332     if (len_u != _invlist_len(u)) {
9333         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9334         invlist_trim(u);
9335         array_u = invlist_array(u);
9336     }
9337
9338     if (*output == NULL) {  /* Simply return the new inversion list */
9339         *output = u;
9340     }
9341     else {
9342         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9343          * could instead free '*output', and then set it to 'u', but experience
9344          * has shown [perl #127392] that if the input is a mortal, we can get a
9345          * huge build-up of these during regex compilation before they get
9346          * freed. */
9347         invlist_replace_list_destroys_src(*output, u);
9348         SvREFCNT_dec_NN(u);
9349     }
9350
9351     return;
9352 }
9353
9354 void
9355 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9356                                                const bool complement_b, SV** i)
9357 {
9358     /* Take the intersection of two inversion lists and point '*i' to it.  On
9359      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9360      * even 'a' or 'b').  If to an inversion list, the contents of the original
9361      * list will be replaced by the intersection.  The first list, 'a', may be
9362      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9363      * TRUE, the result will be the intersection of 'a' and the complement (or
9364      * inversion) of 'b' instead of 'b' directly.
9365      *
9366      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9367      * Richard Gillam, published by Addison-Wesley, and explained at some
9368      * length there.  The preface says to incorporate its examples into your
9369      * code at your own risk.  In fact, it had bugs
9370      *
9371      * The algorithm is like a merge sort, and is essentially the same as the
9372      * union above
9373      */
9374
9375     const UV* array_a;          /* a's array */
9376     const UV* array_b;
9377     UV len_a;   /* length of a's array */
9378     UV len_b;
9379
9380     SV* r;                   /* the resulting intersection */
9381     UV* array_r;
9382     UV len_r = 0;
9383
9384     UV i_a = 0;             /* current index into a's array */
9385     UV i_b = 0;
9386     UV i_r = 0;
9387
9388     /* running count of how many of the two inputs are postitioned at ranges
9389      * that are in their sets.  As explained in the algorithm source book,
9390      * items are stopped accumulating and are output when the count changes
9391      * to/from 2.  The count is incremented when we start a range that's in an
9392      * input's set, and decremented when we start a range that's not in a set.
9393      * Only when it is 2 are we in the intersection. */
9394     UV count = 0;
9395
9396     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9397     assert(a != b);
9398     assert(*i == NULL || SvTYPE(*i) == SVt_INVLIST);
9399
9400     /* Special case if either one is empty */
9401     len_a = (a == NULL) ? 0 : _invlist_len(a);
9402     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9403         if (len_a != 0 && complement_b) {
9404
9405             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9406              * must be empty.  Here, also we are using 'b's complement, which
9407              * hence must be every possible code point.  Thus the intersection
9408              * is simply 'a'. */
9409
9410             if (*i == a) {  /* No-op */
9411                 return;
9412             }
9413
9414             if (*i == NULL) {
9415                 *i = invlist_clone(a);
9416                 return;
9417             }
9418
9419             r = invlist_clone(a);
9420             invlist_replace_list_destroys_src(*i, r);
9421             SvREFCNT_dec_NN(r);
9422             return;
9423         }
9424
9425         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9426          * intersection must be empty */
9427         if (*i == NULL) {
9428             *i = _new_invlist(0);
9429             return;
9430         }
9431
9432         invlist_clear(*i);
9433         return;
9434     }
9435
9436     /* Here both lists exist and are non-empty */
9437     array_a = invlist_array(a);
9438     array_b = invlist_array(b);
9439
9440     /* If are to take the intersection of 'a' with the complement of b, set it
9441      * up so are looking at b's complement. */
9442     if (complement_b) {
9443
9444         /* To complement, we invert: if the first element is 0, remove it.  To
9445          * do this, we just pretend the array starts one later */
9446         if (array_b[0] == 0) {
9447             array_b++;
9448             len_b--;
9449         }
9450         else {
9451
9452             /* But if the first element is not zero, we pretend the list starts
9453              * at the 0 that is always stored immediately before the array. */
9454             array_b--;
9455             len_b++;
9456         }
9457     }
9458
9459     /* Size the intersection for the worst case: that the intersection ends up
9460      * fragmenting everything to be completely disjoint */
9461     r= _new_invlist(len_a + len_b);
9462
9463     /* Will contain U+0000 iff both components do */
9464     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9465                                      && len_b > 0 && array_b[0] == 0);
9466
9467     /* Go through each list item by item, stopping when have exhausted one of
9468      * them */
9469     while (i_a < len_a && i_b < len_b) {
9470         UV cp;      /* The element to potentially add to the intersection's
9471                        array */
9472         bool cp_in_set; /* Is it in the input list's set or not */
9473
9474         /* We need to take one or the other of the two inputs for the
9475          * intersection.  Since we are merging two sorted lists, we take the
9476          * smaller of the next items.  In case of a tie, we take first the one
9477          * that is not in its set (a difference from the union algorithm).  If
9478          * we first took the one in its set, it would increment the count,
9479          * possibly to 2 which would cause it to be output as starting a range
9480          * in the intersection, and the next time through we would take that
9481          * same number, and output it again as ending the set.  By doing the
9482          * opposite of this, there is no possibility that the count will be
9483          * momentarily incremented to 2.  (In a tie and both are in the set or
9484          * both not in the set, it doesn't matter which we take first.) */
9485         if (       array_a[i_a] < array_b[i_b]
9486             || (   array_a[i_a] == array_b[i_b]
9487                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9488         {
9489             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9490             cp = array_a[i_a++];
9491         }
9492         else {
9493             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9494             cp= array_b[i_b++];
9495         }
9496
9497         /* Here, have chosen which of the two inputs to look at.  Only output
9498          * if the running count changes to/from 2, which marks the
9499          * beginning/end of a range that's in the intersection */
9500         if (cp_in_set) {
9501             count++;
9502             if (count == 2) {
9503                 array_r[i_r++] = cp;
9504             }
9505         }
9506         else {
9507             if (count == 2) {
9508                 array_r[i_r++] = cp;
9509             }
9510             count--;
9511         }
9512
9513     }
9514
9515     /* The loop above increments the index into exactly one of the input lists
9516      * each iteration, and ends when either index gets to its list end.  That
9517      * means the other index is lower than its end, and so something is
9518      * remaining in that one.  We increment 'count', as explained below, if the
9519      * exhausted list was in its set.  (i_a and i_b each currently index the
9520      * element beyond the one we care about.) */
9521     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9522         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9523     {
9524         count++;
9525     }
9526
9527     /* Above we incremented 'count' if the exhausted list was in its set.  This
9528      * has made it so that 'count' being below 2 means there is nothing left to
9529      * output; otheriwse what's left to add to the intersection is precisely
9530      * that which is left in the non-exhausted input list.
9531      *
9532      * To see why, note first that the exhausted input obviously has nothing
9533      * left to affect the intersection.  If it was in its set at its end, that
9534      * means the set extends from here to the platform's infinity, and hence
9535      * anything in the non-exhausted's list will be in the intersection, and
9536      * anything not in it won't be.  Hence, the rest of the intersection is
9537      * precisely what's in the non-exhausted list  The exhausted set also
9538      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
9539      * it means 'count' is now at least 2.  This is consistent with the
9540      * incremented 'count' being >= 2 means to add the non-exhausted list to
9541      * the intersection.
9542      *
9543      * But if the exhausted input wasn't in its set, it contributed 0 to
9544      * 'count', and the intersection can't include anything further; the
9545      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
9546      * incremented.  This is consistent with 'count' being < 2 meaning nothing
9547      * further to add to the intersection. */
9548     if (count < 2) { /* Nothing left to put in the intersection. */
9549         len_r = i_r;
9550     }
9551     else { /* copy the non-exhausted list, unchanged. */
9552         IV copy_count = len_a - i_a;
9553         if (copy_count > 0) {   /* a is the one with stuff left */
9554             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9555         }
9556         else {  /* b is the one with stuff left */
9557             copy_count = len_b - i_b;
9558             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9559         }
9560         len_r = i_r + copy_count;
9561     }
9562
9563     /* Set the result to the final length, which can change the pointer to
9564      * array_r, so re-find it.  (Note that it is unlikely that this will
9565      * change, as we are shrinking the space, not enlarging it) */
9566     if (len_r != _invlist_len(r)) {
9567         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9568         invlist_trim(r);
9569         array_r = invlist_array(r);
9570     }
9571
9572     if (*i == NULL) { /* Simply return the calculated intersection */
9573         *i = r;
9574     }
9575     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
9576               instead free '*i', and then set it to 'r', but experience has
9577               shown [perl #127392] that if the input is a mortal, we can get a
9578               huge build-up of these during regex compilation before they get
9579               freed. */
9580         if (len_r) {
9581             invlist_replace_list_destroys_src(*i, r);
9582         }
9583         else {
9584             invlist_clear(*i);
9585         }
9586         SvREFCNT_dec_NN(r);
9587     }
9588
9589     return;
9590 }
9591
9592 SV*
9593 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9594 {
9595     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9596      * set.  A pointer to the inversion list is returned.  This may actually be
9597      * a new list, in which case the passed in one has been destroyed.  The
9598      * passed-in inversion list can be NULL, in which case a new one is created
9599      * with just the one range in it.  The new list is not necessarily
9600      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
9601      * result of this function.  The gain would not be large, and in many
9602      * cases, this is called multiple times on a single inversion list, so
9603      * anything freed may almost immediately be needed again.
9604      *
9605      * This used to mostly call the 'union' routine, but that is much more
9606      * heavyweight than really needed for a single range addition */
9607
9608     UV* array;              /* The array implementing the inversion list */
9609     UV len;                 /* How many elements in 'array' */
9610     SSize_t i_s;            /* index into the invlist array where 'start'
9611                                should go */
9612     SSize_t i_e = 0;        /* And the index where 'end' should go */
9613     UV cur_highest;         /* The highest code point in the inversion list
9614                                upon entry to this function */
9615
9616     /* This range becomes the whole inversion list if none already existed */
9617     if (invlist == NULL) {
9618         invlist = _new_invlist(2);
9619         _append_range_to_invlist(invlist, start, end);
9620         return invlist;
9621     }
9622
9623     /* Likewise, if the inversion list is currently empty */
9624     len = _invlist_len(invlist);
9625     if (len == 0) {
9626         _append_range_to_invlist(invlist, start, end);
9627         return invlist;
9628     }
9629
9630     /* Starting here, we have to know the internals of the list */
9631     array = invlist_array(invlist);
9632
9633     /* If the new range ends higher than the current highest ... */
9634     cur_highest = invlist_highest(invlist);
9635     if (end > cur_highest) {
9636
9637         /* If the whole range is higher, we can just append it */
9638         if (start > cur_highest) {
9639             _append_range_to_invlist(invlist, start, end);
9640             return invlist;
9641         }
9642
9643         /* Otherwise, add the portion that is higher ... */
9644         _append_range_to_invlist(invlist, cur_highest + 1, end);
9645
9646         /* ... and continue on below to handle the rest.  As a result of the
9647          * above append, we know that the index of the end of the range is the
9648          * final even numbered one of the array.  Recall that the final element
9649          * always starts a range that extends to infinity.  If that range is in
9650          * the set (meaning the set goes from here to infinity), it will be an
9651          * even index, but if it isn't in the set, it's odd, and the final
9652          * range in the set is one less, which is even. */
9653         if (end == UV_MAX) {
9654             i_e = len;
9655         }
9656         else {
9657             i_e = len - 2;
9658         }
9659     }
9660
9661     /* We have dealt with appending, now see about prepending.  If the new
9662      * range starts lower than the current lowest ... */
9663     if (start < array[0]) {
9664
9665         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
9666          * Let the union code handle it, rather than having to know the
9667          * trickiness in two code places.  */
9668         if (UNLIKELY(start == 0)) {
9669             SV* range_invlist;
9670
9671             range_invlist = _new_invlist(2);
9672             _append_range_to_invlist(range_invlist, start, end);
9673
9674             _invlist_union(invlist, range_invlist, &invlist);
9675
9676             SvREFCNT_dec_NN(range_invlist);
9677
9678             return invlist;
9679         }
9680
9681         /* If the whole new range comes before the first entry, and doesn't
9682          * extend it, we have to insert it as an additional range */
9683         if (end < array[0] - 1) {
9684             i_s = i_e = -1;
9685             goto splice_in_new_range;
9686         }
9687
9688         /* Here the new range adjoins the existing first range, extending it
9689          * downwards. */
9690         array[0] = start;
9691
9692         /* And continue on below to handle the rest.  We know that the index of
9693          * the beginning of the range is the first one of the array */
9694         i_s = 0;
9695     }
9696     else { /* Not prepending any part of the new range to the existing list.
9697             * Find where in the list it should go.  This finds i_s, such that:
9698             *     invlist[i_s] <= start < array[i_s+1]
9699             */
9700         i_s = _invlist_search(invlist, start);
9701     }
9702
9703     /* At this point, any extending before the beginning of the inversion list
9704      * and/or after the end has been done.  This has made it so that, in the
9705      * code below, each endpoint of the new range is either in a range that is
9706      * in the set, or is in a gap between two ranges that are.  This means we
9707      * don't have to worry about exceeding the array bounds.
9708      *
9709      * Find where in the list the new range ends (but we can skip this if we
9710      * have already determined what it is, or if it will be the same as i_s,
9711      * which we already have computed) */
9712     if (i_e == 0) {
9713         i_e = (start == end)
9714               ? i_s
9715               : _invlist_search(invlist, end);
9716     }
9717
9718     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
9719      * is a range that goes to infinity there is no element at invlist[i_e+1],
9720      * so only the first relation holds. */
9721
9722     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
9723
9724         /* Here, the ranges on either side of the beginning of the new range
9725          * are in the set, and this range starts in the gap between them.
9726          *
9727          * The new range extends the range above it downwards if the new range
9728          * ends at or above that range's start */
9729         const bool extends_the_range_above = (   end == UV_MAX
9730                                               || end + 1 >= array[i_s+1]);
9731
9732         /* The new range extends the range below it upwards if it begins just
9733          * after where that range ends */
9734         if (start == array[i_s]) {
9735
9736             /* If the new range fills the entire gap between the other ranges,
9737              * they will get merged together.  Other ranges may also get
9738              * merged, depending on how many of them the new range spans.  In
9739              * the general case, we do the merge later, just once, after we
9740              * figure out how many to merge.  But in the case where the new
9741              * range exactly spans just this one gap (possibly extending into
9742              * the one above), we do the merge here, and an early exit.  This
9743              * is done here to avoid having to special case later. */
9744             if (i_e - i_s <= 1) {
9745
9746                 /* If i_e - i_s == 1, it means that the new range terminates
9747                  * within the range above, and hence 'extends_the_range_above'
9748                  * must be true.  (If the range above it extends to infinity,
9749                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
9750                  * will be 0, so no harm done.) */
9751                 if (extends_the_range_above) {
9752                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
9753                     invlist_set_len(invlist,
9754                                     len - 2,
9755                                     *(get_invlist_offset_addr(invlist)));
9756                     return invlist;
9757                 }
9758
9759                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
9760                  * to the same range, and below we are about to decrement i_s
9761                  * */
9762                 i_e--;
9763             }
9764
9765             /* Here, the new range is adjacent to the one below.  (It may also
9766              * span beyond the range above, but that will get resolved later.)
9767              * Extend the range below to include this one. */
9768             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
9769             i_s--;
9770             start = array[i_s];
9771         }
9772         else if (extends_the_range_above) {
9773
9774             /* Here the new range only extends the range above it, but not the
9775              * one below.  It merges with the one above.  Again, we keep i_e
9776              * and i_s in sync if they point to the same range */
9777             if (i_e == i_s) {
9778                 i_e++;
9779             }
9780             i_s++;
9781             array[i_s] = start;
9782         }
9783     }
9784
9785     /* Here, we've dealt with the new range start extending any adjoining
9786      * existing ranges.
9787      *
9788      * If the new range extends to infinity, it is now the final one,
9789      * regardless of what was there before */
9790     if (UNLIKELY(end == UV_MAX)) {
9791         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
9792         return invlist;
9793     }
9794
9795     /* If i_e started as == i_s, it has also been dealt with,
9796      * and been updated to the new i_s, which will fail the following if */
9797     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
9798
9799         /* Here, the ranges on either side of the end of the new range are in
9800          * the set, and this range ends in the gap between them.
9801          *
9802          * If this range is adjacent to (hence extends) the range above it, it
9803          * becomes part of that range; likewise if it extends the range below,
9804          * it becomes part of that range */
9805         if (end + 1 == array[i_e+1]) {
9806             i_e++;
9807             array[i_e] = start;
9808         }
9809         else if (start <= array[i_e]) {
9810             array[i_e] = end + 1;
9811             i_e--;
9812         }
9813     }
9814
9815     if (i_s == i_e) {
9816
9817         /* If the range fits entirely in an existing range (as possibly already
9818          * extended above), it doesn't add anything new */
9819         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
9820             return invlist;
9821         }
9822
9823         /* Here, no part of the range is in the list.  Must add it.  It will
9824          * occupy 2 more slots */
9825       splice_in_new_range:
9826
9827         invlist_extend(invlist, len + 2);
9828         array = invlist_array(invlist);
9829         /* Move the rest of the array down two slots. Don't include any
9830          * trailing NUL */
9831         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
9832
9833         /* Do the actual splice */
9834         array[i_e+1] = start;
9835         array[i_e+2] = end + 1;
9836         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
9837         return invlist;
9838     }
9839
9840     /* Here the new range crossed the boundaries of a pre-existing range.  The
9841      * code above has adjusted things so that both ends are in ranges that are
9842      * in the set.  This means everything in between must also be in the set.
9843      * Just squash things together */
9844     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
9845     invlist_set_len(invlist,
9846                     len - i_e + i_s,
9847                     *(get_invlist_offset_addr(invlist)));
9848
9849     return invlist;
9850 }
9851
9852 SV*
9853 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9854                                  UV** other_elements_ptr)
9855 {
9856     /* Create and return an inversion list whose contents are to be populated
9857      * by the caller.  The caller gives the number of elements (in 'size') and
9858      * the very first element ('element0').  This function will set
9859      * '*other_elements_ptr' to an array of UVs, where the remaining elements
9860      * are to be placed.
9861      *
9862      * Obviously there is some trust involved that the caller will properly
9863      * fill in the other elements of the array.
9864      *
9865      * (The first element needs to be passed in, as the underlying code does
9866      * things differently depending on whether it is zero or non-zero) */
9867
9868     SV* invlist = _new_invlist(size);
9869     bool offset;
9870
9871     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9872
9873     invlist = add_cp_to_invlist(invlist, element0);
9874     offset = *get_invlist_offset_addr(invlist);
9875
9876     invlist_set_len(invlist, size, offset);
9877     *other_elements_ptr = invlist_array(invlist) + 1;
9878     return invlist;
9879 }
9880
9881 #endif
9882
9883 PERL_STATIC_INLINE SV*
9884 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9885     return _add_range_to_invlist(invlist, cp, cp);
9886 }
9887
9888 #ifndef PERL_IN_XSUB_RE
9889 void
9890 Perl__invlist_invert(pTHX_ SV* const invlist)
9891 {
9892     /* Complement the input inversion list.  This adds a 0 if the list didn't
9893      * have a zero; removes it otherwise.  As described above, the data
9894      * structure is set up so that this is very efficient */
9895
9896     PERL_ARGS_ASSERT__INVLIST_INVERT;
9897
9898     assert(! invlist_is_iterating(invlist));
9899
9900     /* The inverse of matching nothing is matching everything */
9901     if (_invlist_len(invlist) == 0) {
9902         _append_range_to_invlist(invlist, 0, UV_MAX);
9903         return;
9904     }
9905
9906     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9907 }
9908
9909 #endif
9910
9911 PERL_STATIC_INLINE SV*
9912 S_invlist_clone(pTHX_ SV* const invlist)
9913 {
9914
9915     /* Return a new inversion list that is a copy of the input one, which is
9916      * unchanged.  The new list will not be mortal even if the old one was. */
9917
9918     /* Need to allocate extra space to accommodate Perl's addition of a
9919      * trailing NUL to SvPV's, since it thinks they are always strings */
9920     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9921     STRLEN physical_length = SvCUR(invlist);
9922     bool offset = *(get_invlist_offset_addr(invlist));
9923
9924     PERL_ARGS_ASSERT_INVLIST_CLONE;
9925
9926     *(get_invlist_offset_addr(new_invlist)) = offset;
9927     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9928     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9929
9930     return new_invlist;
9931 }
9932
9933 PERL_STATIC_INLINE STRLEN*
9934 S_get_invlist_iter_addr(SV* invlist)
9935 {
9936     /* Return the address of the UV that contains the current iteration
9937      * position */
9938
9939     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9940
9941     assert(SvTYPE(invlist) == SVt_INVLIST);
9942
9943     return &(((XINVLIST*) SvANY(invlist))->iterator);
9944 }
9945
9946 PERL_STATIC_INLINE void
9947 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9948 {
9949     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9950
9951     *get_invlist_iter_addr(invlist) = 0;
9952 }
9953
9954 PERL_STATIC_INLINE void
9955 S_invlist_iterfinish(SV* invlist)
9956 {
9957     /* Terminate iterator for invlist.  This is to catch development errors.
9958      * Any iteration that is interrupted before completed should call this
9959      * function.  Functions that add code points anywhere else but to the end
9960      * of an inversion list assert that they are not in the middle of an
9961      * iteration.  If they were, the addition would make the iteration
9962      * problematical: if the iteration hadn't reached the place where things
9963      * were being added, it would be ok */
9964
9965     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9966
9967     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9968 }
9969
9970 STATIC bool
9971 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9972 {
9973     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9974      * This call sets in <*start> and <*end>, the next range in <invlist>.
9975      * Returns <TRUE> if successful and the next call will return the next
9976      * range; <FALSE> if was already at the end of the list.  If the latter,
9977      * <*start> and <*end> are unchanged, and the next call to this function
9978      * will start over at the beginning of the list */
9979
9980     STRLEN* pos = get_invlist_iter_addr(invlist);
9981     UV len = _invlist_len(invlist);
9982     UV *array;
9983
9984     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9985
9986     if (*pos >= len) {
9987         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9988         return FALSE;
9989     }
9990
9991     array = invlist_array(invlist);
9992
9993     *start = array[(*pos)++];
9994
9995     if (*pos >= len) {
9996         *end = UV_MAX;
9997     }
9998     else {
9999         *end = array[(*pos)++] - 1;
10000     }
10001
10002     return TRUE;
10003 }
10004
10005 PERL_STATIC_INLINE UV
10006 S_invlist_highest(SV* const invlist)
10007 {
10008     /* Returns the highest code point that matches an inversion list.  This API
10009      * has an ambiguity, as it returns 0 under either the highest is actually
10010      * 0, or if the list is empty.  If this distinction matters to you, check
10011      * for emptiness before calling this function */
10012
10013     UV len = _invlist_len(invlist);
10014     UV *array;
10015
10016     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
10017
10018     if (len == 0) {
10019         return 0;
10020     }
10021
10022     array = invlist_array(invlist);
10023
10024     /* The last element in the array in the inversion list always starts a
10025      * range that goes to infinity.  That range may be for code points that are
10026      * matched in the inversion list, or it may be for ones that aren't
10027      * matched.  In the latter case, the highest code point in the set is one
10028      * less than the beginning of this range; otherwise it is the final element
10029      * of this range: infinity */
10030     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
10031            ? UV_MAX
10032            : array[len - 1] - 1;
10033 }
10034
10035 STATIC SV *
10036 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10037 {
10038     /* Get the contents of an inversion list into a string SV so that they can
10039      * be printed out.  If 'traditional_style' is TRUE, it uses the format
10040      * traditionally done for debug tracing; otherwise it uses a format
10041      * suitable for just copying to the output, with blanks between ranges and
10042      * a dash between range components */
10043
10044     UV start, end;
10045     SV* output;
10046     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10047     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10048
10049     if (traditional_style) {
10050         output = newSVpvs("\n");
10051     }
10052     else {
10053         output = newSVpvs("");
10054     }
10055
10056     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10057
10058     assert(! invlist_is_iterating(invlist));
10059
10060     invlist_iterinit(invlist);
10061     while (invlist_iternext(invlist, &start, &end)) {
10062         if (end == UV_MAX) {
10063             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFINITY%c",
10064                                           start, intra_range_delimiter,
10065                                                  inter_range_delimiter);
10066         }
10067         else if (end != start) {
10068             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10069                                           start,
10070                                                    intra_range_delimiter,
10071                                                   end, inter_range_delimiter);
10072         }
10073         else {
10074             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10075                                           start, inter_range_delimiter);
10076         }
10077     }
10078
10079     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10080         SvCUR_set(output, SvCUR(output) - 1);
10081     }
10082
10083     return output;
10084 }
10085
10086 #ifndef PERL_IN_XSUB_RE
10087 void
10088 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10089                          const char * const indent, SV* const invlist)
10090 {
10091     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10092      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10093      * the string 'indent'.  The output looks like this:
10094          [0] 0x000A .. 0x000D
10095          [2] 0x0085
10096          [4] 0x2028 .. 0x2029
10097          [6] 0x3104 .. INFINITY
10098      * This means that the first range of code points matched by the list are
10099      * 0xA through 0xD; the second range contains only the single code point
10100      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10101      * are used to define each range (except if the final range extends to
10102      * infinity, only a single element is needed).  The array index of the
10103      * first element for the corresponding range is given in brackets. */
10104
10105     UV start, end;
10106     STRLEN count = 0;
10107
10108     PERL_ARGS_ASSERT__INVLIST_DUMP;
10109
10110     if (invlist_is_iterating(invlist)) {
10111         Perl_dump_indent(aTHX_ level, file,
10112              "%sCan't dump inversion list because is in middle of iterating\n",
10113              indent);
10114         return;
10115     }
10116
10117     invlist_iterinit(invlist);
10118     while (invlist_iternext(invlist, &start, &end)) {
10119         if (end == UV_MAX) {
10120             Perl_dump_indent(aTHX_ level, file,
10121                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFINITY\n",
10122                                    indent, (UV)count, start);
10123         }
10124         else if (end != start) {
10125             Perl_dump_indent(aTHX_ level, file,
10126                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10127                                 indent, (UV)count, start,         end);
10128         }
10129         else {
10130             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10131                                             indent, (UV)count, start);
10132         }
10133         count += 2;
10134     }
10135 }
10136
10137 void
10138 Perl__load_PL_utf8_foldclosures (pTHX)
10139 {
10140     assert(! PL_utf8_foldclosures);
10141
10142     /* If the folds haven't been read in, call a fold function
10143      * to force that */
10144     if (! PL_utf8_tofold) {
10145         U8 dummy[UTF8_MAXBYTES_CASE+1];
10146         const U8 hyphen[] = HYPHEN_UTF8;
10147
10148         /* This string is just a short named one above \xff */
10149         toFOLD_utf8_safe(hyphen, hyphen + sizeof(hyphen) - 1, dummy, NULL);
10150         assert(PL_utf8_tofold); /* Verify that worked */
10151     }
10152     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10153 }
10154 #endif
10155
10156 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10157 bool
10158 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10159 {
10160     /* Return a boolean as to if the two passed in inversion lists are
10161      * identical.  The final argument, if TRUE, says to take the complement of
10162      * the second inversion list before doing the comparison */
10163
10164     const UV* array_a = invlist_array(a);
10165     const UV* array_b = invlist_array(b);
10166     UV len_a = _invlist_len(a);
10167     UV len_b = _invlist_len(b);
10168
10169     PERL_ARGS_ASSERT__INVLISTEQ;
10170
10171     /* If are to compare 'a' with the complement of b, set it
10172      * up so are looking at b's complement. */
10173     if (complement_b) {
10174
10175         /* The complement of nothing is everything, so <a> would have to have
10176          * just one element, starting at zero (ending at infinity) */
10177         if (len_b == 0) {
10178             return (len_a == 1 && array_a[0] == 0);
10179         }
10180         else if (array_b[0] == 0) {
10181
10182             /* Otherwise, to complement, we invert.  Here, the first element is
10183              * 0, just remove it.  To do this, we just pretend the array starts
10184              * one later */
10185
10186             array_b++;
10187             len_b--;
10188         }
10189         else {
10190
10191             /* But if the first element is not zero, we pretend the list starts
10192              * at the 0 that is always stored immediately before the array. */
10193             array_b--;
10194             len_b++;
10195         }
10196     }
10197
10198     return    len_a == len_b
10199            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10200
10201 }
10202 #endif
10203
10204 /*
10205  * As best we can, determine the characters that can match the start of
10206  * the given EXACTF-ish node.
10207  *
10208  * Returns the invlist as a new SV*; it is the caller's responsibility to
10209  * call SvREFCNT_dec() when done with it.
10210  */
10211 STATIC SV*
10212 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10213 {
10214     const U8 * s = (U8*)STRING(node);
10215     SSize_t bytelen = STR_LEN(node);
10216     UV uc;
10217     /* Start out big enough for 2 separate code points */
10218     SV* invlist = _new_invlist(4);
10219
10220     PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
10221
10222     if (! UTF) {
10223         uc = *s;
10224
10225         /* We punt and assume can match anything if the node begins
10226          * with a multi-character fold.  Things are complicated.  For
10227          * example, /ffi/i could match any of:
10228          *  "\N{LATIN SMALL LIGATURE FFI}"
10229          *  "\N{LATIN SMALL LIGATURE FF}I"
10230          *  "F\N{LATIN SMALL LIGATURE FI}"
10231          *  plus several other things; and making sure we have all the
10232          *  possibilities is hard. */
10233         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10234             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10235         }
10236         else {
10237             /* Any Latin1 range character can potentially match any
10238              * other depending on the locale */
10239             if (OP(node) == EXACTFL) {
10240                 _invlist_union(invlist, PL_Latin1, &invlist);
10241             }
10242             else {
10243                 /* But otherwise, it matches at least itself.  We can
10244                  * quickly tell if it has a distinct fold, and if so,
10245                  * it matches that as well */
10246                 invlist = add_cp_to_invlist(invlist, uc);
10247                 if (IS_IN_SOME_FOLD_L1(uc))
10248                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10249             }
10250
10251             /* Some characters match above-Latin1 ones under /i.  This
10252              * is true of EXACTFL ones when the locale is UTF-8 */
10253             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10254                 && (! isASCII(uc) || (OP(node) != EXACTFA
10255                                     && OP(node) != EXACTFA_NO_TRIE)))
10256             {
10257                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10258             }
10259         }
10260     }
10261     else {  /* Pattern is UTF-8 */
10262         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10263         STRLEN foldlen = UTF8SKIP(s);
10264         const U8* e = s + bytelen;
10265         SV** listp;
10266
10267         uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10268
10269         /* The only code points that aren't folded in a UTF EXACTFish
10270          * node are are the problematic ones in EXACTFL nodes */
10271         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10272             /* We need to check for the possibility that this EXACTFL
10273              * node begins with a multi-char fold.  Therefore we fold
10274              * the first few characters of it so that we can make that
10275              * check */
10276             U8 *d = folded;
10277             int i;
10278
10279             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10280                 if (isASCII(*s)) {
10281                     *(d++) = (U8) toFOLD(*s);
10282                     s++;
10283                 }
10284                 else {
10285                     STRLEN len;
10286                     toFOLD_utf8_safe(s, e, d, &len);
10287                     d += len;
10288                     s += UTF8SKIP(s);
10289                 }
10290             }
10291
10292             /* And set up so the code below that looks in this folded
10293              * buffer instead of the node's string */
10294             e = d;
10295             foldlen = UTF8SKIP(folded);
10296             s = folded;
10297         }
10298
10299         /* When we reach here 's' points to the fold of the first
10300          * character(s) of the node; and 'e' points to far enough along
10301          * the folded string to be just past any possible multi-char
10302          * fold. 'foldlen' is the length in bytes of the first
10303          * character in 's'
10304          *
10305          * Unlike the non-UTF-8 case, the macro for determining if a
10306          * string is a multi-char fold requires all the characters to
10307          * already be folded.  This is because of all the complications
10308          * if not.  Note that they are folded anyway, except in EXACTFL
10309          * nodes.  Like the non-UTF case above, we punt if the node
10310          * begins with a multi-char fold  */
10311
10312         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10313             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10314         }
10315         else {  /* Single char fold */
10316
10317             /* It matches all the things that fold to it, which are
10318              * found in PL_utf8_foldclosures (including itself) */
10319             invlist = add_cp_to_invlist(invlist, uc);
10320             if (! PL_utf8_foldclosures)
10321                 _load_PL_utf8_foldclosures();
10322             if ((listp = hv_fetch(PL_utf8_foldclosures,
10323                                 (char *) s, foldlen, FALSE)))
10324             {
10325                 AV* list = (AV*) *listp;
10326                 IV k;
10327                 for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
10328                     SV** c_p = av_fetch(list, k, FALSE);
10329                     UV c;
10330                     assert(c_p);
10331
10332                     c = SvUV(*c_p);
10333
10334                     /* /aa doesn't allow folds between ASCII and non- */
10335                     if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
10336                         && isASCII(c) != isASCII(uc))
10337                     {
10338                         continue;
10339                     }
10340
10341                     invlist = add_cp_to_invlist(invlist, c);
10342                 }
10343             }
10344         }
10345     }
10346
10347     return invlist;
10348 }
10349
10350 #undef HEADER_LENGTH
10351 #undef TO_INTERNAL_SIZE
10352 #undef FROM_INTERNAL_SIZE
10353 #undef INVLIST_VERSION_ID
10354
10355 /* End of inversion list object */
10356
10357 STATIC void
10358 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10359 {
10360     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10361      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10362      * should point to the first flag; it is updated on output to point to the
10363      * final ')' or ':'.  There needs to be at least one flag, or this will
10364      * abort */
10365
10366     /* for (?g), (?gc), and (?o) warnings; warning
10367        about (?c) will warn about (?g) -- japhy    */
10368
10369 #define WASTED_O  0x01
10370 #define WASTED_G  0x02
10371 #define WASTED_C  0x04
10372 #define WASTED_GC (WASTED_G|WASTED_C)
10373     I32 wastedflags = 0x00;
10374     U32 posflags = 0, negflags = 0;
10375     U32 *flagsp = &posflags;
10376     char has_charset_modifier = '\0';
10377     regex_charset cs;
10378     bool has_use_defaults = FALSE;
10379     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10380     int x_mod_count = 0;
10381
10382     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10383
10384     /* '^' as an initial flag sets certain defaults */
10385     if (UCHARAT(RExC_parse) == '^') {
10386         RExC_parse++;
10387         has_use_defaults = TRUE;
10388         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10389         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
10390                                         ? REGEX_UNICODE_CHARSET
10391                                         : REGEX_DEPENDS_CHARSET);
10392     }
10393
10394     cs = get_regex_charset(RExC_flags);
10395     if (cs == REGEX_DEPENDS_CHARSET
10396         && (RExC_utf8 || RExC_uni_semantics))
10397     {
10398         cs = REGEX_UNICODE_CHARSET;
10399     }
10400
10401     while (RExC_parse < RExC_end) {
10402         /* && strchr("iogcmsx", *RExC_parse) */
10403         /* (?g), (?gc) and (?o) are useless here
10404            and must be globally applied -- japhy */
10405         switch (*RExC_parse) {
10406
10407             /* Code for the imsxn flags */
10408             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10409
10410             case LOCALE_PAT_MOD:
10411                 if (has_charset_modifier) {
10412                     goto excess_modifier;
10413                 }
10414                 else if (flagsp == &negflags) {
10415                     goto neg_modifier;
10416                 }
10417                 cs = REGEX_LOCALE_CHARSET;
10418                 has_charset_modifier = LOCALE_PAT_MOD;
10419                 break;
10420             case UNICODE_PAT_MOD:
10421                 if (has_charset_modifier) {
10422                     goto excess_modifier;
10423                 }
10424                 else if (flagsp == &negflags) {
10425                     goto neg_modifier;
10426                 }
10427                 cs = REGEX_UNICODE_CHARSET;
10428                 has_charset_modifier = UNICODE_PAT_MOD;
10429                 break;
10430             case ASCII_RESTRICT_PAT_MOD:
10431                 if (flagsp == &negflags) {
10432                     goto neg_modifier;
10433                 }
10434                 if (has_charset_modifier) {
10435                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10436                         goto excess_modifier;
10437                     }
10438                     /* Doubled modifier implies more restricted */
10439                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10440                 }
10441                 else {
10442                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10443                 }
10444                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10445                 break;
10446             case DEPENDS_PAT_MOD:
10447                 if (has_use_defaults) {
10448                     goto fail_modifiers;
10449                 }
10450                 else if (flagsp == &negflags) {
10451                     goto neg_modifier;
10452                 }
10453                 else if (has_charset_modifier) {
10454                     goto excess_modifier;
10455                 }
10456
10457                 /* The dual charset means unicode semantics if the
10458                  * pattern (or target, not known until runtime) are
10459                  * utf8, or something in the pattern indicates unicode
10460                  * semantics */
10461                 cs = (RExC_utf8 || RExC_uni_semantics)
10462                      ? REGEX_UNICODE_CHARSET
10463                      : REGEX_DEPENDS_CHARSET;
10464                 has_charset_modifier = DEPENDS_PAT_MOD;
10465                 break;
10466               excess_modifier:
10467                 RExC_parse++;
10468                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10469                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10470                 }
10471                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10472                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10473                                         *(RExC_parse - 1));
10474                 }
10475                 else {
10476                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10477                 }
10478                 NOT_REACHED; /*NOTREACHED*/
10479               neg_modifier:
10480                 RExC_parse++;
10481                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10482                                     *(RExC_parse - 1));
10483                 NOT_REACHED; /*NOTREACHED*/
10484             case ONCE_PAT_MOD: /* 'o' */
10485             case GLOBAL_PAT_MOD: /* 'g' */
10486                 if (PASS2 && ckWARN(WARN_REGEXP)) {
10487                     const I32 wflagbit = *RExC_parse == 'o'
10488                                          ? WASTED_O
10489                                          : WASTED_G;
10490                     if (! (wastedflags & wflagbit) ) {
10491                         wastedflags |= wflagbit;
10492                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10493                         vWARN5(
10494                             RExC_parse + 1,
10495                             "Useless (%s%c) - %suse /%c modifier",
10496                             flagsp == &negflags ? "?-" : "?",
10497                             *RExC_parse,
10498                             flagsp == &negflags ? "don't " : "",
10499                             *RExC_parse
10500                         );
10501                     }
10502                 }
10503                 break;
10504
10505             case CONTINUE_PAT_MOD: /* 'c' */
10506                 if (PASS2 && ckWARN(WARN_REGEXP)) {
10507                     if (! (wastedflags & WASTED_C) ) {
10508                         wastedflags |= WASTED_GC;
10509                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10510                         vWARN3(
10511                             RExC_parse + 1,
10512                             "Useless (%sc) - %suse /gc modifier",
10513                             flagsp == &negflags ? "?-" : "?",
10514                             flagsp == &negflags ? "don't " : ""
10515                         );
10516                     }
10517                 }
10518                 break;
10519             case KEEPCOPY_PAT_MOD: /* 'p' */
10520                 if (flagsp == &negflags) {
10521                     if (PASS2)
10522                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10523                 } else {
10524                     *flagsp |= RXf_PMf_KEEPCOPY;
10525                 }
10526                 break;
10527             case '-':
10528                 /* A flag is a default iff it is following a minus, so
10529                  * if there is a minus, it means will be trying to
10530                  * re-specify a default which is an error */
10531                 if (has_use_defaults || flagsp == &negflags) {
10532                     goto fail_modifiers;
10533                 }
10534                 flagsp = &negflags;
10535                 wastedflags = 0;  /* reset so (?g-c) warns twice */
10536                 x_mod_count = 0;
10537                 break;
10538             case ':':
10539             case ')':
10540
10541                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10542                     negflags |= RXf_PMf_EXTENDED_MORE;
10543                 }
10544                 RExC_flags |= posflags;
10545
10546                 if (negflags & RXf_PMf_EXTENDED) {
10547                     negflags |= RXf_PMf_EXTENDED_MORE;
10548                 }
10549                 RExC_flags &= ~negflags;
10550                 set_regex_charset(&RExC_flags, cs);
10551
10552                 return;
10553             default:
10554               fail_modifiers:
10555                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10556                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10557                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10558                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10559                 NOT_REACHED; /*NOTREACHED*/
10560         }
10561
10562         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10563     }
10564
10565     vFAIL("Sequence (?... not terminated");
10566 }
10567
10568 /*
10569  - reg - regular expression, i.e. main body or parenthesized thing
10570  *
10571  * Caller must absorb opening parenthesis.
10572  *
10573  * Combining parenthesis handling with the base level of regular expression
10574  * is a trifle forced, but the need to tie the tails of the branches to what
10575  * follows makes it hard to avoid.
10576  */
10577 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10578 #ifdef DEBUGGING
10579 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10580 #else
10581 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10582 #endif
10583
10584 PERL_STATIC_INLINE regnode *
10585 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10586                              I32 *flagp,
10587                              char * parse_start,
10588                              char ch
10589                       )
10590 {
10591     regnode *ret;
10592     char* name_start = RExC_parse;
10593     U32 num = 0;
10594     SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
10595                                             ? REG_RSN_RETURN_NULL
10596                                             : REG_RSN_RETURN_DATA);
10597     GET_RE_DEBUG_FLAGS_DECL;
10598
10599     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10600
10601     if (RExC_parse == name_start || *RExC_parse != ch) {
10602         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
10603         vFAIL2("Sequence %.3s... not terminated",parse_start);
10604     }
10605
10606     if (!SIZE_ONLY) {
10607         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10608         RExC_rxi->data->data[num]=(void*)sv_dat;
10609         SvREFCNT_inc_simple_void(sv_dat);
10610     }
10611     RExC_sawback = 1;
10612     ret = reganode(pRExC_state,
10613                    ((! FOLD)
10614                      ? NREF
10615                      : (ASCII_FOLD_RESTRICTED)
10616                        ? NREFFA
10617                        : (AT_LEAST_UNI_SEMANTICS)
10618                          ? NREFFU
10619                          : (LOC)
10620                            ? NREFFL
10621                            : NREFF),
10622                     num);
10623     *flagp |= HASWIDTH;
10624
10625     Set_Node_Offset(ret, parse_start+1);
10626     Set_Node_Cur_Length(ret, parse_start);
10627
10628     nextchar(pRExC_state);
10629     return ret;
10630 }
10631
10632 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
10633    flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
10634    needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
10635    upgraded to UTF-8.  Otherwise would only return NULL if regbranch() returns
10636    NULL, which cannot happen.  */
10637 STATIC regnode *
10638 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
10639     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
10640      * 2 is like 1, but indicates that nextchar() has been called to advance
10641      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
10642      * this flag alerts us to the need to check for that */
10643 {
10644     regnode *ret;               /* Will be the head of the group. */
10645     regnode *br;
10646     regnode *lastbr;
10647     regnode *ender = NULL;
10648     I32 parno = 0;
10649     I32 flags;
10650     U32 oregflags = RExC_flags;
10651     bool have_branch = 0;
10652     bool is_open = 0;
10653     I32 freeze_paren = 0;
10654     I32 after_freeze = 0;
10655     I32 num; /* numeric backreferences */
10656
10657     char * parse_start = RExC_parse; /* MJD */
10658     char * const oregcomp_parse = RExC_parse;
10659
10660     GET_RE_DEBUG_FLAGS_DECL;
10661
10662     PERL_ARGS_ASSERT_REG;
10663     DEBUG_PARSE("reg ");
10664
10665     *flagp = 0;                         /* Tentatively. */
10666
10667     /* Having this true makes it feasible to have a lot fewer tests for the
10668      * parse pointer being in scope.  For example, we can write
10669      *      while(isFOO(*RExC_parse)) RExC_parse++;
10670      * instead of
10671      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
10672      */
10673     assert(*RExC_end == '\0');
10674
10675     /* Make an OPEN node, if parenthesized. */
10676     if (paren) {
10677
10678         /* Under /x, space and comments can be gobbled up between the '(' and
10679          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
10680          * intervening space, as the sequence is a token, and a token should be
10681          * indivisible */
10682         bool has_intervening_patws = (paren == 2 || paren == 's')
10683                                   && *(RExC_parse - 1) != '(';
10684
10685         if (RExC_parse >= RExC_end) {
10686             vFAIL("Unmatched (");
10687         }
10688
10689         if (paren == 's') {
10690
10691             /* A nested script run  is a no-op besides clustering */
10692             if (RExC_in_script_run) {
10693                 paren = ':';
10694                 nextchar(pRExC_state);
10695                 ret = NULL;
10696                 goto parse_rest;
10697             }
10698             RExC_in_script_run = 1;
10699
10700             ret = reg_node(pRExC_state, SROPEN);
10701             is_open = 1;
10702         }
10703         else if ( *RExC_parse == '*') { /* (*VERB:ARG) */
10704             char *start_verb = RExC_parse + 1;
10705             STRLEN verb_len;
10706             char *start_arg = NULL;
10707             unsigned char op = 0;
10708             int arg_required = 0;
10709             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
10710
10711             if (has_intervening_patws) {
10712                 RExC_parse++;   /* past the '*' */
10713                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
10714             }
10715             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
10716                 if ( *RExC_parse == ':' ) {
10717                     start_arg = RExC_parse + 1;
10718                     break;
10719                 }
10720                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10721             }
10722             verb_len = RExC_parse - start_verb;
10723             if ( start_arg ) {
10724                 if (RExC_parse >= RExC_end) {
10725                     goto unterminated_verb_pattern;
10726                 }
10727
10728                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10729                 while ( RExC_parse < RExC_end && *RExC_parse != ')' )
10730                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10731                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10732                   unterminated_verb_pattern:
10733                     vFAIL("Unterminated verb pattern argument");
10734                 if ( RExC_parse == start_arg )
10735                     start_arg = NULL;
10736             } else {
10737                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10738                     vFAIL("Unterminated verb pattern");
10739             }
10740
10741             /* Here, we know that RExC_parse < RExC_end */
10742
10743             switch ( *start_verb ) {
10744             case 'A':  /* (*ACCEPT) */
10745                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
10746                     op = ACCEPT;
10747                     internal_argval = RExC_nestroot;
10748                 }
10749                 break;
10750             case 'C':  /* (*COMMIT) */
10751                 if ( memEQs(start_verb,verb_len,"COMMIT") )
10752                     op = COMMIT;
10753                 break;
10754             case 'F':  /* (*FAIL) */
10755                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
10756                     op = OPFAIL;
10757                 }
10758                 break;
10759             case ':':  /* (*:NAME) */
10760             case 'M':  /* (*MARK:NAME) */
10761                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
10762                     op = MARKPOINT;
10763                     arg_required = 1;
10764                 }
10765                 break;
10766             case 'P':  /* (*PRUNE) */
10767                 if ( memEQs(start_verb,verb_len,"PRUNE") )
10768                     op = PRUNE;
10769                 break;
10770             case 'S':   /* (*SKIP) */
10771                 if ( memEQs(start_verb,verb_len,"SKIP") )
10772                     op = SKIP;
10773                 break;
10774             case 'T':  /* (*THEN) */
10775                 /* [19:06] <TimToady> :: is then */
10776                 if ( memEQs(start_verb,verb_len,"THEN") ) {
10777                     op = CUTGROUP;
10778                     RExC_seen |= REG_CUTGROUP_SEEN;
10779                 }
10780                 break;
10781             }
10782             if ( ! op ) {
10783                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10784                 vFAIL2utf8f(
10785                     "Unknown verb pattern '%" UTF8f "'",
10786                     UTF8fARG(UTF, verb_len, start_verb));
10787             }
10788             if ( arg_required && !start_arg ) {
10789                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
10790                     verb_len, start_verb);
10791             }
10792             if (internal_argval == -1) {
10793                 ret = reganode(pRExC_state, op, 0);
10794             } else {
10795                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
10796             }
10797             RExC_seen |= REG_VERBARG_SEEN;
10798             if ( ! SIZE_ONLY ) {
10799                 if (start_arg) {
10800                     SV *sv = newSVpvn( start_arg,
10801                                        RExC_parse - start_arg);
10802                     ARG(ret) = add_data( pRExC_state,
10803                                          STR_WITH_LEN("S"));
10804                     RExC_rxi->data->data[ARG(ret)]=(void*)sv;
10805                     ret->flags = 1;
10806                 } else {
10807                     ret->flags = 0;
10808                 }
10809                 if ( internal_argval != -1 )
10810                     ARG2L_SET(ret, internal_argval);
10811             }
10812             nextchar(pRExC_state);
10813             return ret;
10814         }
10815         else if (*RExC_parse == '+') { /* (+...) */
10816             RExC_parse++;
10817
10818             if (has_intervening_patws) {
10819                 /* XXX Note that a potential gotcha is that outside of /x '( +
10820                  * ...)' means to match a space at least once ...   This is a
10821                  * problem elsewhere too */
10822                 vFAIL("In '(+...)', the '(' and '+' must be adjacent");
10823             }
10824
10825             if (! memBEGINPs(RExC_parse, (STRLEN) (RExC_end - RExC_parse),
10826                              "script_run:"))
10827             {
10828                 RExC_parse += strcspn(RExC_parse, ":)");
10829                 vFAIL("Unknown (+ pattern");
10830             }
10831             else {
10832
10833                 /* This indicates Unicode rules. */
10834                 REQUIRE_UNI_RULES(flagp, NULL);
10835
10836                 RExC_parse += sizeof("script_run:") - 1;
10837
10838                 if (PASS2) {
10839                     Perl_ck_warner_d(aTHX_
10840                         packWARN(WARN_EXPERIMENTAL__SCRIPT_RUN),
10841                         "The script_run feature is experimental"
10842                         REPORT_LOCATION, REPORT_LOCATION_ARGS(RExC_parse));
10843                 }
10844
10845                 ret = reg(pRExC_state, 's', &flags, depth+1);
10846                 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10847                     *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10848                     return NULL;
10849                 }
10850
10851                 return ret;
10852             }
10853         }
10854         else if (*RExC_parse == '?') { /* (?...) */
10855             bool is_logical = 0;
10856             const char * const seqstart = RExC_parse;
10857             const char * endptr;
10858             if (has_intervening_patws) {
10859                 RExC_parse++;
10860                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
10861             }
10862
10863             RExC_parse++;           /* past the '?' */
10864             paren = *RExC_parse;    /* might be a trailing NUL, if not
10865                                        well-formed */
10866             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10867             if (RExC_parse > RExC_end) {
10868                 paren = '\0';
10869             }
10870             ret = NULL;                 /* For look-ahead/behind. */
10871             switch (paren) {
10872
10873             case 'P':   /* (?P...) variants for those used to PCRE/Python */
10874                 paren = *RExC_parse;
10875                 if ( paren == '<') {    /* (?P<...>) named capture */
10876                     RExC_parse++;
10877                     if (RExC_parse >= RExC_end) {
10878                         vFAIL("Sequence (?P<... not terminated");
10879                     }
10880                     goto named_capture;
10881                 }
10882                 else if (paren == '>') {   /* (?P>name) named recursion */
10883                     RExC_parse++;
10884                     if (RExC_parse >= RExC_end) {
10885                         vFAIL("Sequence (?P>... not terminated");
10886                     }
10887                     goto named_recursion;
10888                 }
10889                 else if (paren == '=') {   /* (?P=...)  named backref */
10890                     RExC_parse++;
10891                     return handle_named_backref(pRExC_state, flagp,
10892                                                 parse_start, ')');
10893                 }
10894                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10895                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10896                 vFAIL3("Sequence (%.*s...) not recognized",
10897                                 RExC_parse-seqstart, seqstart);
10898                 NOT_REACHED; /*NOTREACHED*/
10899             case '<':           /* (?<...) */
10900                 if (*RExC_parse == '!')
10901                     paren = ',';
10902                 else if (*RExC_parse != '=')
10903               named_capture:
10904                 {               /* (?<...>) */
10905                     char *name_start;
10906                     SV *svname;
10907                     paren= '>';
10908                 /* FALLTHROUGH */
10909             case '\'':          /* (?'...') */
10910                     name_start = RExC_parse;
10911                     svname = reg_scan_name(pRExC_state,
10912                         SIZE_ONLY    /* reverse test from the others */
10913                         ? REG_RSN_RETURN_NAME
10914                         : REG_RSN_RETURN_NULL);
10915                     if (   RExC_parse == name_start
10916                         || RExC_parse >= RExC_end
10917                         || *RExC_parse != paren)
10918                     {
10919                         vFAIL2("Sequence (?%c... not terminated",
10920                             paren=='>' ? '<' : paren);
10921                     }
10922                     if (SIZE_ONLY) {
10923                         HE *he_str;
10924                         SV *sv_dat = NULL;
10925                         if (!svname) /* shouldn't happen */
10926                             Perl_croak(aTHX_
10927                                 "panic: reg_scan_name returned NULL");
10928                         if (!RExC_paren_names) {
10929                             RExC_paren_names= newHV();
10930                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
10931 #ifdef DEBUGGING
10932                             RExC_paren_name_list= newAV();
10933                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10934 #endif
10935                         }
10936                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10937                         if ( he_str )
10938                             sv_dat = HeVAL(he_str);
10939                         if ( ! sv_dat ) {
10940                             /* croak baby croak */
10941                             Perl_croak(aTHX_
10942                                 "panic: paren_name hash element allocation failed");
10943                         } else if ( SvPOK(sv_dat) ) {
10944                             /* (?|...) can mean we have dupes so scan to check
10945                                its already been stored. Maybe a flag indicating
10946                                we are inside such a construct would be useful,
10947                                but the arrays are likely to be quite small, so
10948                                for now we punt -- dmq */
10949                             IV count = SvIV(sv_dat);
10950                             I32 *pv = (I32*)SvPVX(sv_dat);
10951                             IV i;
10952                             for ( i = 0 ; i < count ; i++ ) {
10953                                 if ( pv[i] == RExC_npar ) {
10954                                     count = 0;
10955                                     break;
10956                                 }
10957                             }
10958                             if ( count ) {
10959                                 pv = (I32*)SvGROW(sv_dat,
10960                                                 SvCUR(sv_dat) + sizeof(I32)+1);
10961                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10962                                 pv[count] = RExC_npar;
10963                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10964                             }
10965                         } else {
10966                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
10967                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
10968                                                                 sizeof(I32));
10969                             SvIOK_on(sv_dat);
10970                             SvIV_set(sv_dat, 1);
10971                         }
10972 #ifdef DEBUGGING
10973                         /* Yes this does cause a memory leak in debugging Perls
10974                          * */
10975                         if (!av_store(RExC_paren_name_list,
10976                                       RExC_npar, SvREFCNT_inc(svname)))
10977                             SvREFCNT_dec_NN(svname);
10978 #endif
10979
10980                         /*sv_dump(sv_dat);*/
10981                     }
10982                     nextchar(pRExC_state);
10983                     paren = 1;
10984                     goto capturing_parens;
10985                 }
10986                 RExC_seen |= REG_LOOKBEHIND_SEEN;
10987                 RExC_in_lookbehind++;
10988                 RExC_parse++;
10989                 if (RExC_parse >= RExC_end) {
10990                     vFAIL("Sequence (?... not terminated");
10991                 }
10992
10993                 /* FALLTHROUGH */
10994             case '=':           /* (?=...) */
10995                 RExC_seen_zerolen++;
10996                 break;
10997             case '!':           /* (?!...) */
10998                 RExC_seen_zerolen++;
10999                 /* check if we're really just a "FAIL" assertion */
11000                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11001                                         FALSE /* Don't force to /x */ );
11002                 if (*RExC_parse == ')') {
11003                     ret=reganode(pRExC_state, OPFAIL, 0);
11004                     nextchar(pRExC_state);
11005                     return ret;
11006                 }
11007                 break;
11008             case '|':           /* (?|...) */
11009                 /* branch reset, behave like a (?:...) except that
11010                    buffers in alternations share the same numbers */
11011                 paren = ':';
11012                 after_freeze = freeze_paren = RExC_npar;
11013                 break;
11014             case ':':           /* (?:...) */
11015             case '>':           /* (?>...) */
11016                 break;
11017             case '$':           /* (?$...) */
11018             case '@':           /* (?@...) */
11019                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11020                 break;
11021             case '0' :           /* (?0) */
11022             case 'R' :           /* (?R) */
11023                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11024                     FAIL("Sequence (?R) not terminated");
11025                 num = 0;
11026                 RExC_seen |= REG_RECURSE_SEEN;
11027                 *flagp |= POSTPONED;
11028                 goto gen_recurse_regop;
11029                 /*notreached*/
11030             /* named and numeric backreferences */
11031             case '&':            /* (?&NAME) */
11032                 parse_start = RExC_parse - 1;
11033               named_recursion:
11034                 {
11035                     SV *sv_dat = reg_scan_name(pRExC_state,
11036                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11037                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11038                 }
11039                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11040                     vFAIL("Sequence (?&... not terminated");
11041                 goto gen_recurse_regop;
11042                 /* NOTREACHED */
11043             case '+':
11044                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11045                     RExC_parse++;
11046                     vFAIL("Illegal pattern");
11047                 }
11048                 goto parse_recursion;
11049                 /* NOTREACHED*/
11050             case '-': /* (?-1) */
11051                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11052                     RExC_parse--; /* rewind to let it be handled later */
11053                     goto parse_flags;
11054                 }
11055                 /* FALLTHROUGH */
11056             case '1': case '2': case '3': case '4': /* (?1) */
11057             case '5': case '6': case '7': case '8': case '9':
11058                 RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11059               parse_recursion:
11060                 {
11061                     bool is_neg = FALSE;
11062                     UV unum;
11063                     parse_start = RExC_parse - 1; /* MJD */
11064                     if (*RExC_parse == '-') {
11065                         RExC_parse++;
11066                         is_neg = TRUE;
11067                     }
11068                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11069                         && unum <= I32_MAX
11070                     ) {
11071                         num = (I32)unum;
11072                         RExC_parse = (char*)endptr;
11073                     } else
11074                         num = I32_MAX;
11075                     if (is_neg) {
11076                         /* Some limit for num? */
11077                         num = -num;
11078                     }
11079                 }
11080                 if (*RExC_parse!=')')
11081                     vFAIL("Expecting close bracket");
11082
11083               gen_recurse_regop:
11084                 if ( paren == '-' ) {
11085                     /*
11086                     Diagram of capture buffer numbering.
11087                     Top line is the normal capture buffer numbers
11088                     Bottom line is the negative indexing as from
11089                     the X (the (?-2))
11090
11091                     +   1 2    3 4 5 X          6 7
11092                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11093                     -   5 4    3 2 1 X          x x
11094
11095                     */
11096                     num = RExC_npar + num;
11097                     if (num < 1)  {
11098                         RExC_parse++;
11099                         vFAIL("Reference to nonexistent group");
11100                     }
11101                 } else if ( paren == '+' ) {
11102                     num = RExC_npar + num - 1;
11103                 }
11104                 /* We keep track how many GOSUB items we have produced.
11105                    To start off the ARG2L() of the GOSUB holds its "id",
11106                    which is used later in conjunction with RExC_recurse
11107                    to calculate the offset we need to jump for the GOSUB,
11108                    which it will store in the final representation.
11109                    We have to defer the actual calculation until much later
11110                    as the regop may move.
11111                  */
11112
11113                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11114                 if (!SIZE_ONLY) {
11115                     if (num > (I32)RExC_rx->nparens) {
11116                         RExC_parse++;
11117                         vFAIL("Reference to nonexistent group");
11118                     }
11119                     RExC_recurse_count++;
11120                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11121                         "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11122                               22, "|    |", (int)(depth * 2 + 1), "",
11123                               (UV)ARG(ret), (IV)ARG2L(ret)));
11124                 }
11125                 RExC_seen |= REG_RECURSE_SEEN;
11126
11127                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
11128                 Set_Node_Offset(ret, parse_start); /* MJD */
11129
11130                 *flagp |= POSTPONED;
11131                 assert(*RExC_parse == ')');
11132                 nextchar(pRExC_state);
11133                 return ret;
11134
11135             /* NOTREACHED */
11136
11137             case '?':           /* (??...) */
11138                 is_logical = 1;
11139                 if (*RExC_parse != '{') {
11140                     RExC_parse += SKIP_IF_CHAR(RExC_parse);
11141                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11142                     vFAIL2utf8f(
11143                         "Sequence (%" UTF8f "...) not recognized",
11144                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11145                     NOT_REACHED; /*NOTREACHED*/
11146                 }
11147                 *flagp |= POSTPONED;
11148                 paren = '{';
11149                 RExC_parse++;
11150                 /* FALLTHROUGH */
11151             case '{':           /* (?{...}) */
11152             {
11153                 U32 n = 0;
11154                 struct reg_code_block *cb;
11155
11156                 RExC_seen_zerolen++;
11157
11158                 if (   !pRExC_state->code_blocks
11159                     || pRExC_state->code_index
11160                                         >= pRExC_state->code_blocks->count
11161                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11162                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11163                             - RExC_start)
11164                 ) {
11165                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
11166                         FAIL("panic: Sequence (?{...}): no code block found\n");
11167                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
11168                 }
11169                 /* this is a pre-compiled code block (?{...}) */
11170                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11171                 RExC_parse = RExC_start + cb->end;
11172                 if (!SIZE_ONLY) {
11173                     OP *o = cb->block;
11174                     if (cb->src_regex) {
11175                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11176                         RExC_rxi->data->data[n] =
11177                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
11178                         RExC_rxi->data->data[n+1] = (void*)o;
11179                     }
11180                     else {
11181                         n = add_data(pRExC_state,
11182                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11183                         RExC_rxi->data->data[n] = (void*)o;
11184                     }
11185                 }
11186                 pRExC_state->code_index++;
11187                 nextchar(pRExC_state);
11188
11189                 if (is_logical) {
11190                     regnode *eval;
11191                     ret = reg_node(pRExC_state, LOGICAL);
11192
11193                     eval = reg2Lanode(pRExC_state, EVAL,
11194                                        n,
11195
11196                                        /* for later propagation into (??{})
11197                                         * return value */
11198                                        RExC_flags & RXf_PMf_COMPILETIME
11199                                       );
11200                     if (!SIZE_ONLY) {
11201                         ret->flags = 2;
11202                     }
11203                     REGTAIL(pRExC_state, ret, eval);
11204                     /* deal with the length of this later - MJD */
11205                     return ret;
11206                 }
11207                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11208                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
11209                 Set_Node_Offset(ret, parse_start);
11210                 return ret;
11211             }
11212             case '(':           /* (?(?{...})...) and (?(?=...)...) */
11213             {
11214                 int is_define= 0;
11215                 const int DEFINE_len = sizeof("DEFINE") - 1;
11216                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
11217                     if (   RExC_parse < RExC_end - 1
11218                         && (   RExC_parse[1] == '='
11219                             || RExC_parse[1] == '!'
11220                             || RExC_parse[1] == '<'
11221                             || RExC_parse[1] == '{')
11222                     ) { /* Lookahead or eval. */
11223                         I32 flag;
11224                         regnode *tail;
11225
11226                         ret = reg_node(pRExC_state, LOGICAL);
11227                         if (!SIZE_ONLY)
11228                             ret->flags = 1;
11229
11230                         tail = reg(pRExC_state, 1, &flag, depth+1);
11231                         if (flag & (RESTART_PASS1|NEED_UTF8)) {
11232                             *flagp = flag & (RESTART_PASS1|NEED_UTF8);
11233                             return NULL;
11234                         }
11235                         REGTAIL(pRExC_state, ret, tail);
11236                         goto insert_if;
11237                     }
11238                     /* Fall through to ‘Unknown switch condition’ at the
11239                        end of the if/else chain. */
11240                 }
11241                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11242                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11243                 {
11244                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
11245                     char *name_start= RExC_parse++;
11246                     U32 num = 0;
11247                     SV *sv_dat=reg_scan_name(pRExC_state,
11248                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11249                     if (   RExC_parse == name_start
11250                         || RExC_parse >= RExC_end
11251                         || *RExC_parse != ch)
11252                     {
11253                         vFAIL2("Sequence (?(%c... not terminated",
11254                             (ch == '>' ? '<' : ch));
11255                     }
11256                     RExC_parse++;
11257                     if (!SIZE_ONLY) {
11258                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11259                         RExC_rxi->data->data[num]=(void*)sv_dat;
11260                         SvREFCNT_inc_simple_void(sv_dat);
11261                     }
11262                     ret = reganode(pRExC_state,NGROUPP,num);
11263                     goto insert_if_check_paren;
11264                 }
11265                 else if (memBEGINs(RExC_parse,
11266                                    (STRLEN) (RExC_end - RExC_parse),
11267                                    "DEFINE"))
11268                 {
11269                     ret = reganode(pRExC_state,DEFINEP,0);
11270                     RExC_parse += DEFINE_len;
11271                     is_define = 1;
11272                     goto insert_if_check_paren;
11273                 }
11274                 else if (RExC_parse[0] == 'R') {
11275                     RExC_parse++;
11276                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
11277                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11278                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11279                      */
11280                     parno = 0;
11281                     if (RExC_parse[0] == '0') {
11282                         parno = 1;
11283                         RExC_parse++;
11284                     }
11285                     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11286                         UV uv;
11287                         if (grok_atoUV(RExC_parse, &uv, &endptr)
11288                             && uv <= I32_MAX
11289                         ) {
11290                             parno = (I32)uv + 1;
11291                             RExC_parse = (char*)endptr;
11292                         }
11293                         /* else "Switch condition not recognized" below */
11294                     } else if (RExC_parse[0] == '&') {
11295                         SV *sv_dat;
11296                         RExC_parse++;
11297                         sv_dat = reg_scan_name(pRExC_state,
11298                             SIZE_ONLY
11299                             ? REG_RSN_RETURN_NULL
11300                             : REG_RSN_RETURN_DATA);
11301
11302                         /* we should only have a false sv_dat when
11303                          * SIZE_ONLY is true, and we always have false
11304                          * sv_dat when SIZE_ONLY is true.
11305                          * reg_scan_name() will VFAIL() if the name is
11306                          * unknown when SIZE_ONLY is false, and otherwise
11307                          * will return something, and when SIZE_ONLY is
11308                          * true, reg_scan_name() just parses the string,
11309                          * and doesnt return anything. (in theory) */
11310                         assert(SIZE_ONLY ? !sv_dat : !!sv_dat);
11311
11312                         if (sv_dat)
11313                             parno = 1 + *((I32 *)SvPVX(sv_dat));
11314                     }
11315                     ret = reganode(pRExC_state,INSUBP,parno);
11316                     goto insert_if_check_paren;
11317                 }
11318                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11319                     /* (?(1)...) */
11320                     char c;
11321                     UV uv;
11322                     if (grok_atoUV(RExC_parse, &uv, &endptr)
11323                         && uv <= I32_MAX
11324                     ) {
11325                         parno = (I32)uv;
11326                         RExC_parse = (char*)endptr;
11327                     }
11328                     else {
11329                         vFAIL("panic: grok_atoUV returned FALSE");
11330                     }
11331                     ret = reganode(pRExC_state, GROUPP, parno);
11332
11333                  insert_if_check_paren:
11334                     if (UCHARAT(RExC_parse) != ')') {
11335                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11336                         vFAIL("Switch condition not recognized");
11337                     }
11338                     nextchar(pRExC_state);
11339                   insert_if:
11340                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
11341                     br = regbranch(pRExC_state, &flags, 1,depth+1);
11342                     if (br == NULL) {
11343                         if (flags & (RESTART_PASS1|NEED_UTF8)) {
11344                             *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11345                             return NULL;
11346                         }
11347                         FAIL2("panic: regbranch returned NULL, flags=%#" UVxf,
11348                               (UV) flags);
11349                     } else
11350                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
11351                                                           LONGJMP, 0));
11352                     c = UCHARAT(RExC_parse);
11353                     nextchar(pRExC_state);
11354                     if (flags&HASWIDTH)
11355                         *flagp |= HASWIDTH;
11356                     if (c == '|') {
11357                         if (is_define)
11358                             vFAIL("(?(DEFINE)....) does not allow branches");
11359
11360                         /* Fake one for optimizer.  */
11361                         lastbr = reganode(pRExC_state, IFTHEN, 0);
11362
11363                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
11364                             if (flags & (RESTART_PASS1|NEED_UTF8)) {
11365                                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11366                                 return NULL;
11367                             }
11368                             FAIL2("panic: regbranch returned NULL, flags=%#" UVxf,
11369                                   (UV) flags);
11370                         }
11371                         REGTAIL(pRExC_state, ret, lastbr);
11372                         if (flags&HASWIDTH)
11373                             *flagp |= HASWIDTH;
11374                         c = UCHARAT(RExC_parse);
11375                         nextchar(pRExC_state);
11376                     }
11377                     else
11378                         lastbr = NULL;
11379                     if (c != ')') {
11380                         if (RExC_parse >= RExC_end)
11381                             vFAIL("Switch (?(condition)... not terminated");
11382                         else
11383                             vFAIL("Switch (?(condition)... contains too many branches");
11384                     }
11385                     ender = reg_node(pRExC_state, TAIL);
11386                     REGTAIL(pRExC_state, br, ender);
11387                     if (lastbr) {
11388                         REGTAIL(pRExC_state, lastbr, ender);
11389                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11390                     }
11391                     else
11392                         REGTAIL(pRExC_state, ret, ender);
11393                     RExC_size++; /* XXX WHY do we need this?!!
11394                                     For large programs it seems to be required
11395                                     but I can't figure out why. -- dmq*/
11396                     return ret;
11397                 }
11398                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11399                 vFAIL("Unknown switch condition (?(...))");
11400             }
11401             case '[':           /* (?[ ... ]) */
11402                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
11403                                          oregcomp_parse);
11404             case 0: /* A NUL */
11405                 RExC_parse--; /* for vFAIL to print correctly */
11406                 vFAIL("Sequence (? incomplete");
11407                 break;
11408             default: /* e.g., (?i) */
11409                 RExC_parse = (char *) seqstart + 1;
11410               parse_flags:
11411                 parse_lparen_question_flags(pRExC_state);
11412                 if (UCHARAT(RExC_parse) != ':') {
11413                     if (RExC_parse < RExC_end)
11414                         nextchar(pRExC_state);
11415                     *flagp = TRYAGAIN;
11416                     return NULL;
11417                 }
11418                 paren = ':';
11419                 nextchar(pRExC_state);
11420                 ret = NULL;
11421                 goto parse_rest;
11422             } /* end switch */
11423         }
11424         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
11425           capturing_parens:
11426             parno = RExC_npar;
11427             RExC_npar++;
11428
11429             ret = reganode(pRExC_state, OPEN, parno);
11430             if (!SIZE_ONLY ){
11431                 if (!RExC_nestroot)
11432                     RExC_nestroot = parno;
11433                 if (RExC_open_parens && !RExC_open_parens[parno])
11434                 {
11435                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11436                         "%*s%*s Setting open paren #%" IVdf " to %d\n",
11437                         22, "|    |", (int)(depth * 2 + 1), "",
11438                         (IV)parno, REG_NODE_NUM(ret)));
11439                     RExC_open_parens[parno]= ret;
11440                 }
11441             }
11442             Set_Node_Length(ret, 1); /* MJD */
11443             Set_Node_Offset(ret, RExC_parse); /* MJD */
11444             is_open = 1;
11445         } else {
11446             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
11447             paren = ':';
11448             ret = NULL;
11449         }
11450     }
11451     else                        /* ! paren */
11452         ret = NULL;
11453
11454    parse_rest:
11455     /* Pick up the branches, linking them together. */
11456     parse_start = RExC_parse;   /* MJD */
11457     br = regbranch(pRExC_state, &flags, 1,depth+1);
11458
11459     /*     branch_len = (paren != 0); */
11460
11461     if (br == NULL) {
11462         if (flags & (RESTART_PASS1|NEED_UTF8)) {
11463             *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11464             return NULL;
11465         }
11466         FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags);
11467     }
11468     if (*RExC_parse == '|') {
11469         if (!SIZE_ONLY && RExC_extralen) {
11470             reginsert(pRExC_state, BRANCHJ, br, depth+1);
11471         }
11472         else {                  /* MJD */
11473             reginsert(pRExC_state, BRANCH, br, depth+1);
11474             Set_Node_Length(br, paren != 0);
11475             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
11476         }
11477         have_branch = 1;
11478         if (SIZE_ONLY)
11479             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
11480     }
11481     else if (paren == ':') {
11482         *flagp |= flags&SIMPLE;
11483     }
11484     if (is_open) {                              /* Starts with OPEN. */
11485         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
11486     }
11487     else if (paren != '?')              /* Not Conditional */
11488         ret = br;
11489     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11490     lastbr = br;
11491     while (*RExC_parse == '|') {
11492         if (!SIZE_ONLY && RExC_extralen) {
11493             ender = reganode(pRExC_state, LONGJMP,0);
11494
11495             /* Append to the previous. */
11496             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11497         }
11498         if (SIZE_ONLY)
11499             RExC_extralen += 2;         /* Account for LONGJMP. */
11500         nextchar(pRExC_state);
11501         if (freeze_paren) {
11502             if (RExC_npar > after_freeze)
11503                 after_freeze = RExC_npar;
11504             RExC_npar = freeze_paren;
11505         }
11506         br = regbranch(pRExC_state, &flags, 0, depth+1);
11507
11508         if (br == NULL) {
11509             if (flags & (RESTART_PASS1|NEED_UTF8)) {
11510                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11511                 return NULL;
11512             }
11513             FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags);
11514         }
11515         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
11516         lastbr = br;
11517         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11518     }
11519
11520     if (have_branch || paren != ':') {
11521         /* Make a closing node, and hook it on the end. */
11522         switch (paren) {
11523         case ':':
11524             ender = reg_node(pRExC_state, TAIL);
11525             break;
11526         case 1: case 2:
11527             ender = reganode(pRExC_state, CLOSE, parno);
11528             if ( RExC_close_parens ) {
11529                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11530                         "%*s%*s Setting close paren #%" IVdf " to %d\n",
11531                         22, "|    |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
11532                 RExC_close_parens[parno]= ender;
11533                 if (RExC_nestroot == parno)
11534                     RExC_nestroot = 0;
11535             }
11536             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
11537             Set_Node_Length(ender,1); /* MJD */
11538             break;
11539         case 's':
11540             ender = reg_node(pRExC_state, SRCLOSE);
11541             RExC_in_script_run = 0;
11542             break;
11543         case '<':
11544         case ',':
11545         case '=':
11546         case '!':
11547             *flagp &= ~HASWIDTH;
11548             /* FALLTHROUGH */
11549         case '>':
11550             ender = reg_node(pRExC_state, SUCCEED);
11551             break;
11552         case 0:
11553             ender = reg_node(pRExC_state, END);
11554             if (!SIZE_ONLY) {
11555                 assert(!RExC_end_op); /* there can only be one! */
11556                 RExC_end_op = ender;
11557                 if (RExC_close_parens) {
11558                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11559                         "%*s%*s Setting close paren #0 (END) to %d\n",
11560                         22, "|    |", (int)(depth * 2 + 1), "", REG_NODE_NUM(ender)));
11561
11562                     RExC_close_parens[0]= ender;
11563                 }
11564             }
11565             break;
11566         }
11567         DEBUG_PARSE_r(if (!SIZE_ONLY) {
11568             DEBUG_PARSE_MSG("lsbr");
11569             regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
11570             regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11571             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
11572                           SvPV_nolen_const(RExC_mysv1),
11573                           (IV)REG_NODE_NUM(lastbr),
11574                           SvPV_nolen_const(RExC_mysv2),
11575                           (IV)REG_NODE_NUM(ender),
11576                           (IV)(ender - lastbr)
11577             );
11578         });
11579         REGTAIL(pRExC_state, lastbr, ender);
11580
11581         if (have_branch && !SIZE_ONLY) {
11582             char is_nothing= 1;
11583             if (depth==1)
11584                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
11585
11586             /* Hook the tails of the branches to the closing node. */
11587             for (br = ret; br; br = regnext(br)) {
11588                 const U8 op = PL_regkind[OP(br)];
11589                 if (op == BRANCH) {
11590                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
11591                     if ( OP(NEXTOPER(br)) != NOTHING
11592                          || regnext(NEXTOPER(br)) != ender)
11593                         is_nothing= 0;
11594                 }
11595                 else if (op == BRANCHJ) {
11596                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
11597                     /* for now we always disable this optimisation * /
11598                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
11599                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
11600                     */
11601                         is_nothing= 0;
11602                 }
11603             }
11604             if (is_nothing) {
11605                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
11606                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
11607                     DEBUG_PARSE_MSG("NADA");
11608                     regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
11609                     regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11610                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
11611                                   SvPV_nolen_const(RExC_mysv1),
11612                                   (IV)REG_NODE_NUM(ret),
11613                                   SvPV_nolen_const(RExC_mysv2),
11614                                   (IV)REG_NODE_NUM(ender),
11615                                   (IV)(ender - ret)
11616                     );
11617                 });
11618                 OP(br)= NOTHING;
11619                 if (OP(ender) == TAIL) {
11620                     NEXT_OFF(br)= 0;
11621                     RExC_emit= br + 1;
11622                 } else {
11623                     regnode *opt;
11624                     for ( opt= br + 1; opt < ender ; opt++ )
11625                         OP(opt)= OPTIMIZED;
11626                     NEXT_OFF(br)= ender - br;
11627                 }
11628             }
11629         }
11630     }
11631
11632     {
11633         const char *p;
11634         static const char parens[] = "=!<,>";
11635
11636         if (paren && (p = strchr(parens, paren))) {
11637             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
11638             int flag = (p - parens) > 1;
11639
11640             if (paren == '>')
11641                 node = SUSPEND, flag = 0;
11642             reginsert(pRExC_state, node,ret, depth+1);
11643             Set_Node_Cur_Length(ret, parse_start);
11644             Set_Node_Offset(ret, parse_start + 1);
11645             ret->flags = flag;
11646             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
11647         }
11648     }
11649
11650     /* Check for proper termination. */
11651     if (paren) {
11652         /* restore original flags, but keep (?p) and, if we've changed from /d
11653          * rules to /u, keep the /u */
11654         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
11655         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
11656             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
11657         }
11658         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
11659             RExC_parse = oregcomp_parse;
11660             vFAIL("Unmatched (");
11661         }
11662         nextchar(pRExC_state);
11663     }
11664     else if (!paren && RExC_parse < RExC_end) {
11665         if (*RExC_parse == ')') {
11666             RExC_parse++;
11667             vFAIL("Unmatched )");
11668         }
11669         else
11670             FAIL("Junk on end of regexp");      /* "Can't happen". */
11671         NOT_REACHED; /* NOTREACHED */
11672     }
11673
11674     if (RExC_in_lookbehind) {
11675         RExC_in_lookbehind--;
11676     }
11677     if (after_freeze > RExC_npar)
11678         RExC_npar = after_freeze;
11679     return(ret);
11680 }
11681
11682 /*
11683  - regbranch - one alternative of an | operator
11684  *
11685  * Implements the concatenation operator.
11686  *
11687  * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11688  * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11689  */
11690 STATIC regnode *
11691 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
11692 {
11693     regnode *ret;
11694     regnode *chain = NULL;
11695     regnode *latest;
11696     I32 flags = 0, c = 0;
11697     GET_RE_DEBUG_FLAGS_DECL;
11698
11699     PERL_ARGS_ASSERT_REGBRANCH;
11700
11701     DEBUG_PARSE("brnc");
11702
11703     if (first)
11704         ret = NULL;
11705     else {
11706         if (!SIZE_ONLY && RExC_extralen)
11707             ret = reganode(pRExC_state, BRANCHJ,0);
11708         else {
11709             ret = reg_node(pRExC_state, BRANCH);
11710             Set_Node_Length(ret, 1);
11711         }
11712     }
11713
11714     if (!first && SIZE_ONLY)
11715         RExC_extralen += 1;                     /* BRANCHJ */
11716
11717     *flagp = WORST;                     /* Tentatively. */
11718
11719     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11720                             FALSE /* Don't force to /x */ );
11721     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
11722         flags &= ~TRYAGAIN;
11723         latest = regpiece(pRExC_state, &flags,depth+1);
11724         if (latest == NULL) {
11725             if (flags & TRYAGAIN)
11726                 continue;
11727             if (flags & (RESTART_PASS1|NEED_UTF8)) {
11728                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11729                 return NULL;
11730             }
11731             FAIL2("panic: regpiece returned NULL, flags=%#" UVxf, (UV) flags);
11732         }
11733         else if (ret == NULL)
11734             ret = latest;
11735         *flagp |= flags&(HASWIDTH|POSTPONED);
11736         if (chain == NULL)      /* First piece. */
11737             *flagp |= flags&SPSTART;
11738         else {
11739             /* FIXME adding one for every branch after the first is probably
11740              * excessive now we have TRIE support. (hv) */
11741             MARK_NAUGHTY(1);
11742             REGTAIL(pRExC_state, chain, latest);
11743         }
11744         chain = latest;
11745         c++;
11746     }
11747     if (chain == NULL) {        /* Loop ran zero times. */
11748         chain = reg_node(pRExC_state, NOTHING);
11749         if (ret == NULL)
11750             ret = chain;
11751     }
11752     if (c == 1) {
11753         *flagp |= flags&SIMPLE;
11754     }
11755
11756     return ret;
11757 }
11758
11759 /*
11760  - regpiece - something followed by possible quantifier * + ? {n,m}
11761  *
11762  * Note that the branching code sequences used for ? and the general cases
11763  * of * and + are somewhat optimized:  they use the same NOTHING node as
11764  * both the endmarker for their branch list and the body of the last branch.
11765  * It might seem that this node could be dispensed with entirely, but the
11766  * endmarker role is not redundant.
11767  *
11768  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
11769  * TRYAGAIN.
11770  * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11771  * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11772  */
11773 STATIC regnode *
11774 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11775 {
11776     regnode *ret;
11777     char op;
11778     char *next;
11779     I32 flags;
11780     const char * const origparse = RExC_parse;
11781     I32 min;
11782     I32 max = REG_INFTY;
11783 #ifdef RE_TRACK_PATTERN_OFFSETS
11784     char *parse_start;
11785 #endif
11786     const char *maxpos = NULL;
11787     UV uv;
11788
11789     /* Save the original in case we change the emitted regop to a FAIL. */
11790     regnode * const orig_emit = RExC_emit;
11791
11792     GET_RE_DEBUG_FLAGS_DECL;
11793
11794     PERL_ARGS_ASSERT_REGPIECE;
11795
11796     DEBUG_PARSE("piec");
11797
11798     ret = regatom(pRExC_state, &flags,depth+1);
11799     if (ret == NULL) {
11800         if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
11801             *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
11802         else
11803             FAIL2("panic: regatom returned NULL, flags=%#" UVxf, (UV) flags);
11804         return(NULL);
11805     }
11806
11807     op = *RExC_parse;
11808
11809     if (op == '{' && regcurly(RExC_parse)) {
11810         maxpos = NULL;
11811 #ifdef RE_TRACK_PATTERN_OFFSETS
11812         parse_start = RExC_parse; /* MJD */
11813 #endif
11814         next = RExC_parse + 1;
11815         while (isDIGIT(*next) || *next == ',') {
11816             if (*next == ',') {
11817                 if (maxpos)
11818                     break;
11819                 else
11820                     maxpos = next;
11821             }
11822             next++;
11823         }
11824         if (*next == '}') {             /* got one */
11825             const char* endptr;
11826             if (!maxpos)
11827                 maxpos = next;
11828             RExC_parse++;
11829             if (isDIGIT(*RExC_parse)) {
11830                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
11831                     vFAIL("Invalid quantifier in {,}");
11832                 if (uv >= REG_INFTY)
11833                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11834                 min = (I32)uv;
11835             } else {
11836                 min = 0;
11837             }
11838             if (*maxpos == ',')
11839                 maxpos++;
11840             else
11841                 maxpos = RExC_parse;
11842             if (isDIGIT(*maxpos)) {
11843                 if (!grok_atoUV(maxpos, &uv, &endptr))
11844                     vFAIL("Invalid quantifier in {,}");
11845                 if (uv >= REG_INFTY)
11846                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11847                 max = (I32)uv;
11848             } else {
11849                 max = REG_INFTY;                /* meaning "infinity" */
11850             }
11851             RExC_parse = next;
11852             nextchar(pRExC_state);
11853             if (max < min) {    /* If can't match, warn and optimize to fail
11854                                    unconditionally */
11855                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
11856                 if (PASS2) {
11857                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
11858                     NEXT_OFF(orig_emit)= regarglen[OPFAIL] + NODE_STEP_REGNODE;
11859                 }
11860                 return ret;
11861             }
11862             else if (min == max && *RExC_parse == '?')
11863             {
11864                 if (PASS2) {
11865                     ckWARN2reg(RExC_parse + 1,
11866                                "Useless use of greediness modifier '%c'",
11867                                *RExC_parse);
11868                 }
11869             }
11870
11871           do_curly:
11872             if ((flags&SIMPLE)) {
11873                 if (min == 0 && max == REG_INFTY) {
11874                     reginsert(pRExC_state, STAR, ret, depth+1);
11875                     MARK_NAUGHTY(4);
11876                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11877                     goto nest_check;
11878                 }
11879                 if (min == 1 && max == REG_INFTY) {
11880                     reginsert(pRExC_state, PLUS, ret, depth+1);
11881                     MARK_NAUGHTY(3);
11882                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11883                     goto nest_check;
11884                 }
11885                 MARK_NAUGHTY_EXP(2, 2);
11886                 reginsert(pRExC_state, CURLY, ret, depth+1);
11887                 Set_Node_Offset(ret, parse_start+1); /* MJD */
11888                 Set_Node_Cur_Length(ret, parse_start);
11889             }
11890             else {
11891                 regnode * const w = reg_node(pRExC_state, WHILEM);
11892
11893                 w->flags = 0;
11894                 REGTAIL(pRExC_state, ret, w);
11895                 if (!SIZE_ONLY && RExC_extralen) {
11896                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
11897                     reginsert(pRExC_state, NOTHING,ret, depth+1);
11898                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
11899                 }
11900                 reginsert(pRExC_state, CURLYX,ret, depth+1);
11901                                 /* MJD hk */
11902                 Set_Node_Offset(ret, parse_start+1);
11903                 Set_Node_Length(ret,
11904                                 op == '{' ? (RExC_parse - parse_start) : 1);
11905
11906                 if (!SIZE_ONLY && RExC_extralen)
11907                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
11908                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
11909                 if (SIZE_ONLY)
11910                     RExC_whilem_seen++, RExC_extralen += 3;
11911                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
11912             }
11913             ret->flags = 0;
11914
11915             if (min > 0)
11916                 *flagp = WORST;
11917             if (max > 0)
11918                 *flagp |= HASWIDTH;
11919             if (!SIZE_ONLY) {
11920                 ARG1_SET(ret, (U16)min);
11921                 ARG2_SET(ret, (U16)max);
11922             }
11923             if (max == REG_INFTY)
11924                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11925
11926             goto nest_check;
11927         }
11928     }
11929
11930     if (!ISMULT1(op)) {
11931         *flagp = flags;
11932         return(ret);
11933     }
11934
11935 #if 0                           /* Now runtime fix should be reliable. */
11936
11937     /* if this is reinstated, don't forget to put this back into perldiag:
11938
11939             =item Regexp *+ operand could be empty at {#} in regex m/%s/
11940
11941            (F) The part of the regexp subject to either the * or + quantifier
11942            could match an empty string. The {#} shows in the regular
11943            expression about where the problem was discovered.
11944
11945     */
11946
11947     if (!(flags&HASWIDTH) && op != '?')
11948       vFAIL("Regexp *+ operand could be empty");
11949 #endif
11950
11951 #ifdef RE_TRACK_PATTERN_OFFSETS
11952     parse_start = RExC_parse;
11953 #endif
11954     nextchar(pRExC_state);
11955
11956     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
11957
11958     if (op == '*') {
11959         min = 0;
11960         goto do_curly;
11961     }
11962     else if (op == '+') {
11963         min = 1;
11964         goto do_curly;
11965     }
11966     else if (op == '?') {
11967         min = 0; max = 1;
11968         goto do_curly;
11969     }
11970   nest_check:
11971     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
11972         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11973         ckWARN2reg(RExC_parse,
11974                    "%" UTF8f " matches null string many times",
11975                    UTF8fARG(UTF, (RExC_parse >= origparse
11976                                  ? RExC_parse - origparse
11977                                  : 0),
11978                    origparse));
11979         (void)ReREFCNT_inc(RExC_rx_sv);
11980     }
11981
11982     if (*RExC_parse == '?') {
11983         nextchar(pRExC_state);
11984         reginsert(pRExC_state, MINMOD, ret, depth+1);
11985         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11986     }
11987     else if (*RExC_parse == '+') {
11988         regnode *ender;
11989         nextchar(pRExC_state);
11990         ender = reg_node(pRExC_state, SUCCEED);
11991         REGTAIL(pRExC_state, ret, ender);
11992         reginsert(pRExC_state, SUSPEND, ret, depth+1);
11993         ender = reg_node(pRExC_state, TAIL);
11994         REGTAIL(pRExC_state, ret, ender);
11995     }
11996
11997     if (ISMULT2(RExC_parse)) {
11998         RExC_parse++;
11999         vFAIL("Nested quantifiers");
12000     }
12001
12002     return(ret);
12003 }
12004
12005 STATIC bool
12006 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12007                 regnode ** node_p,
12008                 UV * code_point_p,
12009                 int * cp_count,
12010                 I32 * flagp,
12011                 const bool strict,
12012                 const U32 depth
12013     )
12014 {
12015  /* This routine teases apart the various meanings of \N and returns
12016   * accordingly.  The input parameters constrain which meaning(s) is/are valid
12017   * in the current context.
12018   *
12019   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12020   *
12021   * If <code_point_p> is not NULL, the context is expecting the result to be a
12022   * single code point.  If this \N instance turns out to a single code point,
12023   * the function returns TRUE and sets *code_point_p to that code point.
12024   *
12025   * If <node_p> is not NULL, the context is expecting the result to be one of
12026   * the things representable by a regnode.  If this \N instance turns out to be
12027   * one such, the function generates the regnode, returns TRUE and sets *node_p
12028   * to point to that regnode.
12029   *
12030   * If this instance of \N isn't legal in any context, this function will
12031   * generate a fatal error and not return.
12032   *
12033   * On input, RExC_parse should point to the first char following the \N at the
12034   * time of the call.  On successful return, RExC_parse will have been updated
12035   * to point to just after the sequence identified by this routine.  Also
12036   * *flagp has been updated as needed.
12037   *
12038   * When there is some problem with the current context and this \N instance,
12039   * the function returns FALSE, without advancing RExC_parse, nor setting
12040   * *node_p, nor *code_point_p, nor *flagp.
12041   *
12042   * If <cp_count> is not NULL, the caller wants to know the length (in code
12043   * points) that this \N sequence matches.  This is set even if the function
12044   * returns FALSE, as detailed below.
12045   *
12046   * There are 5 possibilities here, as detailed in the next 5 paragraphs.
12047   *
12048   * Probably the most common case is for the \N to specify a single code point.
12049   * *cp_count will be set to 1, and *code_point_p will be set to that code
12050   * point.
12051   *
12052   * Another possibility is for the input to be an empty \N{}, which for
12053   * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
12054   * will be set to a generated NOTHING node.
12055   *
12056   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12057   * set to 0. *node_p will be set to a generated REG_ANY node.
12058   *
12059   * The fourth possibility is that \N resolves to a sequence of more than one
12060   * code points.  *cp_count will be set to the number of code points in the
12061   * sequence. *node_p * will be set to a generated node returned by this
12062   * function calling S_reg().
12063   *
12064   * The final possibility is that it is premature to be calling this function;
12065   * that pass1 needs to be restarted.  This can happen when this changes from
12066   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
12067   * latter occurs only when the fourth possibility would otherwise be in
12068   * effect, and is because one of those code points requires the pattern to be
12069   * recompiled as UTF-8.  The function returns FALSE, and sets the
12070   * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate.  When this
12071   * happens, the caller needs to desist from continuing parsing, and return
12072   * this information to its caller.  This is not set for when there is only one
12073   * code point, as this can be called as part of an ANYOF node, and they can
12074   * store above-Latin1 code points without the pattern having to be in UTF-8.
12075   *
12076   * For non-single-quoted regexes, the tokenizer has resolved character and
12077   * sequence names inside \N{...} into their Unicode values, normalizing the
12078   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12079   * hex-represented code points in the sequence.  This is done there because
12080   * the names can vary based on what charnames pragma is in scope at the time,
12081   * so we need a way to take a snapshot of what they resolve to at the time of
12082   * the original parse. [perl #56444].
12083   *
12084   * That parsing is skipped for single-quoted regexes, so we may here get
12085   * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
12086   * parser.  But if the single-quoted regex is something like '\N{U+41}', that
12087   * is legal and handled here.  The code point is Unicode, and has to be
12088   * translated into the native character set for non-ASCII platforms.
12089   */
12090
12091     char * endbrace;    /* points to '}' following the name */
12092     char *endchar;      /* Points to '.' or '}' ending cur char in the input
12093                            stream */
12094     char* p = RExC_parse; /* Temporary */
12095
12096     GET_RE_DEBUG_FLAGS_DECL;
12097
12098     PERL_ARGS_ASSERT_GROK_BSLASH_N;
12099
12100     GET_RE_DEBUG_FLAGS;
12101
12102     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
12103     assert(! (node_p && cp_count));               /* At most 1 should be set */
12104
12105     if (cp_count) {     /* Initialize return for the most common case */
12106         *cp_count = 1;
12107     }
12108
12109     /* The [^\n] meaning of \N ignores spaces and comments under the /x
12110      * modifier.  The other meanings do not, so use a temporary until we find
12111      * out which we are being called with */
12112     skip_to_be_ignored_text(pRExC_state, &p,
12113                             FALSE /* Don't force to /x */ );
12114
12115     /* Disambiguate between \N meaning a named character versus \N meaning
12116      * [^\n].  The latter is assumed when the {...} following the \N is a legal
12117      * quantifier, or there is no '{' at all */
12118     if (*p != '{' || regcurly(p)) {
12119         RExC_parse = p;
12120         if (cp_count) {
12121             *cp_count = -1;
12122         }
12123
12124         if (! node_p) {
12125             return FALSE;
12126         }
12127
12128         *node_p = reg_node(pRExC_state, REG_ANY);
12129         *flagp |= HASWIDTH|SIMPLE;
12130         MARK_NAUGHTY(1);
12131         Set_Node_Length(*node_p, 1); /* MJD */
12132         return TRUE;
12133     }
12134
12135     /* Here, we have decided it should be a named character or sequence */
12136
12137     /* The test above made sure that the next real character is a '{', but
12138      * under the /x modifier, it could be separated by space (or a comment and
12139      * \n) and this is not allowed (for consistency with \x{...} and the
12140      * tokenizer handling of \N{NAME}). */
12141     if (*RExC_parse != '{') {
12142         vFAIL("Missing braces on \\N{}");
12143     }
12144
12145     RExC_parse++;       /* Skip past the '{' */
12146
12147     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12148     if (! endbrace) { /* no trailing brace */
12149         vFAIL2("Missing right brace on \\%c{}", 'N');
12150     }
12151     else if (!(   endbrace == RExC_parse        /* nothing between the {} */
12152                || memBEGINs(RExC_parse,   /* U+ (bad hex is checked below
12153                                                    for a  better error msg) */
12154                                   (STRLEN) (RExC_end - RExC_parse),
12155                                  "U+")))
12156     {
12157         RExC_parse = endbrace;  /* position msg's '<--HERE' */
12158         vFAIL("\\N{NAME} must be resolved by the lexer");
12159     }
12160
12161     REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
12162                                         semantics */
12163
12164     if (endbrace == RExC_parse) {   /* empty: \N{} */
12165         if (strict) {
12166             RExC_parse++;   /* Position after the "}" */
12167             vFAIL("Zero length \\N{}");
12168         }
12169         if (cp_count) {
12170             *cp_count = 0;
12171         }
12172         nextchar(pRExC_state);
12173         if (! node_p) {
12174             return FALSE;
12175         }
12176
12177         *node_p = reg_node(pRExC_state,NOTHING);
12178         return TRUE;
12179     }
12180
12181     RExC_parse += 2;    /* Skip past the 'U+' */
12182
12183     /* Because toke.c has generated a special construct for us guaranteed not
12184      * to have NULs, we can use a str function */
12185     endchar = RExC_parse + strcspn(RExC_parse, ".}");
12186
12187     /* Code points are separated by dots.  If none, there is only one code
12188      * point, and is terminated by the brace */
12189
12190     if (endchar >= endbrace) {
12191         STRLEN length_of_hex;
12192         I32 grok_hex_flags;
12193
12194         /* Here, exactly one code point.  If that isn't what is wanted, fail */
12195         if (! code_point_p) {
12196             RExC_parse = p;
12197             return FALSE;
12198         }
12199
12200         /* Convert code point from hex */
12201         length_of_hex = (STRLEN)(endchar - RExC_parse);
12202         grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
12203                        | PERL_SCAN_DISALLOW_PREFIX
12204
12205                            /* No errors in the first pass (See [perl
12206                             * #122671].)  We let the code below find the
12207                             * errors when there are multiple chars. */
12208                        | ((SIZE_ONLY)
12209                           ? PERL_SCAN_SILENT_ILLDIGIT
12210                           : 0);
12211
12212         /* This routine is the one place where both single- and double-quotish
12213          * \N{U+xxxx} are evaluated.  The value is a Unicode code point which
12214          * must be converted to native. */
12215         *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
12216                                                &length_of_hex,
12217                                                &grok_hex_flags,
12218                                                NULL));
12219
12220         /* The tokenizer should have guaranteed validity, but it's possible to
12221          * bypass it by using single quoting, so check.  Don't do the check
12222          * here when there are multiple chars; we do it below anyway. */
12223         if (length_of_hex == 0
12224             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
12225         {
12226             RExC_parse += length_of_hex;        /* Includes all the valid */
12227             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
12228                             ? UTF8SKIP(RExC_parse)
12229                             : 1;
12230             /* Guard against malformed utf8 */
12231             if (RExC_parse >= endchar) {
12232                 RExC_parse = endchar;
12233             }
12234             vFAIL("Invalid hexadecimal number in \\N{U+...}");
12235         }
12236
12237         RExC_parse = endbrace + 1;
12238         return TRUE;
12239     }
12240     else {  /* Is a multiple character sequence */
12241         SV * substitute_parse;
12242         STRLEN len;
12243         char *orig_end = RExC_end;
12244         char *save_start = RExC_start;
12245         I32 flags;
12246
12247         /* Count the code points, if desired, in the sequence */
12248         if (cp_count) {
12249             *cp_count = 0;
12250             while (RExC_parse < endbrace) {
12251                 /* Point to the beginning of the next character in the sequence. */
12252                 RExC_parse = endchar + 1;
12253                 endchar = RExC_parse + strcspn(RExC_parse, ".}");
12254                 (*cp_count)++;
12255             }
12256         }
12257
12258         /* Fail if caller doesn't want to handle a multi-code-point sequence.
12259          * But don't backup up the pointer if the caller wants to know how many
12260          * code points there are (they can then handle things) */
12261         if (! node_p) {
12262             if (! cp_count) {
12263                 RExC_parse = p;
12264             }
12265             return FALSE;
12266         }
12267
12268         /* What is done here is to convert this to a sub-pattern of the form
12269          * \x{char1}\x{char2}...  and then call reg recursively to parse it
12270          * (enclosing in "(?: ... )" ).  That way, it retains its atomicness,
12271          * while not having to worry about special handling that some code
12272          * points may have. */
12273
12274         substitute_parse = newSVpvs("?:");
12275
12276         while (RExC_parse < endbrace) {
12277
12278             /* Convert to notation the rest of the code understands */
12279             sv_catpv(substitute_parse, "\\x{");
12280             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
12281             sv_catpv(substitute_parse, "}");
12282
12283             /* Point to the beginning of the next character in the sequence. */
12284             RExC_parse = endchar + 1;
12285             endchar = RExC_parse + strcspn(RExC_parse, ".}");
12286
12287         }
12288         sv_catpv(substitute_parse, ")");
12289
12290         len = SvCUR(substitute_parse);
12291
12292         /* Don't allow empty number */
12293         if (len < (STRLEN) 8) {
12294             RExC_parse = endbrace;
12295             vFAIL("Invalid hexadecimal number in \\N{U+...}");
12296         }
12297
12298         RExC_parse = RExC_start = RExC_adjusted_start
12299                                               = SvPV_nolen(substitute_parse);
12300         RExC_end = RExC_parse + len;
12301
12302         /* The values are Unicode, and therefore not subject to recoding, but
12303          * have to be converted to native on a non-Unicode (meaning non-ASCII)
12304          * platform. */
12305 #ifdef EBCDIC
12306         RExC_recode_x_to_native = 1;
12307 #endif
12308
12309         *node_p = reg(pRExC_state, 1, &flags, depth+1);
12310
12311         /* Restore the saved values */
12312         RExC_start = RExC_adjusted_start = save_start;
12313         RExC_parse = endbrace;
12314         RExC_end = orig_end;
12315 #ifdef EBCDIC
12316         RExC_recode_x_to_native = 0;
12317 #endif
12318         SvREFCNT_dec_NN(substitute_parse);
12319
12320         if (! *node_p) {
12321             if (flags & (RESTART_PASS1|NEED_UTF8)) {
12322                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
12323                 return FALSE;
12324             }
12325             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#" UVxf,
12326                 (UV) flags);
12327         }
12328         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12329
12330         nextchar(pRExC_state);
12331
12332         return TRUE;
12333     }
12334 }
12335
12336
12337 PERL_STATIC_INLINE U8
12338 S_compute_EXACTish(RExC_state_t *pRExC_state)
12339 {
12340     U8 op;
12341
12342     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
12343
12344     if (! FOLD) {
12345         return (LOC)
12346                 ? EXACTL
12347                 : EXACT;
12348     }
12349
12350     op = get_regex_charset(RExC_flags);
12351     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
12352         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
12353                  been, so there is no hole */
12354     }
12355
12356     return op + EXACTF;
12357 }
12358
12359 PERL_STATIC_INLINE void
12360 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
12361                          regnode *node, I32* flagp, STRLEN len, UV code_point,
12362                          bool downgradable)
12363 {
12364     /* This knows the details about sizing an EXACTish node, setting flags for
12365      * it (by setting <*flagp>, and potentially populating it with a single
12366      * character.
12367      *
12368      * If <len> (the length in bytes) is non-zero, this function assumes that
12369      * the node has already been populated, and just does the sizing.  In this
12370      * case <code_point> should be the final code point that has already been
12371      * placed into the node.  This value will be ignored except that under some
12372      * circumstances <*flagp> is set based on it.
12373      *
12374      * If <len> is zero, the function assumes that the node is to contain only
12375      * the single character given by <code_point> and calculates what <len>
12376      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
12377      * additionally will populate the node's STRING with <code_point> or its
12378      * fold if folding.
12379      *
12380      * In both cases <*flagp> is appropriately set
12381      *
12382      * It knows that under FOLD, the Latin Sharp S and UTF characters above
12383      * 255, must be folded (the former only when the rules indicate it can
12384      * match 'ss')
12385      *
12386      * When it does the populating, it looks at the flag 'downgradable'.  If
12387      * true with a node that folds, it checks if the single code point
12388      * participates in a fold, and if not downgrades the node to an EXACT.
12389      * This helps the optimizer */
12390
12391     bool len_passed_in = cBOOL(len != 0);
12392     U8 character[UTF8_MAXBYTES_CASE+1];
12393
12394     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
12395
12396     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
12397      * sizing difference, and is extra work that is thrown away */
12398     if (downgradable && ! PASS2) {
12399         downgradable = FALSE;
12400     }
12401
12402     if (! len_passed_in) {
12403         if (UTF) {
12404             if (UVCHR_IS_INVARIANT(code_point)) {
12405                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
12406                     *character = (U8) code_point;
12407                 }
12408                 else { /* Here is /i and not /l. (toFOLD() is defined on just
12409                           ASCII, which isn't the same thing as INVARIANT on
12410                           EBCDIC, but it works there, as the extra invariants
12411                           fold to themselves) */
12412                     *character = toFOLD((U8) code_point);
12413
12414                     /* We can downgrade to an EXACT node if this character
12415                      * isn't a folding one.  Note that this assumes that
12416                      * nothing above Latin1 folds to some other invariant than
12417                      * one of these alphabetics; otherwise we would also have
12418                      * to check:
12419                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12420                      *      || ASCII_FOLD_RESTRICTED))
12421                      */
12422                     if (downgradable && PL_fold[code_point] == code_point) {
12423                         OP(node) = EXACT;
12424                     }
12425                 }
12426                 len = 1;
12427             }
12428             else if (FOLD && (! LOC
12429                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
12430             {   /* Folding, and ok to do so now */
12431                 UV folded = _to_uni_fold_flags(
12432                                    code_point,
12433                                    character,
12434                                    &len,
12435                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12436                                                       ? FOLD_FLAGS_NOMIX_ASCII
12437                                                       : 0));
12438                 if (downgradable
12439                     && folded == code_point /* This quickly rules out many
12440                                                cases, avoiding the
12441                                                _invlist_contains_cp() overhead
12442                                                for those.  */
12443                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
12444                 {
12445                     OP(node) = (LOC)
12446                                ? EXACTL
12447                                : EXACT;
12448                 }
12449             }
12450             else if (code_point <= MAX_UTF8_TWO_BYTE) {
12451
12452                 /* Not folding this cp, and can output it directly */
12453                 *character = UTF8_TWO_BYTE_HI(code_point);
12454                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
12455                 len = 2;
12456             }
12457             else {
12458                 uvchr_to_utf8( character, code_point);
12459                 len = UTF8SKIP(character);
12460             }
12461         } /* Else pattern isn't UTF8.  */
12462         else if (! FOLD) {
12463             *character = (U8) code_point;
12464             len = 1;
12465         } /* Else is folded non-UTF8 */
12466 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
12467    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
12468                                       || UNICODE_DOT_DOT_VERSION > 0)
12469         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
12470 #else
12471         else if (1) {
12472 #endif
12473             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
12474              * comments at join_exact()); */
12475             *character = (U8) code_point;
12476             len = 1;
12477
12478             /* Can turn into an EXACT node if we know the fold at compile time,
12479              * and it folds to itself and doesn't particpate in other folds */
12480             if (downgradable
12481                 && ! LOC
12482                 && PL_fold_latin1[code_point] == code_point
12483                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12484                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
12485             {
12486                 OP(node) = EXACT;
12487             }
12488         } /* else is Sharp s.  May need to fold it */
12489         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
12490             *character = 's';
12491             *(character + 1) = 's';
12492             len = 2;
12493         }
12494         else {
12495             *character = LATIN_SMALL_LETTER_SHARP_S;
12496             len = 1;
12497         }
12498     }
12499
12500     if (SIZE_ONLY) {
12501         RExC_size += STR_SZ(len);
12502     }
12503     else {
12504         RExC_emit += STR_SZ(len);
12505         STR_LEN(node) = len;
12506         if (! len_passed_in) {
12507             Copy((char *) character, STRING(node), len, char);
12508         }
12509     }
12510
12511     *flagp |= HASWIDTH;
12512
12513     /* A single character node is SIMPLE, except for the special-cased SHARP S
12514      * under /di. */
12515     if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
12516 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
12517    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
12518                                       || UNICODE_DOT_DOT_VERSION > 0)
12519         && ( code_point != LATIN_SMALL_LETTER_SHARP_S
12520             || ! FOLD || ! DEPENDS_SEMANTICS)
12521 #endif
12522     ) {
12523         *flagp |= SIMPLE;
12524     }
12525
12526     /* The OP may not be well defined in PASS1 */
12527     if (PASS2 && OP(node) == EXACTFL) {
12528         RExC_contains_locale = 1;
12529     }
12530 }
12531
12532 STATIC bool
12533 S_new_regcurly(const char *s, const char *e)
12534 {
12535     /* This is a temporary function designed to match the most lenient form of
12536      * a {m,n} quantifier we ever envision, with either number omitted, and
12537      * spaces anywhere between/before/after them.
12538      *
12539      * If this function fails, then the string it matches is very unlikely to
12540      * ever be considered a valid quantifier, so we can allow the '{' that
12541      * begins it to be considered as a literal */
12542
12543     bool has_min = FALSE;
12544     bool has_max = FALSE;
12545
12546     PERL_ARGS_ASSERT_NEW_REGCURLY;
12547
12548     if (s >= e || *s++ != '{')
12549         return FALSE;
12550
12551     while (s < e && isSPACE(*s)) {
12552         s++;
12553     }
12554     while (s < e && isDIGIT(*s)) {
12555         has_min = TRUE;
12556         s++;
12557     }
12558     while (s < e && isSPACE(*s)) {
12559         s++;
12560     }
12561
12562     if (*s == ',') {
12563         s++;
12564         while (s < e && isSPACE(*s)) {
12565             s++;
12566         }
12567         while (s < e && isDIGIT(*s)) {
12568             has_max = TRUE;
12569             s++;
12570         }
12571         while (s < e && isSPACE(*s)) {
12572             s++;
12573         }
12574     }
12575
12576     return s < e && *s == '}' && (has_min || has_max);
12577 }
12578
12579 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
12580  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
12581
12582 static I32
12583 S_backref_value(char *p)
12584 {
12585     const char* endptr;
12586     UV val;
12587     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
12588         return (I32)val;
12589     return I32_MAX;
12590 }
12591
12592
12593 /*
12594  - regatom - the lowest level
12595
12596    Try to identify anything special at the start of the current parse position.
12597    If there is, then handle it as required. This may involve generating a
12598    single regop, such as for an assertion; or it may involve recursing, such as
12599    to handle a () structure.
12600
12601    If the string doesn't start with something special then we gobble up
12602    as much literal text as we can.  If we encounter a quantifier, we have to
12603    back off the final literal character, as that quantifier applies to just it
12604    and not to the whole string of literals.
12605
12606    Once we have been able to handle whatever type of thing started the
12607    sequence, we return.
12608
12609    Note: we have to be careful with escapes, as they can be both literal
12610    and special, and in the case of \10 and friends, context determines which.
12611
12612    A summary of the code structure is:
12613
12614    switch (first_byte) {
12615         cases for each special:
12616             handle this special;
12617             break;
12618         case '\\':
12619             switch (2nd byte) {
12620                 cases for each unambiguous special:
12621                     handle this special;
12622                     break;
12623                 cases for each ambigous special/literal:
12624                     disambiguate;
12625                     if (special)  handle here
12626                     else goto defchar;
12627                 default: // unambiguously literal:
12628                     goto defchar;
12629             }
12630         default:  // is a literal char
12631             // FALL THROUGH
12632         defchar:
12633             create EXACTish node for literal;
12634             while (more input and node isn't full) {
12635                 switch (input_byte) {
12636                    cases for each special;
12637                        make sure parse pointer is set so that the next call to
12638                            regatom will see this special first
12639                        goto loopdone; // EXACTish node terminated by prev. char
12640                    default:
12641                        append char to EXACTISH node;
12642                 }
12643                 get next input byte;
12644             }
12645         loopdone:
12646    }
12647    return the generated node;
12648
12649    Specifically there are two separate switches for handling
12650    escape sequences, with the one for handling literal escapes requiring
12651    a dummy entry for all of the special escapes that are actually handled
12652    by the other.
12653
12654    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
12655    TRYAGAIN.
12656    Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
12657    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
12658    Otherwise does not return NULL.
12659 */
12660
12661 STATIC regnode *
12662 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12663 {
12664     regnode *ret = NULL;
12665     I32 flags = 0;
12666     char *parse_start;
12667     U8 op;
12668     int invert = 0;
12669     U8 arg;
12670
12671     GET_RE_DEBUG_FLAGS_DECL;
12672
12673     *flagp = WORST;             /* Tentatively. */
12674
12675     DEBUG_PARSE("atom");
12676
12677     PERL_ARGS_ASSERT_REGATOM;
12678
12679   tryagain:
12680     parse_start = RExC_parse;
12681     assert(RExC_parse < RExC_end);
12682     switch ((U8)*RExC_parse) {
12683     case '^':
12684         RExC_seen_zerolen++;
12685         nextchar(pRExC_state);
12686         if (RExC_flags & RXf_PMf_MULTILINE)
12687             ret = reg_node(pRExC_state, MBOL);
12688         else
12689             ret = reg_node(pRExC_state, SBOL);
12690         Set_Node_Length(ret, 1); /* MJD */
12691         break;
12692     case '$':
12693         nextchar(pRExC_state);
12694         if (*RExC_parse)
12695             RExC_seen_zerolen++;
12696         if (RExC_flags & RXf_PMf_MULTILINE)
12697             ret = reg_node(pRExC_state, MEOL);
12698         else
12699             ret = reg_node(pRExC_state, SEOL);
12700         Set_Node_Length(ret, 1); /* MJD */
12701         break;
12702     case '.':
12703         nextchar(pRExC_state);
12704         if (RExC_flags & RXf_PMf_SINGLELINE)
12705             ret = reg_node(pRExC_state, SANY);
12706         else
12707             ret = reg_node(pRExC_state, REG_ANY);
12708         *flagp |= HASWIDTH|SIMPLE;
12709         MARK_NAUGHTY(1);
12710         Set_Node_Length(ret, 1); /* MJD */
12711         break;
12712     case '[':
12713     {
12714         char * const oregcomp_parse = ++RExC_parse;
12715         ret = regclass(pRExC_state, flagp,depth+1,
12716                        FALSE, /* means parse the whole char class */
12717                        TRUE, /* allow multi-char folds */
12718                        FALSE, /* don't silence non-portable warnings. */
12719                        (bool) RExC_strict,
12720                        TRUE, /* Allow an optimized regnode result */
12721                        NULL,
12722                        NULL);
12723         if (ret == NULL) {
12724             if (*flagp & (RESTART_PASS1|NEED_UTF8))
12725                 return NULL;
12726             FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf,
12727                   (UV) *flagp);
12728         }
12729         if (*RExC_parse != ']') {
12730             RExC_parse = oregcomp_parse;
12731             vFAIL("Unmatched [");
12732         }
12733         nextchar(pRExC_state);
12734         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
12735         break;
12736     }
12737     case '(':
12738         nextchar(pRExC_state);
12739         ret = reg(pRExC_state, 2, &flags,depth+1);
12740         if (ret == NULL) {
12741                 if (flags & TRYAGAIN) {
12742                     if (RExC_parse >= RExC_end) {
12743                          /* Make parent create an empty node if needed. */
12744                         *flagp |= TRYAGAIN;
12745                         return(NULL);
12746                     }
12747                     goto tryagain;
12748                 }
12749                 if (flags & (RESTART_PASS1|NEED_UTF8)) {
12750                     *flagp = flags & (RESTART_PASS1|NEED_UTF8);
12751                     return NULL;
12752                 }
12753                 FAIL2("panic: reg returned NULL to regatom, flags=%#" UVxf,
12754                                                                  (UV) flags);
12755         }
12756         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12757         break;
12758     case '|':
12759     case ')':
12760         if (flags & TRYAGAIN) {
12761             *flagp |= TRYAGAIN;
12762             return NULL;
12763         }
12764         vFAIL("Internal urp");
12765                                 /* Supposed to be caught earlier. */
12766         break;
12767     case '?':
12768     case '+':
12769     case '*':
12770         RExC_parse++;
12771         vFAIL("Quantifier follows nothing");
12772         break;
12773     case '\\':
12774         /* Special Escapes
12775
12776            This switch handles escape sequences that resolve to some kind
12777            of special regop and not to literal text. Escape sequnces that
12778            resolve to literal text are handled below in the switch marked
12779            "Literal Escapes".
12780
12781            Every entry in this switch *must* have a corresponding entry
12782            in the literal escape switch. However, the opposite is not
12783            required, as the default for this switch is to jump to the
12784            literal text handling code.
12785         */
12786         RExC_parse++;
12787         switch ((U8)*RExC_parse) {
12788         /* Special Escapes */
12789         case 'A':
12790             RExC_seen_zerolen++;
12791             ret = reg_node(pRExC_state, SBOL);
12792             /* SBOL is shared with /^/ so we set the flags so we can tell
12793              * /\A/ from /^/ in split. We check ret because first pass we
12794              * have no regop struct to set the flags on. */
12795             if (PASS2)
12796                 ret->flags = 1;
12797             *flagp |= SIMPLE;
12798             goto finish_meta_pat;
12799         case 'G':
12800             ret = reg_node(pRExC_state, GPOS);
12801             RExC_seen |= REG_GPOS_SEEN;
12802             *flagp |= SIMPLE;
12803             goto finish_meta_pat;
12804         case 'K':
12805             RExC_seen_zerolen++;
12806             ret = reg_node(pRExC_state, KEEPS);
12807             *flagp |= SIMPLE;
12808             /* XXX:dmq : disabling in-place substitution seems to
12809              * be necessary here to avoid cases of memory corruption, as
12810              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
12811              */
12812             RExC_seen |= REG_LOOKBEHIND_SEEN;
12813             goto finish_meta_pat;
12814         case 'Z':
12815             ret = reg_node(pRExC_state, SEOL);
12816             *flagp |= SIMPLE;
12817             RExC_seen_zerolen++;                /* Do not optimize RE away */
12818             goto finish_meta_pat;
12819         case 'z':
12820             ret = reg_node(pRExC_state, EOS);
12821             *flagp |= SIMPLE;
12822             RExC_seen_zerolen++;                /* Do not optimize RE away */
12823             goto finish_meta_pat;
12824         case 'C':
12825             vFAIL("\\C no longer supported");
12826         case 'X':
12827             ret = reg_node(pRExC_state, CLUMP);
12828             *flagp |= HASWIDTH;
12829             goto finish_meta_pat;
12830
12831         case 'W':
12832             invert = 1;
12833             /* FALLTHROUGH */
12834         case 'w':
12835             arg = ANYOF_WORDCHAR;
12836             goto join_posix;
12837
12838         case 'B':
12839             invert = 1;
12840             /* FALLTHROUGH */
12841         case 'b':
12842           {
12843             regex_charset charset = get_regex_charset(RExC_flags);
12844
12845             RExC_seen_zerolen++;
12846             RExC_seen |= REG_LOOKBEHIND_SEEN;
12847             op = BOUND + charset;
12848
12849             if (op == BOUNDL) {
12850                 RExC_contains_locale = 1;
12851             }
12852
12853             ret = reg_node(pRExC_state, op);
12854             *flagp |= SIMPLE;
12855             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
12856                 FLAGS(ret) = TRADITIONAL_BOUND;
12857                 if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
12858                     OP(ret) = BOUNDA;
12859                 }
12860             }
12861             else {
12862                 STRLEN length;
12863                 char name = *RExC_parse;
12864                 char * endbrace = NULL;
12865                 RExC_parse += 2;
12866                 if (RExC_parse < RExC_end) {
12867                     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12868                 }
12869
12870                 if (! endbrace) {
12871                     vFAIL2("Missing right brace on \\%c{}", name);
12872                 }
12873                 /* XXX Need to decide whether to take spaces or not.  Should be
12874                  * consistent with \p{}, but that currently is SPACE, which
12875                  * means vertical too, which seems wrong
12876                  * while (isBLANK(*RExC_parse)) {
12877                     RExC_parse++;
12878                 }*/
12879                 if (endbrace == RExC_parse) {
12880                     RExC_parse++;  /* After the '}' */
12881                     vFAIL2("Empty \\%c{}", name);
12882                 }
12883                 length = endbrace - RExC_parse;
12884                 /*while (isBLANK(*(RExC_parse + length - 1))) {
12885                     length--;
12886                 }*/
12887                 switch (*RExC_parse) {
12888                     case 'g':
12889                         if (    length != 1
12890                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
12891                         {
12892                             goto bad_bound_type;
12893                         }
12894                         FLAGS(ret) = GCB_BOUND;
12895                         break;
12896                     case 'l':
12897                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12898                             goto bad_bound_type;
12899                         }
12900                         FLAGS(ret) = LB_BOUND;
12901                         break;
12902                     case 's':
12903                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12904                             goto bad_bound_type;
12905                         }
12906                         FLAGS(ret) = SB_BOUND;
12907                         break;
12908                     case 'w':
12909                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12910                             goto bad_bound_type;
12911                         }
12912                         FLAGS(ret) = WB_BOUND;
12913                         break;
12914                     default:
12915                       bad_bound_type:
12916                         RExC_parse = endbrace;
12917                         vFAIL2utf8f(
12918                             "'%" UTF8f "' is an unknown bound type",
12919                             UTF8fARG(UTF, length, endbrace - length));
12920                         NOT_REACHED; /*NOTREACHED*/
12921                 }
12922                 RExC_parse = endbrace;
12923                 REQUIRE_UNI_RULES(flagp, NULL);
12924
12925                 if (PASS2 && op >= BOUNDA) {  /* /aa is same as /a */
12926                     OP(ret) = BOUNDU;
12927                     length += 4;
12928
12929                     /* Don't have to worry about UTF-8, in this message because
12930                      * to get here the contents of the \b must be ASCII */
12931                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
12932                               "Using /u for '%.*s' instead of /%s",
12933                               (unsigned) length,
12934                               endbrace - length + 1,
12935                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
12936                               ? ASCII_RESTRICT_PAT_MODS
12937                               : ASCII_MORE_RESTRICT_PAT_MODS);
12938                 }
12939             }
12940
12941             if (PASS2 && invert) {
12942                 OP(ret) += NBOUND - BOUND;
12943             }
12944             goto finish_meta_pat;
12945           }
12946
12947         case 'D':
12948             invert = 1;
12949             /* FALLTHROUGH */
12950         case 'd':
12951             arg = ANYOF_DIGIT;
12952             if (! DEPENDS_SEMANTICS) {
12953                 goto join_posix;
12954             }
12955
12956             /* \d doesn't have any matches in the upper Latin1 range, hence /d
12957              * is equivalent to /u.  Changing to /u saves some branches at
12958              * runtime */
12959             op = POSIXU;
12960             goto join_posix_op_known;
12961
12962         case 'R':
12963             ret = reg_node(pRExC_state, LNBREAK);
12964             *flagp |= HASWIDTH|SIMPLE;
12965             goto finish_meta_pat;
12966
12967         case 'H':
12968             invert = 1;
12969             /* FALLTHROUGH */
12970         case 'h':
12971             arg = ANYOF_BLANK;
12972             op = POSIXU;
12973             goto join_posix_op_known;
12974
12975         case 'V':
12976             invert = 1;
12977             /* FALLTHROUGH */
12978         case 'v':
12979             arg = ANYOF_VERTWS;
12980             op = POSIXU;
12981             goto join_posix_op_known;
12982
12983         case 'S':
12984             invert = 1;
12985             /* FALLTHROUGH */
12986         case 's':
12987             arg = ANYOF_SPACE;
12988
12989           join_posix:
12990
12991             op = POSIXD + get_regex_charset(RExC_flags);
12992             if (op > POSIXA) {  /* /aa is same as /a */
12993                 op = POSIXA;
12994             }
12995             else if (op == POSIXL) {
12996                 RExC_contains_locale = 1;
12997             }
12998
12999           join_posix_op_known:
13000
13001             if (invert) {
13002                 op += NPOSIXD - POSIXD;
13003             }
13004
13005             ret = reg_node(pRExC_state, op);
13006             if (! SIZE_ONLY) {
13007                 FLAGS(ret) = namedclass_to_classnum(arg);
13008             }
13009
13010             *flagp |= HASWIDTH|SIMPLE;
13011             /* FALLTHROUGH */
13012
13013           finish_meta_pat:
13014             if (   UCHARAT(RExC_parse + 1) == '{'
13015                 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13016             {
13017                 RExC_parse += 2;
13018                 vFAIL("Unescaped left brace in regex is illegal here");
13019             }
13020             nextchar(pRExC_state);
13021             Set_Node_Length(ret, 2); /* MJD */
13022             break;
13023         case 'p':
13024         case 'P':
13025             RExC_parse--;
13026
13027             ret = regclass(pRExC_state, flagp,depth+1,
13028                            TRUE, /* means just parse this element */
13029                            FALSE, /* don't allow multi-char folds */
13030                            FALSE, /* don't silence non-portable warnings.  It
13031                                      would be a bug if these returned
13032                                      non-portables */
13033                            (bool) RExC_strict,
13034                            TRUE, /* Allow an optimized regnode result */
13035                            NULL,
13036                            NULL);
13037             if (*flagp & RESTART_PASS1)
13038                 return NULL;
13039             /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
13040              * multi-char folds are allowed.  */
13041             if (!ret)
13042                 FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf,
13043                       (UV) *flagp);
13044
13045             RExC_parse--;
13046
13047             Set_Node_Offset(ret, parse_start);
13048             Set_Node_Cur_Length(ret, parse_start - 2);
13049             nextchar(pRExC_state);
13050             break;
13051         case 'N':
13052             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13053              * \N{...} evaluates to a sequence of more than one code points).
13054              * The function call below returns a regnode, which is our result.
13055              * The parameters cause it to fail if the \N{} evaluates to a
13056              * single code point; we handle those like any other literal.  The
13057              * reason that the multicharacter case is handled here and not as
13058              * part of the EXACtish code is because of quantifiers.  In
13059              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13060              * this way makes that Just Happen. dmq.
13061              * join_exact() will join this up with adjacent EXACTish nodes
13062              * later on, if appropriate. */
13063             ++RExC_parse;
13064             if (grok_bslash_N(pRExC_state,
13065                               &ret,     /* Want a regnode returned */
13066                               NULL,     /* Fail if evaluates to a single code
13067                                            point */
13068                               NULL,     /* Don't need a count of how many code
13069                                            points */
13070                               flagp,
13071                               RExC_strict,
13072                               depth)
13073             ) {
13074                 break;
13075             }
13076
13077             if (*flagp & RESTART_PASS1)
13078                 return NULL;
13079
13080             /* Here, evaluates to a single code point.  Go get that */
13081             RExC_parse = parse_start;
13082             goto defchar;
13083
13084         case 'k':    /* Handle \k<NAME> and \k'NAME' */
13085       parse_named_seq:
13086         {
13087             char ch;
13088             if (   RExC_parse >= RExC_end - 1
13089                 || ((   ch = RExC_parse[1]) != '<'
13090                                       && ch != '\''
13091                                       && ch != '{'))
13092             {
13093                 RExC_parse++;
13094                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13095                 vFAIL2("Sequence %.2s... not terminated",parse_start);
13096             } else {
13097                 RExC_parse += 2;
13098                 ret = handle_named_backref(pRExC_state,
13099                                            flagp,
13100                                            parse_start,
13101                                            (ch == '<')
13102                                            ? '>'
13103                                            : (ch == '{')
13104                                              ? '}'
13105                                              : '\'');
13106             }
13107             break;
13108         }
13109         case 'g':
13110         case '1': case '2': case '3': case '4':
13111         case '5': case '6': case '7': case '8': case '9':
13112             {
13113                 I32 num;
13114                 bool hasbrace = 0;
13115
13116                 if (*RExC_parse == 'g') {
13117                     bool isrel = 0;
13118
13119                     RExC_parse++;
13120                     if (*RExC_parse == '{') {
13121                         RExC_parse++;
13122                         hasbrace = 1;
13123                     }
13124                     if (*RExC_parse == '-') {
13125                         RExC_parse++;
13126                         isrel = 1;
13127                     }
13128                     if (hasbrace && !isDIGIT(*RExC_parse)) {
13129                         if (isrel) RExC_parse--;
13130                         RExC_parse -= 2;
13131                         goto parse_named_seq;
13132                     }
13133
13134                     if (RExC_parse >= RExC_end) {
13135                         goto unterminated_g;
13136                     }
13137                     num = S_backref_value(RExC_parse);
13138                     if (num == 0)
13139                         vFAIL("Reference to invalid group 0");
13140                     else if (num == I32_MAX) {
13141                          if (isDIGIT(*RExC_parse))
13142                             vFAIL("Reference to nonexistent group");
13143                         else
13144                           unterminated_g:
13145                             vFAIL("Unterminated \\g... pattern");
13146                     }
13147
13148                     if (isrel) {
13149                         num = RExC_npar - num;
13150                         if (num < 1)
13151                             vFAIL("Reference to nonexistent or unclosed group");
13152                     }
13153                 }
13154                 else {
13155                     num = S_backref_value(RExC_parse);
13156                     /* bare \NNN might be backref or octal - if it is larger
13157                      * than or equal RExC_npar then it is assumed to be an
13158                      * octal escape. Note RExC_npar is +1 from the actual
13159                      * number of parens. */
13160                     /* Note we do NOT check if num == I32_MAX here, as that is
13161                      * handled by the RExC_npar check */
13162
13163                     if (
13164                         /* any numeric escape < 10 is always a backref */
13165                         num > 9
13166                         /* any numeric escape < RExC_npar is a backref */
13167                         && num >= RExC_npar
13168                         /* cannot be an octal escape if it starts with 8 */
13169                         && *RExC_parse != '8'
13170                         /* cannot be an octal escape it it starts with 9 */
13171                         && *RExC_parse != '9'
13172                     )
13173                     {
13174                         /* Probably not a backref, instead likely to be an
13175                          * octal character escape, e.g. \35 or \777.
13176                          * The above logic should make it obvious why using
13177                          * octal escapes in patterns is problematic. - Yves */
13178                         RExC_parse = parse_start;
13179                         goto defchar;
13180                     }
13181                 }
13182
13183                 /* At this point RExC_parse points at a numeric escape like
13184                  * \12 or \88 or something similar, which we should NOT treat
13185                  * as an octal escape. It may or may not be a valid backref
13186                  * escape. For instance \88888888 is unlikely to be a valid
13187                  * backref. */
13188                 while (isDIGIT(*RExC_parse))
13189                     RExC_parse++;
13190                 if (hasbrace) {
13191                     if (*RExC_parse != '}')
13192                         vFAIL("Unterminated \\g{...} pattern");
13193                     RExC_parse++;
13194                 }
13195                 if (!SIZE_ONLY) {
13196                     if (num > (I32)RExC_rx->nparens)
13197                         vFAIL("Reference to nonexistent group");
13198                 }
13199                 RExC_sawback = 1;
13200                 ret = reganode(pRExC_state,
13201                                ((! FOLD)
13202                                  ? REF
13203                                  : (ASCII_FOLD_RESTRICTED)
13204                                    ? REFFA
13205                                    : (AT_LEAST_UNI_SEMANTICS)
13206                                      ? REFFU
13207                                      : (LOC)
13208                                        ? REFFL
13209                                        : REFF),
13210                                 num);
13211                 *flagp |= HASWIDTH;
13212
13213                 /* override incorrect value set in reganode MJD */
13214                 Set_Node_Offset(ret, parse_start);
13215                 Set_Node_Cur_Length(ret, parse_start-1);
13216                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13217                                         FALSE /* Don't force to /x */ );
13218             }
13219             break;
13220         case '\0':
13221             if (RExC_parse >= RExC_end)
13222                 FAIL("Trailing \\");
13223             /* FALLTHROUGH */
13224         default:
13225             /* Do not generate "unrecognized" warnings here, we fall
13226                back into the quick-grab loop below */
13227             RExC_parse = parse_start;
13228             goto defchar;
13229         } /* end of switch on a \foo sequence */
13230         break;
13231
13232     case '#':
13233
13234         /* '#' comments should have been spaced over before this function was
13235          * called */
13236         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13237         /*
13238         if (RExC_flags & RXf_PMf_EXTENDED) {
13239             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13240             if (RExC_parse < RExC_end)
13241                 goto tryagain;
13242         }
13243         */
13244
13245         /* FALLTHROUGH */
13246
13247     default:
13248           defchar: {
13249
13250             /* Here, we have determined that the next thing is probably a
13251              * literal character.  RExC_parse points to the first byte of its
13252              * definition.  (It still may be an escape sequence that evaluates
13253              * to a single character) */
13254
13255             STRLEN len = 0;
13256             UV ender = 0;
13257             char *p;
13258             char *s;
13259 #define MAX_NODE_STRING_SIZE 127
13260             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
13261             char *s0;
13262             U8 upper_parse = MAX_NODE_STRING_SIZE;
13263             U8 node_type = compute_EXACTish(pRExC_state);
13264             bool next_is_quantifier;
13265             char * oldp = NULL;
13266
13267             /* We can convert EXACTF nodes to EXACTFU if they contain only
13268              * characters that match identically regardless of the target
13269              * string's UTF8ness.  The reason to do this is that EXACTF is not
13270              * trie-able, EXACTFU is.
13271              *
13272              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13273              * contain only above-Latin1 characters (hence must be in UTF8),
13274              * which don't participate in folds with Latin1-range characters,
13275              * as the latter's folds aren't known until runtime.  (We don't
13276              * need to figure this out until pass 2) */
13277             bool maybe_exactfu = PASS2
13278                                && (node_type == EXACTF || node_type == EXACTFL);
13279
13280             /* If a folding node contains only code points that don't
13281              * participate in folds, it can be changed into an EXACT node,
13282              * which allows the optimizer more things to look for */
13283             bool maybe_exact;
13284
13285             ret = reg_node(pRExC_state, node_type);
13286
13287             /* In pass1, folded, we use a temporary buffer instead of the
13288              * actual node, as the node doesn't exist yet */
13289             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
13290
13291             s0 = s;
13292
13293           reparse:
13294
13295             /* We look for the EXACTFish to EXACT node optimizaton only if
13296              * folding.  (And we don't need to figure this out until pass 2).
13297              * XXX It might actually make sense to split the node into portions
13298              * that are exact and ones that aren't, so that we could later use
13299              * the exact ones to find the longest fixed and floating strings.
13300              * One would want to join them back into a larger node.  One could
13301              * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
13302             maybe_exact = FOLD && PASS2;
13303
13304             /* XXX The node can hold up to 255 bytes, yet this only goes to
13305              * 127.  I (khw) do not know why.  Keeping it somewhat less than
13306              * 255 allows us to not have to worry about overflow due to
13307              * converting to utf8 and fold expansion, but that value is
13308              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
13309              * split up by this limit into a single one using the real max of
13310              * 255.  Even at 127, this breaks under rare circumstances.  If
13311              * folding, we do not want to split a node at a character that is a
13312              * non-final in a multi-char fold, as an input string could just
13313              * happen to want to match across the node boundary.  The join
13314              * would solve that problem if the join actually happens.  But a
13315              * series of more than two nodes in a row each of 127 would cause
13316              * the first join to succeed to get to 254, but then there wouldn't
13317              * be room for the next one, which could at be one of those split
13318              * multi-char folds.  I don't know of any fool-proof solution.  One
13319              * could back off to end with only a code point that isn't such a
13320              * non-final, but it is possible for there not to be any in the
13321              * entire node. */
13322
13323             assert(   ! UTF     /* Is at the beginning of a character */
13324                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13325                    || UTF8_IS_START(UCHARAT(RExC_parse)));
13326
13327             /* Here, we have a literal character.  Find the maximal string of
13328              * them in the input that we can fit into a single EXACTish node.
13329              * We quit at the first non-literal or when the node gets full */
13330             for (p = RExC_parse;
13331                  len < upper_parse && p < RExC_end;
13332                  len++)
13333             {
13334                 oldp = p;
13335
13336                 /* White space has already been ignored */
13337                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
13338                        || ! is_PATWS_safe((p), RExC_end, UTF));
13339
13340                 switch ((U8)*p) {
13341                 case '^':
13342                 case '$':
13343                 case '.':
13344                 case '[':
13345                 case '(':
13346                 case ')':
13347                 case '|':
13348                     goto loopdone;
13349                 case '\\':
13350                     /* Literal Escapes Switch
13351
13352                        This switch is meant to handle escape sequences that
13353                        resolve to a literal character.
13354
13355                        Every escape sequence that represents something
13356                        else, like an assertion or a char class, is handled
13357                        in the switch marked 'Special Escapes' above in this
13358                        routine, but also has an entry here as anything that
13359                        isn't explicitly mentioned here will be treated as
13360                        an unescaped equivalent literal.
13361                     */
13362
13363                     switch ((U8)*++p) {
13364                     /* These are all the special escapes. */
13365                     case 'A':             /* Start assertion */
13366                     case 'b': case 'B':   /* Word-boundary assertion*/
13367                     case 'C':             /* Single char !DANGEROUS! */
13368                     case 'd': case 'D':   /* digit class */
13369                     case 'g': case 'G':   /* generic-backref, pos assertion */
13370                     case 'h': case 'H':   /* HORIZWS */
13371                     case 'k': case 'K':   /* named backref, keep marker */
13372                     case 'p': case 'P':   /* Unicode property */
13373                               case 'R':   /* LNBREAK */
13374                     case 's': case 'S':   /* space class */
13375                     case 'v': case 'V':   /* VERTWS */
13376                     case 'w': case 'W':   /* word class */
13377                     case 'X':             /* eXtended Unicode "combining
13378                                              character sequence" */
13379                     case 'z': case 'Z':   /* End of line/string assertion */
13380                         --p;
13381                         goto loopdone;
13382
13383                     /* Anything after here is an escape that resolves to a
13384                        literal. (Except digits, which may or may not)
13385                      */
13386                     case 'n':
13387                         ender = '\n';
13388                         p++;
13389                         break;
13390                     case 'N': /* Handle a single-code point named character. */
13391                         RExC_parse = p + 1;
13392                         if (! grok_bslash_N(pRExC_state,
13393                                             NULL,   /* Fail if evaluates to
13394                                                        anything other than a
13395                                                        single code point */
13396                                             &ender, /* The returned single code
13397                                                        point */
13398                                             NULL,   /* Don't need a count of
13399                                                        how many code points */
13400                                             flagp,
13401                                             RExC_strict,
13402                                             depth)
13403                         ) {
13404                             if (*flagp & NEED_UTF8)
13405                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
13406                             if (*flagp & RESTART_PASS1)
13407                                 return NULL;
13408
13409                             /* Here, it wasn't a single code point.  Go close
13410                              * up this EXACTish node.  The switch() prior to
13411                              * this switch handles the other cases */
13412                             RExC_parse = p = oldp;
13413                             goto loopdone;
13414                         }
13415                         p = RExC_parse;
13416                         RExC_parse = parse_start;
13417                         if (ender > 0xff) {
13418                             REQUIRE_UTF8(flagp);
13419                         }
13420                         break;
13421                     case 'r':
13422                         ender = '\r';
13423                         p++;
13424                         break;
13425                     case 't':
13426                         ender = '\t';
13427                         p++;
13428                         break;
13429                     case 'f':
13430                         ender = '\f';
13431                         p++;
13432                         break;
13433                     case 'e':
13434                         ender = ESC_NATIVE;
13435                         p++;
13436                         break;
13437                     case 'a':
13438                         ender = '\a';
13439                         p++;
13440                         break;
13441                     case 'o':
13442                         {
13443                             UV result;
13444                             const char* error_msg;
13445
13446                             bool valid = grok_bslash_o(&p,
13447                                                        RExC_end,
13448                                                        &result,
13449                                                        &error_msg,
13450                                                        PASS2, /* out warnings */
13451                                                        (bool) RExC_strict,
13452                                                        TRUE, /* Output warnings
13453                                                                 for non-
13454                                                                 portables */
13455                                                        UTF);
13456                             if (! valid) {
13457                                 RExC_parse = p; /* going to die anyway; point
13458                                                    to exact spot of failure */
13459                                 vFAIL(error_msg);
13460                             }
13461                             ender = result;
13462                             if (ender > 0xff) {
13463                                 REQUIRE_UTF8(flagp);
13464                             }
13465                             break;
13466                         }
13467                     case 'x':
13468                         {
13469                             UV result = UV_MAX; /* initialize to erroneous
13470                                                    value */
13471                             const char* error_msg;
13472
13473                             bool valid = grok_bslash_x(&p,
13474                                                        RExC_end,
13475                                                        &result,
13476                                                        &error_msg,
13477                                                        PASS2, /* out warnings */
13478                                                        (bool) RExC_strict,
13479                                                        TRUE, /* Silence warnings
13480                                                                 for non-
13481                                                                 portables */
13482                                                        UTF);
13483                             if (! valid) {
13484                                 RExC_parse = p; /* going to die anyway; point
13485                                                    to exact spot of failure */
13486                                 vFAIL(error_msg);
13487                             }
13488                             ender = result;
13489
13490                             if (ender < 0x100) {
13491 #ifdef EBCDIC
13492                                 if (RExC_recode_x_to_native) {
13493                                     ender = LATIN1_TO_NATIVE(ender);
13494                                 }
13495 #endif
13496                             }
13497                             else {
13498                                 REQUIRE_UTF8(flagp);
13499                             }
13500                             break;
13501                         }
13502                     case 'c':
13503                         p++;
13504                         ender = grok_bslash_c(*p++, PASS2);
13505                         break;
13506                     case '8': case '9': /* must be a backreference */
13507                         --p;
13508                         /* we have an escape like \8 which cannot be an octal escape
13509                          * so we exit the loop, and let the outer loop handle this
13510                          * escape which may or may not be a legitimate backref. */
13511                         goto loopdone;
13512                     case '1': case '2': case '3':case '4':
13513                     case '5': case '6': case '7':
13514                         /* When we parse backslash escapes there is ambiguity
13515                          * between backreferences and octal escapes. Any escape
13516                          * from \1 - \9 is a backreference, any multi-digit
13517                          * escape which does not start with 0 and which when
13518                          * evaluated as decimal could refer to an already
13519                          * parsed capture buffer is a back reference. Anything
13520                          * else is octal.
13521                          *
13522                          * Note this implies that \118 could be interpreted as
13523                          * 118 OR as "\11" . "8" depending on whether there
13524                          * were 118 capture buffers defined already in the
13525                          * pattern.  */
13526
13527                         /* NOTE, RExC_npar is 1 more than the actual number of
13528                          * parens we have seen so far, hence the < RExC_npar below. */
13529
13530                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
13531                         {  /* Not to be treated as an octal constant, go
13532                                    find backref */
13533                             --p;
13534                             goto loopdone;
13535                         }
13536                         /* FALLTHROUGH */
13537                     case '0':
13538                         {
13539                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13540                             STRLEN numlen = 3;
13541                             ender = grok_oct(p, &numlen, &flags, NULL);
13542                             if (ender > 0xff) {
13543                                 REQUIRE_UTF8(flagp);
13544                             }
13545                             p += numlen;
13546                             if (PASS2   /* like \08, \178 */
13547                                 && numlen < 3
13548                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
13549                             {
13550                                 reg_warn_non_literal_string(
13551                                          p + 1,
13552                                          form_short_octal_warning(p, numlen));
13553                             }
13554                         }
13555                         break;
13556                     case '\0':
13557                         if (p >= RExC_end)
13558                             FAIL("Trailing \\");
13559                         /* FALLTHROUGH */
13560                     default:
13561                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
13562                             /* Include any left brace following the alpha to emphasize
13563                              * that it could be part of an escape at some point
13564                              * in the future */
13565                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
13566                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
13567                         }
13568                         goto normal_default;
13569                     } /* End of switch on '\' */
13570                     break;
13571                 case '{':
13572                     /* Currently we allow an lbrace at the start of a construct
13573                      * without raising a warning.  This is because we think we
13574                      * will never want such a brace to be meant to be other
13575                      * than taken literally. */
13576                     if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
13577
13578                         /* But, we raise a fatal warning otherwise, as the
13579                          * deprecation cycle has come and gone.  Except that it
13580                          * turns out that some heavily-relied on upstream
13581                          * software, notably GNU Autoconf, have failed to fix
13582                          * their uses.  For these, don't make it fatal unless
13583                          * we anticipate using the '{' for something else.
13584                          * This happens after any alpha, and for a looser {m,n}
13585                          * quantifier specification */
13586                         if (      RExC_strict
13587                             || (  p > parse_start + 1
13588                                 && isALPHA_A(*(p - 1))
13589                                 && *(p - 2) == '\\')
13590                             || new_regcurly(p, RExC_end))
13591                         {
13592                             RExC_parse = p + 1;
13593                             vFAIL("Unescaped left brace in regex is "
13594                                   "illegal here");
13595                         }
13596                         if (PASS2) {
13597                             ckWARNregdep(p + 1,
13598                                         "Unescaped left brace in regex is "
13599                                         "deprecated here (and will be fatal "
13600                                         "in Perl 5.30), passed through");
13601                         }
13602                     }
13603                     goto normal_default;
13604                 case '}':
13605                 case ']':
13606                     if (PASS2 && p > RExC_parse && RExC_strict) {
13607                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
13608                     }
13609                     /*FALLTHROUGH*/
13610                 default:    /* A literal character */
13611                   normal_default:
13612                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
13613                         STRLEN numlen;
13614                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
13615                                                &numlen, UTF8_ALLOW_DEFAULT);
13616                         p += numlen;
13617                     }
13618                     else
13619                         ender = (U8) *p++;
13620                     break;
13621                 } /* End of switch on the literal */
13622
13623                 /* Here, have looked at the literal character and <ender>
13624                  * contains its ordinal, <p> points to the character after it.
13625                  * We need to check if the next non-ignored thing is a
13626                  * quantifier.  Move <p> to after anything that should be
13627                  * ignored, which, as a side effect, positions <p> for the next
13628                  * loop iteration */
13629                 skip_to_be_ignored_text(pRExC_state, &p,
13630                                         FALSE /* Don't force to /x */ );
13631
13632                 /* If the next thing is a quantifier, it applies to this
13633                  * character only, which means that this character has to be in
13634                  * its own node and can't just be appended to the string in an
13635                  * existing node, so if there are already other characters in
13636                  * the node, close the node with just them, and set up to do
13637                  * this character again next time through, when it will be the
13638                  * only thing in its new node */
13639
13640                 next_is_quantifier =    LIKELY(p < RExC_end)
13641                                      && UNLIKELY(ISMULT2(p));
13642
13643                 if (next_is_quantifier && LIKELY(len)) {
13644                     p = oldp;
13645                     goto loopdone;
13646                 }
13647
13648                 /* Ready to add 'ender' to the node */
13649
13650                 if (! FOLD) {  /* The simple case, just append the literal */
13651
13652                     /* In the sizing pass, we need only the size of the
13653                      * character we are appending, hence we can delay getting
13654                      * its representation until PASS2. */
13655                     if (SIZE_ONLY) {
13656                         if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
13657                             const STRLEN unilen = UVCHR_SKIP(ender);
13658                             s += unilen;
13659
13660                             /* We have to subtract 1 just below (and again in
13661                              * the corresponding PASS2 code) because the loop
13662                              * increments <len> each time, as all but this path
13663                              * (and one other) through it add a single byte to
13664                              * the EXACTish node.  But these paths would change
13665                              * len to be the correct final value, so cancel out
13666                              * the increment that follows */
13667                             len += unilen - 1;
13668                         }
13669                         else {
13670                             s++;
13671                         }
13672                     } else { /* PASS2 */
13673                       not_fold_common:
13674                         if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
13675                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
13676                             len += (char *) new_s - s - 1;
13677                             s = (char *) new_s;
13678                         }
13679                         else {
13680                             *(s++) = (char) ender;
13681                         }
13682                     }
13683                 }
13684                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
13685
13686                     /* Here are folding under /l, and the code point is
13687                      * problematic.  First, we know we can't simplify things */
13688                     maybe_exact = FALSE;
13689                     maybe_exactfu = FALSE;
13690
13691                     /* A problematic code point in this context means that its
13692                      * fold isn't known until runtime, so we can't fold it now.
13693                      * (The non-problematic code points are the above-Latin1
13694                      * ones that fold to also all above-Latin1.  Their folds
13695                      * don't vary no matter what the locale is.) But here we
13696                      * have characters whose fold depends on the locale.
13697                      * Unlike the non-folding case above, we have to keep track
13698                      * of these in the sizing pass, so that we can make sure we
13699                      * don't split too-long nodes in the middle of a potential
13700                      * multi-char fold.  And unlike the regular fold case
13701                      * handled in the else clauses below, we don't actually
13702                      * fold and don't have special cases to consider.  What we
13703                      * do for both passes is the PASS2 code for non-folding */
13704                     goto not_fold_common;
13705                 }
13706                 else /* A regular FOLD code point */
13707                     if (! (   UTF
13708 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
13709    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
13710                                       || UNICODE_DOT_DOT_VERSION > 0)
13711                             /* See comments for join_exact() as to why we fold
13712                              * this non-UTF at compile time */
13713                             || (   node_type == EXACTFU
13714                                 && ender == LATIN_SMALL_LETTER_SHARP_S)
13715 #endif
13716                 )) {
13717                     /* Here, are folding and are not UTF-8 encoded; therefore
13718                      * the character must be in the range 0-255, and is not /l
13719                      * (Not /l because we already handled these under /l in
13720                      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
13721                     if (IS_IN_SOME_FOLD_L1(ender)) {
13722                         maybe_exact = FALSE;
13723
13724                         /* See if the character's fold differs between /d and
13725                          * /u.  This includes the multi-char fold SHARP S to
13726                          * 'ss' */
13727                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
13728                             RExC_seen_unfolded_sharp_s = 1;
13729                             maybe_exactfu = FALSE;
13730                         }
13731                         else if (maybe_exactfu
13732                             && (PL_fold[ender] != PL_fold_latin1[ender]
13733 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
13734    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
13735                                       || UNICODE_DOT_DOT_VERSION > 0)
13736                                 || (   len > 0
13737                                     && isALPHA_FOLD_EQ(ender, 's')
13738                                     && isALPHA_FOLD_EQ(*(s-1), 's'))
13739 #endif
13740                         )) {
13741                             maybe_exactfu = FALSE;
13742                         }
13743                     }
13744
13745                     /* Even when folding, we store just the input character, as
13746                      * we have an array that finds its fold quickly */
13747                     *(s++) = (char) ender;
13748                 }
13749                 else {  /* FOLD, and UTF (or sharp s) */
13750                     /* Unlike the non-fold case, we do actually have to
13751                      * calculate the results here in pass 1.  This is for two
13752                      * reasons, the folded length may be longer than the
13753                      * unfolded, and we have to calculate how many EXACTish
13754                      * nodes it will take; and we may run out of room in a node
13755                      * in the middle of a potential multi-char fold, and have
13756                      * to back off accordingly.  */
13757
13758                     UV folded;
13759                     if (isASCII_uni(ender)) {
13760                         folded = toFOLD(ender);
13761                         *(s)++ = (U8) folded;
13762                     }
13763                     else {
13764                         STRLEN foldlen;
13765
13766                         folded = _to_uni_fold_flags(
13767                                      ender,
13768                                      (U8 *) s,
13769                                      &foldlen,
13770                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
13771                                                         ? FOLD_FLAGS_NOMIX_ASCII
13772                                                         : 0));
13773                         s += foldlen;
13774
13775                         /* The loop increments <len> each time, as all but this
13776                          * path (and one other) through it add a single byte to
13777                          * the EXACTish node.  But this one has changed len to
13778                          * be the correct final value, so subtract one to
13779                          * cancel out the increment that follows */
13780                         len += foldlen - 1;
13781                     }
13782                     /* If this node only contains non-folding code points so
13783                      * far, see if this new one is also non-folding */
13784                     if (maybe_exact) {
13785                         if (folded != ender) {
13786                             maybe_exact = FALSE;
13787                         }
13788                         else {
13789                             /* Here the fold is the original; we have to check
13790                              * further to see if anything folds to it */
13791                             if (_invlist_contains_cp(PL_utf8_foldable,
13792                                                         ender))
13793                             {
13794                                 maybe_exact = FALSE;
13795                             }
13796                         }
13797                     }
13798                     ender = folded;
13799                 }
13800
13801                 if (next_is_quantifier) {
13802
13803                     /* Here, the next input is a quantifier, and to get here,
13804                      * the current character is the only one in the node.
13805                      * Also, here <len> doesn't include the final byte for this
13806                      * character */
13807                     len++;
13808                     goto loopdone;
13809                 }
13810
13811             } /* End of loop through literal characters */
13812
13813             /* Here we have either exhausted the input or ran out of room in
13814              * the node.  (If we encountered a character that can't be in the
13815              * node, transfer is made directly to <loopdone>, and so we
13816              * wouldn't have fallen off the end of the loop.)  In the latter
13817              * case, we artificially have to split the node into two, because
13818              * we just don't have enough space to hold everything.  This
13819              * creates a problem if the final character participates in a
13820              * multi-character fold in the non-final position, as a match that
13821              * should have occurred won't, due to the way nodes are matched,
13822              * and our artificial boundary.  So back off until we find a non-
13823              * problematic character -- one that isn't at the beginning or
13824              * middle of such a fold.  (Either it doesn't participate in any
13825              * folds, or appears only in the final position of all the folds it
13826              * does participate in.)  A better solution with far fewer false
13827              * positives, and that would fill the nodes more completely, would
13828              * be to actually have available all the multi-character folds to
13829              * test against, and to back-off only far enough to be sure that
13830              * this node isn't ending with a partial one.  <upper_parse> is set
13831              * further below (if we need to reparse the node) to include just
13832              * up through that final non-problematic character that this code
13833              * identifies, so when it is set to less than the full node, we can
13834              * skip the rest of this */
13835             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
13836
13837                 const STRLEN full_len = len;
13838
13839                 assert(len >= MAX_NODE_STRING_SIZE);
13840
13841                 /* Here, <s> points to the final byte of the final character.
13842                  * Look backwards through the string until find a non-
13843                  * problematic character */
13844
13845                 if (! UTF) {
13846
13847                     /* This has no multi-char folds to non-UTF characters */
13848                     if (ASCII_FOLD_RESTRICTED) {
13849                         goto loopdone;
13850                     }
13851
13852                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
13853                     len = s - s0 + 1;
13854                 }
13855                 else {
13856                     if (!  PL_NonL1NonFinalFold) {
13857                         PL_NonL1NonFinalFold = _new_invlist_C_array(
13858                                         NonL1_Perl_Non_Final_Folds_invlist);
13859                     }
13860
13861                     /* Point to the first byte of the final character */
13862                     s = (char *) utf8_hop((U8 *) s, -1);
13863
13864                     while (s >= s0) {   /* Search backwards until find
13865                                            non-problematic char */
13866                         if (UTF8_IS_INVARIANT(*s)) {
13867
13868                             /* There are no ascii characters that participate
13869                              * in multi-char folds under /aa.  In EBCDIC, the
13870                              * non-ascii invariants are all control characters,
13871                              * so don't ever participate in any folds. */
13872                             if (ASCII_FOLD_RESTRICTED
13873                                 || ! IS_NON_FINAL_FOLD(*s))
13874                             {
13875                                 break;
13876                             }
13877                         }
13878                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
13879                             if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
13880                                                                   *s, *(s+1))))
13881                             {
13882                                 break;
13883                             }
13884                         }
13885                         else if (! _invlist_contains_cp(
13886                                         PL_NonL1NonFinalFold,
13887                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
13888                         {
13889                             break;
13890                         }
13891
13892                         /* Here, the current character is problematic in that
13893                          * it does occur in the non-final position of some
13894                          * fold, so try the character before it, but have to
13895                          * special case the very first byte in the string, so
13896                          * we don't read outside the string */
13897                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
13898                     } /* End of loop backwards through the string */
13899
13900                     /* If there were only problematic characters in the string,
13901                      * <s> will point to before s0, in which case the length
13902                      * should be 0, otherwise include the length of the
13903                      * non-problematic character just found */
13904                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
13905                 }
13906
13907                 /* Here, have found the final character, if any, that is
13908                  * non-problematic as far as ending the node without splitting
13909                  * it across a potential multi-char fold.  <len> contains the
13910                  * number of bytes in the node up-to and including that
13911                  * character, or is 0 if there is no such character, meaning
13912                  * the whole node contains only problematic characters.  In
13913                  * this case, give up and just take the node as-is.  We can't
13914                  * do any better */
13915                 if (len == 0) {
13916                     len = full_len;
13917
13918                     /* If the node ends in an 's' we make sure it stays EXACTF,
13919                      * as if it turns into an EXACTFU, it could later get
13920                      * joined with another 's' that would then wrongly match
13921                      * the sharp s */
13922                     if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
13923                     {
13924                         maybe_exactfu = FALSE;
13925                     }
13926                 } else {
13927
13928                     /* Here, the node does contain some characters that aren't
13929                      * problematic.  If one such is the final character in the
13930                      * node, we are done */
13931                     if (len == full_len) {
13932                         goto loopdone;
13933                     }
13934                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
13935
13936                         /* If the final character is problematic, but the
13937                          * penultimate is not, back-off that last character to
13938                          * later start a new node with it */
13939                         p = oldp;
13940                         goto loopdone;
13941                     }
13942
13943                     /* Here, the final non-problematic character is earlier
13944                      * in the input than the penultimate character.  What we do
13945                      * is reparse from the beginning, going up only as far as
13946                      * this final ok one, thus guaranteeing that the node ends
13947                      * in an acceptable character.  The reason we reparse is
13948                      * that we know how far in the character is, but we don't
13949                      * know how to correlate its position with the input parse.
13950                      * An alternate implementation would be to build that
13951                      * correlation as we go along during the original parse,
13952                      * but that would entail extra work for every node, whereas
13953                      * this code gets executed only when the string is too
13954                      * large for the node, and the final two characters are
13955                      * problematic, an infrequent occurrence.  Yet another
13956                      * possible strategy would be to save the tail of the
13957                      * string, and the next time regatom is called, initialize
13958                      * with that.  The problem with this is that unless you
13959                      * back off one more character, you won't be guaranteed
13960                      * regatom will get called again, unless regbranch,
13961                      * regpiece ... are also changed.  If you do back off that
13962                      * extra character, so that there is input guaranteed to
13963                      * force calling regatom, you can't handle the case where
13964                      * just the first character in the node is acceptable.  I
13965                      * (khw) decided to try this method which doesn't have that
13966                      * pitfall; if performance issues are found, we can do a
13967                      * combination of the current approach plus that one */
13968                     upper_parse = len;
13969                     len = 0;
13970                     s = s0;
13971                     goto reparse;
13972                 }
13973             }   /* End of verifying node ends with an appropriate char */
13974
13975           loopdone:   /* Jumped to when encounters something that shouldn't be
13976                          in the node */
13977
13978             /* I (khw) don't know if you can get here with zero length, but the
13979              * old code handled this situation by creating a zero-length EXACT
13980              * node.  Might as well be NOTHING instead */
13981             if (len == 0) {
13982                 OP(ret) = NOTHING;
13983             }
13984             else {
13985                 if (FOLD) {
13986                     /* If 'maybe_exact' is still set here, means there are no
13987                      * code points in the node that participate in folds;
13988                      * similarly for 'maybe_exactfu' and code points that match
13989                      * differently depending on UTF8ness of the target string
13990                      * (for /u), or depending on locale for /l */
13991                     if (maybe_exact) {
13992                         OP(ret) = (LOC)
13993                                   ? EXACTL
13994                                   : EXACT;
13995                     }
13996                     else if (maybe_exactfu) {
13997                         OP(ret) = (LOC)
13998                                   ? EXACTFLU8
13999                                   : EXACTFU;
14000                     }
14001                 }
14002                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
14003                                            FALSE /* Don't look to see if could
14004                                                     be turned into an EXACT
14005                                                     node, as we have already
14006                                                     computed that */
14007                                           );
14008             }
14009
14010             RExC_parse = p - 1;
14011             Set_Node_Cur_Length(ret, parse_start);
14012             RExC_parse = p;
14013             {
14014                 /* len is STRLEN which is unsigned, need to copy to signed */
14015                 IV iv = len;
14016                 if (iv < 0)
14017                     vFAIL("Internal disaster");
14018             }
14019
14020         } /* End of label 'defchar:' */
14021         break;
14022     } /* End of giant switch on input character */
14023
14024     /* Position parse to next real character */
14025     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14026                                             FALSE /* Don't force to /x */ );
14027     if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) {
14028         ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here (and will be fatal in Perl 5.30), passed through");
14029     }
14030
14031     return(ret);
14032 }
14033
14034
14035 STATIC void
14036 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
14037 {
14038     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
14039      * sets up the bitmap and any flags, removing those code points from the
14040      * inversion list, setting it to NULL should it become completely empty */
14041
14042     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
14043     assert(PL_regkind[OP(node)] == ANYOF);
14044
14045     ANYOF_BITMAP_ZERO(node);
14046     if (*invlist_ptr) {
14047
14048         /* This gets set if we actually need to modify things */
14049         bool change_invlist = FALSE;
14050
14051         UV start, end;
14052
14053         /* Start looking through *invlist_ptr */
14054         invlist_iterinit(*invlist_ptr);
14055         while (invlist_iternext(*invlist_ptr, &start, &end)) {
14056             UV high;
14057             int i;
14058
14059             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
14060                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
14061             }
14062
14063             /* Quit if are above what we should change */
14064             if (start >= NUM_ANYOF_CODE_POINTS) {
14065                 break;
14066             }
14067
14068             change_invlist = TRUE;
14069
14070             /* Set all the bits in the range, up to the max that we are doing */
14071             high = (end < NUM_ANYOF_CODE_POINTS - 1)
14072                    ? end
14073                    : NUM_ANYOF_CODE_POINTS - 1;
14074             for (i = start; i <= (int) high; i++) {
14075                 if (! ANYOF_BITMAP_TEST(node, i)) {
14076                     ANYOF_BITMAP_SET(node, i);
14077                 }
14078             }
14079         }
14080         invlist_iterfinish(*invlist_ptr);
14081
14082         /* Done with loop; remove any code points that are in the bitmap from
14083          * *invlist_ptr; similarly for code points above the bitmap if we have
14084          * a flag to match all of them anyways */
14085         if (change_invlist) {
14086             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
14087         }
14088         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
14089             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
14090         }
14091
14092         /* If have completely emptied it, remove it completely */
14093         if (_invlist_len(*invlist_ptr) == 0) {
14094             SvREFCNT_dec_NN(*invlist_ptr);
14095             *invlist_ptr = NULL;
14096         }
14097     }
14098 }
14099
14100 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
14101    Character classes ([:foo:]) can also be negated ([:^foo:]).
14102    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
14103    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
14104    but trigger failures because they are currently unimplemented. */
14105
14106 #define POSIXCC_DONE(c)   ((c) == ':')
14107 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
14108 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
14109 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
14110
14111 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
14112 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
14113 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
14114
14115 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
14116
14117 /* 'posix_warnings' and 'warn_text' are names of variables in the following
14118  * routine. q.v. */
14119 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
14120         if (posix_warnings) {                                               \
14121             if (! RExC_warn_text ) RExC_warn_text = (AV *) sv_2mortal((SV *) newAV()); \
14122             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                          \
14123                                              WARNING_PREFIX                 \
14124                                              text                           \
14125                                              REPORT_LOCATION,               \
14126                                              REPORT_LOCATION_ARGS(p)));     \
14127         }                                                                   \
14128     } STMT_END
14129 #define CLEAR_POSIX_WARNINGS()                                              \
14130     STMT_START {                                                            \
14131         if (posix_warnings && RExC_warn_text)                               \
14132             av_clear(RExC_warn_text);                                       \
14133     } STMT_END
14134
14135 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
14136     STMT_START {                                                            \
14137         CLEAR_POSIX_WARNINGS();                                             \
14138         return ret;                                                         \
14139     } STMT_END
14140
14141 STATIC int
14142 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
14143
14144     const char * const s,      /* Where the putative posix class begins.
14145                                   Normally, this is one past the '['.  This
14146                                   parameter exists so it can be somewhere
14147                                   besides RExC_parse. */
14148     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
14149                                   NULL */
14150     AV ** posix_warnings,      /* Where to place any generated warnings, or
14151                                   NULL */
14152     const bool check_only      /* Don't die if error */
14153 )
14154 {
14155     /* This parses what the caller thinks may be one of the three POSIX
14156      * constructs:
14157      *  1) a character class, like [:blank:]
14158      *  2) a collating symbol, like [. .]
14159      *  3) an equivalence class, like [= =]
14160      * In the latter two cases, it croaks if it finds a syntactically legal
14161      * one, as these are not handled by Perl.
14162      *
14163      * The main purpose is to look for a POSIX character class.  It returns:
14164      *  a) the class number
14165      *      if it is a completely syntactically and semantically legal class.
14166      *      'updated_parse_ptr', if not NULL, is set to point to just after the
14167      *      closing ']' of the class
14168      *  b) OOB_NAMEDCLASS
14169      *      if it appears that one of the three POSIX constructs was meant, but
14170      *      its specification was somehow defective.  'updated_parse_ptr', if
14171      *      not NULL, is set to point to the character just after the end
14172      *      character of the class.  See below for handling of warnings.
14173      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
14174      *      if it  doesn't appear that a POSIX construct was intended.
14175      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
14176      *      raised.
14177      *
14178      * In b) there may be errors or warnings generated.  If 'check_only' is
14179      * TRUE, then any errors are discarded.  Warnings are returned to the
14180      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
14181      * instead it is NULL, warnings are suppressed.  This is done in all
14182      * passes.  The reason for this is that the rest of the parsing is heavily
14183      * dependent on whether this routine found a valid posix class or not.  If
14184      * it did, the closing ']' is absorbed as part of the class.  If no class,
14185      * or an invalid one is found, any ']' will be considered the terminator of
14186      * the outer bracketed character class, leading to very different results.
14187      * In particular, a '(?[ ])' construct will likely have a syntax error if
14188      * the class is parsed other than intended, and this will happen in pass1,
14189      * before the warnings would normally be output.  This mechanism allows the
14190      * caller to output those warnings in pass1 just before dieing, giving a
14191      * much better clue as to what is wrong.
14192      *
14193      * The reason for this function, and its complexity is that a bracketed
14194      * character class can contain just about anything.  But it's easy to
14195      * mistype the very specific posix class syntax but yielding a valid
14196      * regular bracketed class, so it silently gets compiled into something
14197      * quite unintended.
14198      *
14199      * The solution adopted here maintains backward compatibility except that
14200      * it adds a warning if it looks like a posix class was intended but
14201      * improperly specified.  The warning is not raised unless what is input
14202      * very closely resembles one of the 14 legal posix classes.  To do this,
14203      * it uses fuzzy parsing.  It calculates how many single-character edits it
14204      * would take to transform what was input into a legal posix class.  Only
14205      * if that number is quite small does it think that the intention was a
14206      * posix class.  Obviously these are heuristics, and there will be cases
14207      * where it errs on one side or another, and they can be tweaked as
14208      * experience informs.
14209      *
14210      * The syntax for a legal posix class is:
14211      *
14212      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
14213      *
14214      * What this routine considers syntactically to be an intended posix class
14215      * is this (the comments indicate some restrictions that the pattern
14216      * doesn't show):
14217      *
14218      *  qr/(?x: \[?                         # The left bracket, possibly
14219      *                                      # omitted
14220      *          \h*                         # possibly followed by blanks
14221      *          (?: \^ \h* )?               # possibly a misplaced caret
14222      *          [:;]?                       # The opening class character,
14223      *                                      # possibly omitted.  A typo
14224      *                                      # semi-colon can also be used.
14225      *          \h*
14226      *          \^?                         # possibly a correctly placed
14227      *                                      # caret, but not if there was also
14228      *                                      # a misplaced one
14229      *          \h*
14230      *          .{3,15}                     # The class name.  If there are
14231      *                                      # deviations from the legal syntax,
14232      *                                      # its edit distance must be close
14233      *                                      # to a real class name in order
14234      *                                      # for it to be considered to be
14235      *                                      # an intended posix class.
14236      *          \h*
14237      *          [[:punct:]]?                # The closing class character,
14238      *                                      # possibly omitted.  If not a colon
14239      *                                      # nor semi colon, the class name
14240      *                                      # must be even closer to a valid
14241      *                                      # one
14242      *          \h*
14243      *          \]?                         # The right bracket, possibly
14244      *                                      # omitted.
14245      *     )/
14246      *
14247      * In the above, \h must be ASCII-only.
14248      *
14249      * These are heuristics, and can be tweaked as field experience dictates.
14250      * There will be cases when someone didn't intend to specify a posix class
14251      * that this warns as being so.  The goal is to minimize these, while
14252      * maximizing the catching of things intended to be a posix class that
14253      * aren't parsed as such.
14254      */
14255
14256     const char* p             = s;
14257     const char * const e      = RExC_end;
14258     unsigned complement       = 0;      /* If to complement the class */
14259     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
14260     bool has_opening_bracket  = FALSE;
14261     bool has_opening_colon    = FALSE;
14262     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
14263                                                    valid class */
14264     const char * possible_end = NULL;   /* used for a 2nd parse pass */
14265     const char* name_start;             /* ptr to class name first char */
14266
14267     /* If the number of single-character typos the input name is away from a
14268      * legal name is no more than this number, it is considered to have meant
14269      * the legal name */
14270     int max_distance          = 2;
14271
14272     /* to store the name.  The size determines the maximum length before we
14273      * decide that no posix class was intended.  Should be at least
14274      * sizeof("alphanumeric") */
14275     UV input_text[15];
14276     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
14277
14278     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
14279
14280     CLEAR_POSIX_WARNINGS();
14281
14282     if (p >= e) {
14283         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14284     }
14285
14286     if (*(p - 1) != '[') {
14287         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
14288         found_problem = TRUE;
14289     }
14290     else {
14291         has_opening_bracket = TRUE;
14292     }
14293
14294     /* They could be confused and think you can put spaces between the
14295      * components */
14296     if (isBLANK(*p)) {
14297         found_problem = TRUE;
14298
14299         do {
14300             p++;
14301         } while (p < e && isBLANK(*p));
14302
14303         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14304     }
14305
14306     /* For [. .] and [= =].  These are quite different internally from [: :],
14307      * so they are handled separately.  */
14308     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
14309                                             and 1 for at least one char in it
14310                                           */
14311     {
14312         const char open_char  = *p;
14313         const char * temp_ptr = p + 1;
14314
14315         /* These two constructs are not handled by perl, and if we find a
14316          * syntactically valid one, we croak.  khw, who wrote this code, finds
14317          * this explanation of them very unclear:
14318          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
14319          * And searching the rest of the internet wasn't very helpful either.
14320          * It looks like just about any byte can be in these constructs,
14321          * depending on the locale.  But unless the pattern is being compiled
14322          * under /l, which is very rare, Perl runs under the C or POSIX locale.
14323          * In that case, it looks like [= =] isn't allowed at all, and that
14324          * [. .] could be any single code point, but for longer strings the
14325          * constituent characters would have to be the ASCII alphabetics plus
14326          * the minus-hyphen.  Any sensible locale definition would limit itself
14327          * to these.  And any portable one definitely should.  Trying to parse
14328          * the general case is a nightmare (see [perl #127604]).  So, this code
14329          * looks only for interiors of these constructs that match:
14330          *      qr/.|[-\w]{2,}/
14331          * Using \w relaxes the apparent rules a little, without adding much
14332          * danger of mistaking something else for one of these constructs.
14333          *
14334          * [. .] in some implementations described on the internet is usable to
14335          * escape a character that otherwise is special in bracketed character
14336          * classes.  For example [.].] means a literal right bracket instead of
14337          * the ending of the class
14338          *
14339          * [= =] can legitimately contain a [. .] construct, but we don't
14340          * handle this case, as that [. .] construct will later get parsed
14341          * itself and croak then.  And [= =] is checked for even when not under
14342          * /l, as Perl has long done so.
14343          *
14344          * The code below relies on there being a trailing NUL, so it doesn't
14345          * have to keep checking if the parse ptr < e.
14346          */
14347         if (temp_ptr[1] == open_char) {
14348             temp_ptr++;
14349         }
14350         else while (    temp_ptr < e
14351                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
14352         {
14353             temp_ptr++;
14354         }
14355
14356         if (*temp_ptr == open_char) {
14357             temp_ptr++;
14358             if (*temp_ptr == ']') {
14359                 temp_ptr++;
14360                 if (! found_problem && ! check_only) {
14361                     RExC_parse = (char *) temp_ptr;
14362                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
14363                             "extensions", open_char, open_char);
14364                 }
14365
14366                 /* Here, the syntax wasn't completely valid, or else the call
14367                  * is to check-only */
14368                 if (updated_parse_ptr) {
14369                     *updated_parse_ptr = (char *) temp_ptr;
14370                 }
14371
14372                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
14373             }
14374         }
14375
14376         /* If we find something that started out to look like one of these
14377          * constructs, but isn't, we continue below so that it can be checked
14378          * for being a class name with a typo of '.' or '=' instead of a colon.
14379          * */
14380     }
14381
14382     /* Here, we think there is a possibility that a [: :] class was meant, and
14383      * we have the first real character.  It could be they think the '^' comes
14384      * first */
14385     if (*p == '^') {
14386         found_problem = TRUE;
14387         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
14388         complement = 1;
14389         p++;
14390
14391         if (isBLANK(*p)) {
14392             found_problem = TRUE;
14393
14394             do {
14395                 p++;
14396             } while (p < e && isBLANK(*p));
14397
14398             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14399         }
14400     }
14401
14402     /* But the first character should be a colon, which they could have easily
14403      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
14404      * distinguish from a colon, so treat that as a colon).  */
14405     if (*p == ':') {
14406         p++;
14407         has_opening_colon = TRUE;
14408     }
14409     else if (*p == ';') {
14410         found_problem = TRUE;
14411         p++;
14412         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14413         has_opening_colon = TRUE;
14414     }
14415     else {
14416         found_problem = TRUE;
14417         ADD_POSIX_WARNING(p, "there must be a starting ':'");
14418
14419         /* Consider an initial punctuation (not one of the recognized ones) to
14420          * be a left terminator */
14421         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
14422             p++;
14423         }
14424     }
14425
14426     /* They may think that you can put spaces between the components */
14427     if (isBLANK(*p)) {
14428         found_problem = TRUE;
14429
14430         do {
14431             p++;
14432         } while (p < e && isBLANK(*p));
14433
14434         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14435     }
14436
14437     if (*p == '^') {
14438
14439         /* We consider something like [^:^alnum:]] to not have been intended to
14440          * be a posix class, but XXX maybe we should */
14441         if (complement) {
14442             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14443         }
14444
14445         complement = 1;
14446         p++;
14447     }
14448
14449     /* Again, they may think that you can put spaces between the components */
14450     if (isBLANK(*p)) {
14451         found_problem = TRUE;
14452
14453         do {
14454             p++;
14455         } while (p < e && isBLANK(*p));
14456
14457         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14458     }
14459
14460     if (*p == ']') {
14461
14462         /* XXX This ']' may be a typo, and something else was meant.  But
14463          * treating it as such creates enough complications, that that
14464          * possibility isn't currently considered here.  So we assume that the
14465          * ']' is what is intended, and if we've already found an initial '[',
14466          * this leaves this construct looking like [:] or [:^], which almost
14467          * certainly weren't intended to be posix classes */
14468         if (has_opening_bracket) {
14469             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14470         }
14471
14472         /* But this function can be called when we parse the colon for
14473          * something like qr/[alpha:]]/, so we back up to look for the
14474          * beginning */
14475         p--;
14476
14477         if (*p == ';') {
14478             found_problem = TRUE;
14479             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14480         }
14481         else if (*p != ':') {
14482
14483             /* XXX We are currently very restrictive here, so this code doesn't
14484              * consider the possibility that, say, /[alpha.]]/ was intended to
14485              * be a posix class. */
14486             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14487         }
14488
14489         /* Here we have something like 'foo:]'.  There was no initial colon,
14490          * and we back up over 'foo.  XXX Unlike the going forward case, we
14491          * don't handle typos of non-word chars in the middle */
14492         has_opening_colon = FALSE;
14493         p--;
14494
14495         while (p > RExC_start && isWORDCHAR(*p)) {
14496             p--;
14497         }
14498         p++;
14499
14500         /* Here, we have positioned ourselves to where we think the first
14501          * character in the potential class is */
14502     }
14503
14504     /* Now the interior really starts.  There are certain key characters that
14505      * can end the interior, or these could just be typos.  To catch both
14506      * cases, we may have to do two passes.  In the first pass, we keep on
14507      * going unless we come to a sequence that matches
14508      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
14509      * This means it takes a sequence to end the pass, so two typos in a row if
14510      * that wasn't what was intended.  If the class is perfectly formed, just
14511      * this one pass is needed.  We also stop if there are too many characters
14512      * being accumulated, but this number is deliberately set higher than any
14513      * real class.  It is set high enough so that someone who thinks that
14514      * 'alphanumeric' is a correct name would get warned that it wasn't.
14515      * While doing the pass, we keep track of where the key characters were in
14516      * it.  If we don't find an end to the class, and one of the key characters
14517      * was found, we redo the pass, but stop when we get to that character.
14518      * Thus the key character was considered a typo in the first pass, but a
14519      * terminator in the second.  If two key characters are found, we stop at
14520      * the second one in the first pass.  Again this can miss two typos, but
14521      * catches a single one
14522      *
14523      * In the first pass, 'possible_end' starts as NULL, and then gets set to
14524      * point to the first key character.  For the second pass, it starts as -1.
14525      * */
14526
14527     name_start = p;
14528   parse_name:
14529     {
14530         bool has_blank               = FALSE;
14531         bool has_upper               = FALSE;
14532         bool has_terminating_colon   = FALSE;
14533         bool has_terminating_bracket = FALSE;
14534         bool has_semi_colon          = FALSE;
14535         unsigned int name_len        = 0;
14536         int punct_count              = 0;
14537
14538         while (p < e) {
14539
14540             /* Squeeze out blanks when looking up the class name below */
14541             if (isBLANK(*p) ) {
14542                 has_blank = TRUE;
14543                 found_problem = TRUE;
14544                 p++;
14545                 continue;
14546             }
14547
14548             /* The name will end with a punctuation */
14549             if (isPUNCT(*p)) {
14550                 const char * peek = p + 1;
14551
14552                 /* Treat any non-']' punctuation followed by a ']' (possibly
14553                  * with intervening blanks) as trying to terminate the class.
14554                  * ']]' is very likely to mean a class was intended (but
14555                  * missing the colon), but the warning message that gets
14556                  * generated shows the error position better if we exit the
14557                  * loop at the bottom (eventually), so skip it here. */
14558                 if (*p != ']') {
14559                     if (peek < e && isBLANK(*peek)) {
14560                         has_blank = TRUE;
14561                         found_problem = TRUE;
14562                         do {
14563                             peek++;
14564                         } while (peek < e && isBLANK(*peek));
14565                     }
14566
14567                     if (peek < e && *peek == ']') {
14568                         has_terminating_bracket = TRUE;
14569                         if (*p == ':') {
14570                             has_terminating_colon = TRUE;
14571                         }
14572                         else if (*p == ';') {
14573                             has_semi_colon = TRUE;
14574                             has_terminating_colon = TRUE;
14575                         }
14576                         else {
14577                             found_problem = TRUE;
14578                         }
14579                         p = peek + 1;
14580                         goto try_posix;
14581                     }
14582                 }
14583
14584                 /* Here we have punctuation we thought didn't end the class.
14585                  * Keep track of the position of the key characters that are
14586                  * more likely to have been class-enders */
14587                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
14588
14589                     /* Allow just one such possible class-ender not actually
14590                      * ending the class. */
14591                     if (possible_end) {
14592                         break;
14593                     }
14594                     possible_end = p;
14595                 }
14596
14597                 /* If we have too many punctuation characters, no use in
14598                  * keeping going */
14599                 if (++punct_count > max_distance) {
14600                     break;
14601                 }
14602
14603                 /* Treat the punctuation as a typo. */
14604                 input_text[name_len++] = *p;
14605                 p++;
14606             }
14607             else if (isUPPER(*p)) { /* Use lowercase for lookup */
14608                 input_text[name_len++] = toLOWER(*p);
14609                 has_upper = TRUE;
14610                 found_problem = TRUE;
14611                 p++;
14612             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
14613                 input_text[name_len++] = *p;
14614                 p++;
14615             }
14616             else {
14617                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
14618                 p+= UTF8SKIP(p);
14619             }
14620
14621             /* The declaration of 'input_text' is how long we allow a potential
14622              * class name to be, before saying they didn't mean a class name at
14623              * all */
14624             if (name_len >= C_ARRAY_LENGTH(input_text)) {
14625                 break;
14626             }
14627         }
14628
14629         /* We get to here when the possible class name hasn't been properly
14630          * terminated before:
14631          *   1) we ran off the end of the pattern; or
14632          *   2) found two characters, each of which might have been intended to
14633          *      be the name's terminator
14634          *   3) found so many punctuation characters in the purported name,
14635          *      that the edit distance to a valid one is exceeded
14636          *   4) we decided it was more characters than anyone could have
14637          *      intended to be one. */
14638
14639         found_problem = TRUE;
14640
14641         /* In the final two cases, we know that looking up what we've
14642          * accumulated won't lead to a match, even a fuzzy one. */
14643         if (   name_len >= C_ARRAY_LENGTH(input_text)
14644             || punct_count > max_distance)
14645         {
14646             /* If there was an intermediate key character that could have been
14647              * an intended end, redo the parse, but stop there */
14648             if (possible_end && possible_end != (char *) -1) {
14649                 possible_end = (char *) -1; /* Special signal value to say
14650                                                we've done a first pass */
14651                 p = name_start;
14652                 goto parse_name;
14653             }
14654
14655             /* Otherwise, it can't have meant to have been a class */
14656             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14657         }
14658
14659         /* If we ran off the end, and the final character was a punctuation
14660          * one, back up one, to look at that final one just below.  Later, we
14661          * will restore the parse pointer if appropriate */
14662         if (name_len && p == e && isPUNCT(*(p-1))) {
14663             p--;
14664             name_len--;
14665         }
14666
14667         if (p < e && isPUNCT(*p)) {
14668             if (*p == ']') {
14669                 has_terminating_bracket = TRUE;
14670
14671                 /* If this is a 2nd ']', and the first one is just below this
14672                  * one, consider that to be the real terminator.  This gives a
14673                  * uniform and better positioning for the warning message  */
14674                 if (   possible_end
14675                     && possible_end != (char *) -1
14676                     && *possible_end == ']'
14677                     && name_len && input_text[name_len - 1] == ']')
14678                 {
14679                     name_len--;
14680                     p = possible_end;
14681
14682                     /* And this is actually equivalent to having done the 2nd
14683                      * pass now, so set it to not try again */
14684                     possible_end = (char *) -1;
14685                 }
14686             }
14687             else {
14688                 if (*p == ':') {
14689                     has_terminating_colon = TRUE;
14690                 }
14691                 else if (*p == ';') {
14692                     has_semi_colon = TRUE;
14693                     has_terminating_colon = TRUE;
14694                 }
14695                 p++;
14696             }
14697         }
14698
14699     try_posix:
14700
14701         /* Here, we have a class name to look up.  We can short circuit the
14702          * stuff below for short names that can't possibly be meant to be a
14703          * class name.  (We can do this on the first pass, as any second pass
14704          * will yield an even shorter name) */
14705         if (name_len < 3) {
14706             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14707         }
14708
14709         /* Find which class it is.  Initially switch on the length of the name.
14710          * */
14711         switch (name_len) {
14712             case 4:
14713                 if (memEQs(name_start, 4, "word")) {
14714                     /* this is not POSIX, this is the Perl \w */
14715                     class_number = ANYOF_WORDCHAR;
14716                 }
14717                 break;
14718             case 5:
14719                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
14720                  *                        graph lower print punct space upper
14721                  * Offset 4 gives the best switch position.  */
14722                 switch (name_start[4]) {
14723                     case 'a':
14724                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
14725                             class_number = ANYOF_ALPHA;
14726                         break;
14727                     case 'e':
14728                         if (memBEGINs(name_start, 5, "spac")) /* space */
14729                             class_number = ANYOF_SPACE;
14730                         break;
14731                     case 'h':
14732                         if (memBEGINs(name_start, 5, "grap")) /* graph */
14733                             class_number = ANYOF_GRAPH;
14734                         break;
14735                     case 'i':
14736                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
14737                             class_number = ANYOF_ASCII;
14738                         break;
14739                     case 'k':
14740                         if (memBEGINs(name_start, 5, "blan")) /* blank */
14741                             class_number = ANYOF_BLANK;
14742                         break;
14743                     case 'l':
14744                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
14745                             class_number = ANYOF_CNTRL;
14746                         break;
14747                     case 'm':
14748                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
14749                             class_number = ANYOF_ALPHANUMERIC;
14750                         break;
14751                     case 'r':
14752                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
14753                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
14754                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
14755                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
14756                         break;
14757                     case 't':
14758                         if (memBEGINs(name_start, 5, "digi")) /* digit */
14759                             class_number = ANYOF_DIGIT;
14760                         else if (memBEGINs(name_start, 5, "prin")) /* print */
14761                             class_number = ANYOF_PRINT;
14762                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
14763                             class_number = ANYOF_PUNCT;
14764                         break;
14765                 }
14766                 break;
14767             case 6:
14768                 if (memEQs(name_start, 6, "xdigit"))
14769                     class_number = ANYOF_XDIGIT;
14770                 break;
14771         }
14772
14773         /* If the name exactly matches a posix class name the class number will
14774          * here be set to it, and the input almost certainly was meant to be a
14775          * posix class, so we can skip further checking.  If instead the syntax
14776          * is exactly correct, but the name isn't one of the legal ones, we
14777          * will return that as an error below.  But if neither of these apply,
14778          * it could be that no posix class was intended at all, or that one
14779          * was, but there was a typo.  We tease these apart by doing fuzzy
14780          * matching on the name */
14781         if (class_number == OOB_NAMEDCLASS && found_problem) {
14782             const UV posix_names[][6] = {
14783                                                 { 'a', 'l', 'n', 'u', 'm' },
14784                                                 { 'a', 'l', 'p', 'h', 'a' },
14785                                                 { 'a', 's', 'c', 'i', 'i' },
14786                                                 { 'b', 'l', 'a', 'n', 'k' },
14787                                                 { 'c', 'n', 't', 'r', 'l' },
14788                                                 { 'd', 'i', 'g', 'i', 't' },
14789                                                 { 'g', 'r', 'a', 'p', 'h' },
14790                                                 { 'l', 'o', 'w', 'e', 'r' },
14791                                                 { 'p', 'r', 'i', 'n', 't' },
14792                                                 { 'p', 'u', 'n', 'c', 't' },
14793                                                 { 's', 'p', 'a', 'c', 'e' },
14794                                                 { 'u', 'p', 'p', 'e', 'r' },
14795                                                 { 'w', 'o', 'r', 'd' },
14796                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
14797                                             };
14798             /* The names of the above all have added NULs to make them the same
14799              * size, so we need to also have the real lengths */
14800             const UV posix_name_lengths[] = {
14801                                                 sizeof("alnum") - 1,
14802                                                 sizeof("alpha") - 1,
14803                                                 sizeof("ascii") - 1,
14804                                                 sizeof("blank") - 1,
14805                                                 sizeof("cntrl") - 1,
14806                                                 sizeof("digit") - 1,
14807                                                 sizeof("graph") - 1,
14808                                                 sizeof("lower") - 1,
14809                                                 sizeof("print") - 1,
14810                                                 sizeof("punct") - 1,
14811                                                 sizeof("space") - 1,
14812                                                 sizeof("upper") - 1,
14813                                                 sizeof("word")  - 1,
14814                                                 sizeof("xdigit")- 1
14815                                             };
14816             unsigned int i;
14817             int temp_max = max_distance;    /* Use a temporary, so if we
14818                                                reparse, we haven't changed the
14819                                                outer one */
14820
14821             /* Use a smaller max edit distance if we are missing one of the
14822              * delimiters */
14823             if (   has_opening_bracket + has_opening_colon < 2
14824                 || has_terminating_bracket + has_terminating_colon < 2)
14825             {
14826                 temp_max--;
14827             }
14828
14829             /* See if the input name is close to a legal one */
14830             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
14831
14832                 /* Short circuit call if the lengths are too far apart to be
14833                  * able to match */
14834                 if (abs( (int) (name_len - posix_name_lengths[i]))
14835                     > temp_max)
14836                 {
14837                     continue;
14838                 }
14839
14840                 if (edit_distance(input_text,
14841                                   posix_names[i],
14842                                   name_len,
14843                                   posix_name_lengths[i],
14844                                   temp_max
14845                                  )
14846                     > -1)
14847                 { /* If it is close, it probably was intended to be a class */
14848                     goto probably_meant_to_be;
14849                 }
14850             }
14851
14852             /* Here the input name is not close enough to a valid class name
14853              * for us to consider it to be intended to be a posix class.  If
14854              * we haven't already done so, and the parse found a character that
14855              * could have been terminators for the name, but which we absorbed
14856              * as typos during the first pass, repeat the parse, signalling it
14857              * to stop at that character */
14858             if (possible_end && possible_end != (char *) -1) {
14859                 possible_end = (char *) -1;
14860                 p = name_start;
14861                 goto parse_name;
14862             }
14863
14864             /* Here neither pass found a close-enough class name */
14865             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14866         }
14867
14868     probably_meant_to_be:
14869
14870         /* Here we think that a posix specification was intended.  Update any
14871          * parse pointer */
14872         if (updated_parse_ptr) {
14873             *updated_parse_ptr = (char *) p;
14874         }
14875
14876         /* If a posix class name was intended but incorrectly specified, we
14877          * output or return the warnings */
14878         if (found_problem) {
14879
14880             /* We set flags for these issues in the parse loop above instead of
14881              * adding them to the list of warnings, because we can parse it
14882              * twice, and we only want one warning instance */
14883             if (has_upper) {
14884                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
14885             }
14886             if (has_blank) {
14887                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14888             }
14889             if (has_semi_colon) {
14890                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14891             }
14892             else if (! has_terminating_colon) {
14893                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
14894             }
14895             if (! has_terminating_bracket) {
14896                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
14897             }
14898
14899             if (posix_warnings && RExC_warn_text && av_top_index(RExC_warn_text) > -1) {
14900                 *posix_warnings = RExC_warn_text;
14901             }
14902         }
14903         else if (class_number != OOB_NAMEDCLASS) {
14904             /* If it is a known class, return the class.  The class number
14905              * #defines are structured so each complement is +1 to the normal
14906              * one */
14907             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
14908         }
14909         else if (! check_only) {
14910
14911             /* Here, it is an unrecognized class.  This is an error (unless the
14912             * call is to check only, which we've already handled above) */
14913             const char * const complement_string = (complement)
14914                                                    ? "^"
14915                                                    : "";
14916             RExC_parse = (char *) p;
14917             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
14918                         complement_string,
14919                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
14920         }
14921     }
14922
14923     return OOB_NAMEDCLASS;
14924 }
14925 #undef ADD_POSIX_WARNING
14926
14927 STATIC unsigned  int
14928 S_regex_set_precedence(const U8 my_operator) {
14929
14930     /* Returns the precedence in the (?[...]) construct of the input operator,
14931      * specified by its character representation.  The precedence follows
14932      * general Perl rules, but it extends this so that ')' and ']' have (low)
14933      * precedence even though they aren't really operators */
14934
14935     switch (my_operator) {
14936         case '!':
14937             return 5;
14938         case '&':
14939             return 4;
14940         case '^':
14941         case '|':
14942         case '+':
14943         case '-':
14944             return 3;
14945         case ')':
14946             return 2;
14947         case ']':
14948             return 1;
14949     }
14950
14951     NOT_REACHED; /* NOTREACHED */
14952     return 0;   /* Silence compiler warning */
14953 }
14954
14955 STATIC regnode *
14956 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
14957                     I32 *flagp, U32 depth,
14958                     char * const oregcomp_parse)
14959 {
14960     /* Handle the (?[...]) construct to do set operations */
14961
14962     U8 curchar;                     /* Current character being parsed */
14963     UV start, end;                  /* End points of code point ranges */
14964     SV* final = NULL;               /* The end result inversion list */
14965     SV* result_string;              /* 'final' stringified */
14966     AV* stack;                      /* stack of operators and operands not yet
14967                                        resolved */
14968     AV* fence_stack = NULL;         /* A stack containing the positions in
14969                                        'stack' of where the undealt-with left
14970                                        parens would be if they were actually
14971                                        put there */
14972     /* The 'volatile' is a workaround for an optimiser bug
14973      * in Solaris Studio 12.3. See RT #127455 */
14974     volatile IV fence = 0;          /* Position of where most recent undealt-
14975                                        with left paren in stack is; -1 if none.
14976                                      */
14977     STRLEN len;                     /* Temporary */
14978     regnode* node;                  /* Temporary, and final regnode returned by
14979                                        this function */
14980     const bool save_fold = FOLD;    /* Temporary */
14981     char *save_end, *save_parse;    /* Temporaries */
14982     const bool in_locale = LOC;     /* we turn off /l during processing */
14983     AV* posix_warnings = NULL;
14984
14985     GET_RE_DEBUG_FLAGS_DECL;
14986
14987     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
14988
14989     DEBUG_PARSE("xcls");
14990
14991     if (in_locale) {
14992         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
14993     }
14994
14995     REQUIRE_UNI_RULES(flagp, NULL);   /* The use of this operator implies /u.
14996                                          This is required so that the compile
14997                                          time values are valid in all runtime
14998                                          cases */
14999
15000     /* This will return only an ANYOF regnode, or (unlikely) something smaller
15001      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
15002      * call regclass to handle '[]' so as to not have to reinvent its parsing
15003      * rules here (throwing away the size it computes each time).  And, we exit
15004      * upon an unescaped ']' that isn't one ending a regclass.  To do both
15005      * these things, we need to realize that something preceded by a backslash
15006      * is escaped, so we have to keep track of backslashes */
15007     if (SIZE_ONLY) {
15008         UV nest_depth = 0; /* how many nested (?[...]) constructs */
15009
15010         while (RExC_parse < RExC_end) {
15011             SV* current = NULL;
15012
15013             skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15014                                     TRUE /* Force /x */ );
15015
15016             switch (*RExC_parse) {
15017                 case '(':
15018                     if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
15019                         nest_depth++, RExC_parse+=2;
15020                     /* FALLTHROUGH */
15021                 default:
15022                     break;
15023                 case '\\':
15024                     /* Skip past this, so the next character gets skipped, after
15025                      * the switch */
15026                     RExC_parse++;
15027                     if (*RExC_parse == 'c') {
15028                             /* Skip the \cX notation for control characters */
15029                             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
15030                     }
15031                     break;
15032
15033                 case '[':
15034                 {
15035                     /* See if this is a [:posix:] class. */
15036                     bool is_posix_class = (OOB_NAMEDCLASS
15037                             < handle_possible_posix(pRExC_state,
15038                                                 RExC_parse + 1,
15039                                                 NULL,
15040                                                 NULL,
15041                                                 TRUE /* checking only */));
15042                     /* If it is a posix class, leave the parse pointer at the
15043                      * '[' to fool regclass() into thinking it is part of a
15044                      * '[[:posix:]]'. */
15045                     if (! is_posix_class) {
15046                         RExC_parse++;
15047                     }
15048
15049                     /* regclass() can only return RESTART_PASS1 and NEED_UTF8
15050                      * if multi-char folds are allowed.  */
15051                     if (!regclass(pRExC_state, flagp,depth+1,
15052                                   is_posix_class, /* parse the whole char
15053                                                      class only if not a
15054                                                      posix class */
15055                                   FALSE, /* don't allow multi-char folds */
15056                                   TRUE, /* silence non-portable warnings. */
15057                                   TRUE, /* strict */
15058                                   FALSE, /* Require return to be an ANYOF */
15059                                   &current,
15060                                   &posix_warnings
15061                                  ))
15062                         FAIL2("panic: regclass returned NULL to handle_sets, "
15063                               "flags=%#" UVxf, (UV) *flagp);
15064
15065                     /* function call leaves parse pointing to the ']', except
15066                      * if we faked it */
15067                     if (is_posix_class) {
15068                         RExC_parse--;
15069                     }
15070
15071                     SvREFCNT_dec(current);   /* In case it returned something */
15072                     break;
15073                 }
15074
15075                 case ']':
15076                     if (RExC_parse[1] == ')') {
15077                         RExC_parse++;
15078                         if (nest_depth--) break;
15079                         node = reganode(pRExC_state, ANYOF, 0);
15080                         RExC_size += ANYOF_SKIP;
15081                         nextchar(pRExC_state);
15082                         Set_Node_Length(node,
15083                                 RExC_parse - oregcomp_parse + 1); /* MJD */
15084                         if (in_locale) {
15085                             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
15086                         }
15087
15088                         return node;
15089                     }
15090                     /* We output the messages even if warnings are off, because we'll fail
15091                      * the very next thing, and these give a likely diagnosis for that */
15092                     if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
15093                         output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
15094                     }
15095                     RExC_parse++;
15096                     vFAIL("Unexpected ']' with no following ')' in (?[...");
15097             }
15098
15099             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
15100         }
15101
15102         /* We output the messages even if warnings are off, because we'll fail
15103          * the very next thing, and these give a likely diagnosis for that */
15104         if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
15105             output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
15106         }
15107
15108         vFAIL("Syntax error in (?[...])");
15109     }
15110
15111     /* Pass 2 only after this. */
15112     Perl_ck_warner_d(aTHX_
15113         packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
15114         "The regex_sets feature is experimental" REPORT_LOCATION,
15115         REPORT_LOCATION_ARGS(RExC_parse));
15116
15117     /* Everything in this construct is a metacharacter.  Operands begin with
15118      * either a '\' (for an escape sequence), or a '[' for a bracketed
15119      * character class.  Any other character should be an operator, or
15120      * parenthesis for grouping.  Both types of operands are handled by calling
15121      * regclass() to parse them.  It is called with a parameter to indicate to
15122      * return the computed inversion list.  The parsing here is implemented via
15123      * a stack.  Each entry on the stack is a single character representing one
15124      * of the operators; or else a pointer to an operand inversion list. */
15125
15126 #define IS_OPERATOR(a) SvIOK(a)
15127 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
15128
15129     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
15130      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
15131      * with pronouncing it called it Reverse Polish instead, but now that YOU
15132      * know how to pronounce it you can use the correct term, thus giving due
15133      * credit to the person who invented it, and impressing your geek friends.
15134      * Wikipedia says that the pronounciation of "Ł" has been changing so that
15135      * it is now more like an English initial W (as in wonk) than an L.)
15136      *
15137      * This means that, for example, 'a | b & c' is stored on the stack as
15138      *
15139      * c  [4]
15140      * b  [3]
15141      * &  [2]
15142      * a  [1]
15143      * |  [0]
15144      *
15145      * where the numbers in brackets give the stack [array] element number.
15146      * In this implementation, parentheses are not stored on the stack.
15147      * Instead a '(' creates a "fence" so that the part of the stack below the
15148      * fence is invisible except to the corresponding ')' (this allows us to
15149      * replace testing for parens, by using instead subtraction of the fence
15150      * position).  As new operands are processed they are pushed onto the stack
15151      * (except as noted in the next paragraph).  New operators of higher
15152      * precedence than the current final one are inserted on the stack before
15153      * the lhs operand (so that when the rhs is pushed next, everything will be
15154      * in the correct positions shown above.  When an operator of equal or
15155      * lower precedence is encountered in parsing, all the stacked operations
15156      * of equal or higher precedence are evaluated, leaving the result as the
15157      * top entry on the stack.  This makes higher precedence operations
15158      * evaluate before lower precedence ones, and causes operations of equal
15159      * precedence to left associate.
15160      *
15161      * The only unary operator '!' is immediately pushed onto the stack when
15162      * encountered.  When an operand is encountered, if the top of the stack is
15163      * a '!", the complement is immediately performed, and the '!' popped.  The
15164      * resulting value is treated as a new operand, and the logic in the
15165      * previous paragraph is executed.  Thus in the expression
15166      *      [a] + ! [b]
15167      * the stack looks like
15168      *
15169      * !
15170      * a
15171      * +
15172      *
15173      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
15174      * becomes
15175      *
15176      * !b
15177      * a
15178      * +
15179      *
15180      * A ')' is treated as an operator with lower precedence than all the
15181      * aforementioned ones, which causes all operations on the stack above the
15182      * corresponding '(' to be evaluated down to a single resultant operand.
15183      * Then the fence for the '(' is removed, and the operand goes through the
15184      * algorithm above, without the fence.
15185      *
15186      * A separate stack is kept of the fence positions, so that the position of
15187      * the latest so-far unbalanced '(' is at the top of it.
15188      *
15189      * The ']' ending the construct is treated as the lowest operator of all,
15190      * so that everything gets evaluated down to a single operand, which is the
15191      * result */
15192
15193     sv_2mortal((SV *)(stack = newAV()));
15194     sv_2mortal((SV *)(fence_stack = newAV()));
15195
15196     while (RExC_parse < RExC_end) {
15197         I32 top_index;              /* Index of top-most element in 'stack' */
15198         SV** top_ptr;               /* Pointer to top 'stack' element */
15199         SV* current = NULL;         /* To contain the current inversion list
15200                                        operand */
15201         SV* only_to_avoid_leaks;
15202
15203         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15204                                 TRUE /* Force /x */ );
15205         if (RExC_parse >= RExC_end) {
15206             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
15207         }
15208
15209         curchar = UCHARAT(RExC_parse);
15210
15211 redo_curchar:
15212
15213 #ifdef ENABLE_REGEX_SETS_DEBUGGING
15214                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
15215         DEBUG_U(dump_regex_sets_structures(pRExC_state,
15216                                            stack, fence, fence_stack));
15217 #endif
15218
15219         top_index = av_tindex_skip_len_mg(stack);
15220
15221         switch (curchar) {
15222             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
15223             char stacked_operator;  /* The topmost operator on the 'stack'. */
15224             SV* lhs;                /* Operand to the left of the operator */
15225             SV* rhs;                /* Operand to the right of the operator */
15226             SV* fence_ptr;          /* Pointer to top element of the fence
15227                                        stack */
15228
15229             case '(':
15230
15231                 if (   RExC_parse < RExC_end - 1
15232                     && (UCHARAT(RExC_parse + 1) == '?'))
15233                 {
15234                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
15235                      * This happens when we have some thing like
15236                      *
15237                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
15238                      *   ...
15239                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
15240                      *
15241                      * Here we would be handling the interpolated
15242                      * '$thai_or_lao'.  We handle this by a recursive call to
15243                      * ourselves which returns the inversion list the
15244                      * interpolated expression evaluates to.  We use the flags
15245                      * from the interpolated pattern. */
15246                     U32 save_flags = RExC_flags;
15247                     const char * save_parse;
15248
15249                     RExC_parse += 2;        /* Skip past the '(?' */
15250                     save_parse = RExC_parse;
15251
15252                     /* Parse any flags for the '(?' */
15253                     parse_lparen_question_flags(pRExC_state);
15254
15255                     if (RExC_parse == save_parse  /* Makes sure there was at
15256                                                      least one flag (or else
15257                                                      this embedding wasn't
15258                                                      compiled) */
15259                         || RExC_parse >= RExC_end - 4
15260                         || UCHARAT(RExC_parse) != ':'
15261                         || UCHARAT(++RExC_parse) != '('
15262                         || UCHARAT(++RExC_parse) != '?'
15263                         || UCHARAT(++RExC_parse) != '[')
15264                     {
15265
15266                         /* In combination with the above, this moves the
15267                          * pointer to the point just after the first erroneous
15268                          * character (or if there are no flags, to where they
15269                          * should have been) */
15270                         if (RExC_parse >= RExC_end - 4) {
15271                             RExC_parse = RExC_end;
15272                         }
15273                         else if (RExC_parse != save_parse) {
15274                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15275                         }
15276                         vFAIL("Expecting '(?flags:(?[...'");
15277                     }
15278
15279                     /* Recurse, with the meat of the embedded expression */
15280                     RExC_parse++;
15281                     (void) handle_regex_sets(pRExC_state, &current, flagp,
15282                                                     depth+1, oregcomp_parse);
15283
15284                     /* Here, 'current' contains the embedded expression's
15285                      * inversion list, and RExC_parse points to the trailing
15286                      * ']'; the next character should be the ')' */
15287                     RExC_parse++;
15288                     if (UCHARAT(RExC_parse) != ')')
15289                         vFAIL("Expecting close paren for nested extended charclass");
15290
15291                     /* Then the ')' matching the original '(' handled by this
15292                      * case: statement */
15293                     RExC_parse++;
15294                     if (UCHARAT(RExC_parse) != ')')
15295                         vFAIL("Expecting close paren for wrapper for nested extended charclass");
15296
15297                     RExC_parse++;
15298                     RExC_flags = save_flags;
15299                     goto handle_operand;
15300                 }
15301
15302                 /* A regular '('.  Look behind for illegal syntax */
15303                 if (top_index - fence >= 0) {
15304                     /* If the top entry on the stack is an operator, it had
15305                      * better be a '!', otherwise the entry below the top
15306                      * operand should be an operator */
15307                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
15308                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
15309                         || (   IS_OPERAND(*top_ptr)
15310                             && (   top_index - fence < 1
15311                                 || ! (stacked_ptr = av_fetch(stack,
15312                                                              top_index - 1,
15313                                                              FALSE))
15314                                 || ! IS_OPERATOR(*stacked_ptr))))
15315                     {
15316                         RExC_parse++;
15317                         vFAIL("Unexpected '(' with no preceding operator");
15318                     }
15319                 }
15320
15321                 /* Stack the position of this undealt-with left paren */
15322                 av_push(fence_stack, newSViv(fence));
15323                 fence = top_index + 1;
15324                 break;
15325
15326             case '\\':
15327                 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
15328                  * multi-char folds are allowed.  */
15329                 if (!regclass(pRExC_state, flagp,depth+1,
15330                               TRUE, /* means parse just the next thing */
15331                               FALSE, /* don't allow multi-char folds */
15332                               FALSE, /* don't silence non-portable warnings.  */
15333                               TRUE,  /* strict */
15334                               FALSE, /* Require return to be an ANYOF */
15335                               &current,
15336                               NULL))
15337                 {
15338                     FAIL2("panic: regclass returned NULL to handle_sets, "
15339                           "flags=%#" UVxf, (UV) *flagp);
15340                 }
15341
15342                 /* regclass() will return with parsing just the \ sequence,
15343                  * leaving the parse pointer at the next thing to parse */
15344                 RExC_parse--;
15345                 goto handle_operand;
15346
15347             case '[':   /* Is a bracketed character class */
15348             {
15349                 /* See if this is a [:posix:] class. */
15350                 bool is_posix_class = (OOB_NAMEDCLASS
15351                             < handle_possible_posix(pRExC_state,
15352                                                 RExC_parse + 1,
15353                                                 NULL,
15354                                                 NULL,
15355                                                 TRUE /* checking only */));
15356                 /* If it is a posix class, leave the parse pointer at the '['
15357                  * to fool regclass() into thinking it is part of a
15358                  * '[[:posix:]]'. */
15359                 if (! is_posix_class) {
15360                     RExC_parse++;
15361                 }
15362
15363                 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
15364                  * multi-char folds are allowed.  */
15365                 if (!regclass(pRExC_state, flagp,depth+1,
15366                                 is_posix_class, /* parse the whole char
15367                                                     class only if not a
15368                                                     posix class */
15369                                 FALSE, /* don't allow multi-char folds */
15370                                 TRUE, /* silence non-portable warnings. */
15371                                 TRUE, /* strict */
15372                                 FALSE, /* Require return to be an ANYOF */
15373                                 &current,
15374                                 NULL
15375                                 ))
15376                 {
15377                     FAIL2("panic: regclass returned NULL to handle_sets, "
15378                           "flags=%#" UVxf, (UV) *flagp);
15379                 }
15380
15381                 /* function call leaves parse pointing to the ']', except if we
15382                  * faked it */
15383                 if (is_posix_class) {
15384                     RExC_parse--;
15385                 }
15386
15387                 goto handle_operand;
15388             }
15389
15390             case ']':
15391                 if (top_index >= 1) {
15392                     goto join_operators;
15393                 }
15394
15395                 /* Only a single operand on the stack: are done */
15396                 goto done;
15397
15398             case ')':
15399                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
15400                     RExC_parse++;
15401                     vFAIL("Unexpected ')'");
15402                 }
15403
15404                 /* If nothing after the fence, is missing an operand */
15405                 if (top_index - fence < 0) {
15406                     RExC_parse++;
15407                     goto bad_syntax;
15408                 }
15409                 /* If at least two things on the stack, treat this as an
15410                   * operator */
15411                 if (top_index - fence >= 1) {
15412                     goto join_operators;
15413                 }
15414
15415                 /* Here only a single thing on the fenced stack, and there is a
15416                  * fence.  Get rid of it */
15417                 fence_ptr = av_pop(fence_stack);
15418                 assert(fence_ptr);
15419                 fence = SvIV(fence_ptr) - 1;
15420                 SvREFCNT_dec_NN(fence_ptr);
15421                 fence_ptr = NULL;
15422
15423                 if (fence < 0) {
15424                     fence = 0;
15425                 }
15426
15427                 /* Having gotten rid of the fence, we pop the operand at the
15428                  * stack top and process it as a newly encountered operand */
15429                 current = av_pop(stack);
15430                 if (IS_OPERAND(current)) {
15431                     goto handle_operand;
15432                 }
15433
15434                 RExC_parse++;
15435                 goto bad_syntax;
15436
15437             case '&':
15438             case '|':
15439             case '+':
15440             case '-':
15441             case '^':
15442
15443                 /* These binary operators should have a left operand already
15444                  * parsed */
15445                 if (   top_index - fence < 0
15446                     || top_index - fence == 1
15447                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
15448                     || ! IS_OPERAND(*top_ptr))
15449                 {
15450                     goto unexpected_binary;
15451                 }
15452
15453                 /* If only the one operand is on the part of the stack visible
15454                  * to us, we just place this operator in the proper position */
15455                 if (top_index - fence < 2) {
15456
15457                     /* Place the operator before the operand */
15458
15459                     SV* lhs = av_pop(stack);
15460                     av_push(stack, newSVuv(curchar));
15461                     av_push(stack, lhs);
15462                     break;
15463                 }
15464
15465                 /* But if there is something else on the stack, we need to
15466                  * process it before this new operator if and only if the
15467                  * stacked operation has equal or higher precedence than the
15468                  * new one */
15469
15470              join_operators:
15471
15472                 /* The operator on the stack is supposed to be below both its
15473                  * operands */
15474                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
15475                     || IS_OPERAND(*stacked_ptr))
15476                 {
15477                     /* But if not, it's legal and indicates we are completely
15478                      * done if and only if we're currently processing a ']',
15479                      * which should be the final thing in the expression */
15480                     if (curchar == ']') {
15481                         goto done;
15482                     }
15483
15484                   unexpected_binary:
15485                     RExC_parse++;
15486                     vFAIL2("Unexpected binary operator '%c' with no "
15487                            "preceding operand", curchar);
15488                 }
15489                 stacked_operator = (char) SvUV(*stacked_ptr);
15490
15491                 if (regex_set_precedence(curchar)
15492                     > regex_set_precedence(stacked_operator))
15493                 {
15494                     /* Here, the new operator has higher precedence than the
15495                      * stacked one.  This means we need to add the new one to
15496                      * the stack to await its rhs operand (and maybe more
15497                      * stuff).  We put it before the lhs operand, leaving
15498                      * untouched the stacked operator and everything below it
15499                      * */
15500                     lhs = av_pop(stack);
15501                     assert(IS_OPERAND(lhs));
15502
15503                     av_push(stack, newSVuv(curchar));
15504                     av_push(stack, lhs);
15505                     break;
15506                 }
15507
15508                 /* Here, the new operator has equal or lower precedence than
15509                  * what's already there.  This means the operation already
15510                  * there should be performed now, before the new one. */
15511
15512                 rhs = av_pop(stack);
15513                 if (! IS_OPERAND(rhs)) {
15514
15515                     /* This can happen when a ! is not followed by an operand,
15516                      * like in /(?[\t &!])/ */
15517                     goto bad_syntax;
15518                 }
15519
15520                 lhs = av_pop(stack);
15521
15522                 if (! IS_OPERAND(lhs)) {
15523
15524                     /* This can happen when there is an empty (), like in
15525                      * /(?[[0]+()+])/ */
15526                     goto bad_syntax;
15527                 }
15528
15529                 switch (stacked_operator) {
15530                     case '&':
15531                         _invlist_intersection(lhs, rhs, &rhs);
15532                         break;
15533
15534                     case '|':
15535                     case '+':
15536                         _invlist_union(lhs, rhs, &rhs);
15537                         break;
15538
15539                     case '-':
15540                         _invlist_subtract(lhs, rhs, &rhs);
15541                         break;
15542
15543                     case '^':   /* The union minus the intersection */
15544                     {
15545                         SV* i = NULL;
15546                         SV* u = NULL;
15547
15548                         _invlist_union(lhs, rhs, &u);
15549                         _invlist_intersection(lhs, rhs, &i);
15550                         _invlist_subtract(u, i, &rhs);
15551                         SvREFCNT_dec_NN(i);
15552                         SvREFCNT_dec_NN(u);
15553                         break;
15554                     }
15555                 }
15556                 SvREFCNT_dec(lhs);
15557
15558                 /* Here, the higher precedence operation has been done, and the
15559                  * result is in 'rhs'.  We overwrite the stacked operator with
15560                  * the result.  Then we redo this code to either push the new
15561                  * operator onto the stack or perform any higher precedence
15562                  * stacked operation */
15563                 only_to_avoid_leaks = av_pop(stack);
15564                 SvREFCNT_dec(only_to_avoid_leaks);
15565                 av_push(stack, rhs);
15566                 goto redo_curchar;
15567
15568             case '!':   /* Highest priority, right associative */
15569
15570                 /* If what's already at the top of the stack is another '!",
15571                  * they just cancel each other out */
15572                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
15573                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
15574                 {
15575                     only_to_avoid_leaks = av_pop(stack);
15576                     SvREFCNT_dec(only_to_avoid_leaks);
15577                 }
15578                 else { /* Otherwise, since it's right associative, just push
15579                           onto the stack */
15580                     av_push(stack, newSVuv(curchar));
15581                 }
15582                 break;
15583
15584             default:
15585                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15586                 vFAIL("Unexpected character");
15587
15588           handle_operand:
15589
15590             /* Here 'current' is the operand.  If something is already on the
15591              * stack, we have to check if it is a !.  But first, the code above
15592              * may have altered the stack in the time since we earlier set
15593              * 'top_index'.  */
15594
15595             top_index = av_tindex_skip_len_mg(stack);
15596             if (top_index - fence >= 0) {
15597                 /* If the top entry on the stack is an operator, it had better
15598                  * be a '!', otherwise the entry below the top operand should
15599                  * be an operator */
15600                 top_ptr = av_fetch(stack, top_index, FALSE);
15601                 assert(top_ptr);
15602                 if (IS_OPERATOR(*top_ptr)) {
15603
15604                     /* The only permissible operator at the top of the stack is
15605                      * '!', which is applied immediately to this operand. */
15606                     curchar = (char) SvUV(*top_ptr);
15607                     if (curchar != '!') {
15608                         SvREFCNT_dec(current);
15609                         vFAIL2("Unexpected binary operator '%c' with no "
15610                                 "preceding operand", curchar);
15611                     }
15612
15613                     _invlist_invert(current);
15614
15615                     only_to_avoid_leaks = av_pop(stack);
15616                     SvREFCNT_dec(only_to_avoid_leaks);
15617
15618                     /* And we redo with the inverted operand.  This allows
15619                      * handling multiple ! in a row */
15620                     goto handle_operand;
15621                 }
15622                           /* Single operand is ok only for the non-binary ')'
15623                            * operator */
15624                 else if ((top_index - fence == 0 && curchar != ')')
15625                          || (top_index - fence > 0
15626                              && (! (stacked_ptr = av_fetch(stack,
15627                                                            top_index - 1,
15628                                                            FALSE))
15629                                  || IS_OPERAND(*stacked_ptr))))
15630                 {
15631                     SvREFCNT_dec(current);
15632                     vFAIL("Operand with no preceding operator");
15633                 }
15634             }
15635
15636             /* Here there was nothing on the stack or the top element was
15637              * another operand.  Just add this new one */
15638             av_push(stack, current);
15639
15640         } /* End of switch on next parse token */
15641
15642         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15643     } /* End of loop parsing through the construct */
15644
15645   done:
15646     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
15647         vFAIL("Unmatched (");
15648     }
15649
15650     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
15651         || ((final = av_pop(stack)) == NULL)
15652         || ! IS_OPERAND(final)
15653         || SvTYPE(final) != SVt_INVLIST
15654         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
15655     {
15656       bad_syntax:
15657         SvREFCNT_dec(final);
15658         vFAIL("Incomplete expression within '(?[ ])'");
15659     }
15660
15661     /* Here, 'final' is the resultant inversion list from evaluating the
15662      * expression.  Return it if so requested */
15663     if (return_invlist) {
15664         *return_invlist = final;
15665         return END;
15666     }
15667
15668     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
15669      * expecting a string of ranges and individual code points */
15670     invlist_iterinit(final);
15671     result_string = newSVpvs("");
15672     while (invlist_iternext(final, &start, &end)) {
15673         if (start == end) {
15674             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
15675         }
15676         else {
15677             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
15678                                                      start,          end);
15679         }
15680     }
15681
15682     /* About to generate an ANYOF (or similar) node from the inversion list we
15683      * have calculated */
15684     save_parse = RExC_parse;
15685     RExC_parse = SvPV(result_string, len);
15686     save_end = RExC_end;
15687     RExC_end = RExC_parse + len;
15688
15689     /* We turn off folding around the call, as the class we have constructed
15690      * already has all folding taken into consideration, and we don't want
15691      * regclass() to add to that */
15692     RExC_flags &= ~RXf_PMf_FOLD;
15693     /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
15694      * folds are allowed.  */
15695     node = regclass(pRExC_state, flagp,depth+1,
15696                     FALSE, /* means parse the whole char class */
15697                     FALSE, /* don't allow multi-char folds */
15698                     TRUE, /* silence non-portable warnings.  The above may very
15699                              well have generated non-portable code points, but
15700                              they're valid on this machine */
15701                     FALSE, /* similarly, no need for strict */
15702                     FALSE, /* Require return to be an ANYOF */
15703                     NULL,
15704                     NULL
15705                 );
15706     if (!node)
15707         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#" UVxf,
15708                     PTR2UV(flagp));
15709
15710     /* Fix up the node type if we are in locale.  (We have pretended we are
15711      * under /u for the purposes of regclass(), as this construct will only
15712      * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
15713      * as to cause any warnings about bad locales to be output in regexec.c),
15714      * and add the flag that indicates to check if not in a UTF-8 locale.  The
15715      * reason we above forbid optimization into something other than an ANYOF
15716      * node is simply to minimize the number of code changes in regexec.c.
15717      * Otherwise we would have to create new EXACTish node types and deal with
15718      * them.  This decision could be revisited should this construct become
15719      * popular.
15720      *
15721      * (One might think we could look at the resulting ANYOF node and suppress
15722      * the flag if everything is above 255, as those would be UTF-8 only,
15723      * but this isn't true, as the components that led to that result could
15724      * have been locale-affected, and just happen to cancel each other out
15725      * under UTF-8 locales.) */
15726     if (in_locale) {
15727         set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
15728
15729         assert(OP(node) == ANYOF);
15730
15731         OP(node) = ANYOFL;
15732         ANYOF_FLAGS(node)
15733                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
15734     }
15735
15736     if (save_fold) {
15737         RExC_flags |= RXf_PMf_FOLD;
15738     }
15739
15740     RExC_parse = save_parse + 1;
15741     RExC_end = save_end;
15742     SvREFCNT_dec_NN(final);
15743     SvREFCNT_dec_NN(result_string);
15744
15745     nextchar(pRExC_state);
15746     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
15747     return node;
15748 }
15749
15750 #ifdef ENABLE_REGEX_SETS_DEBUGGING
15751
15752 STATIC void
15753 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
15754                              AV * stack, const IV fence, AV * fence_stack)
15755 {   /* Dumps the stacks in handle_regex_sets() */
15756
15757     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
15758     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
15759     SSize_t i;
15760
15761     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
15762
15763     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
15764
15765     if (stack_top < 0) {
15766         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
15767     }
15768     else {
15769         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
15770         for (i = stack_top; i >= 0; i--) {
15771             SV ** element_ptr = av_fetch(stack, i, FALSE);
15772             if (! element_ptr) {
15773             }
15774
15775             if (IS_OPERATOR(*element_ptr)) {
15776                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
15777                                             (int) i, (int) SvIV(*element_ptr));
15778             }
15779             else {
15780                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
15781                 sv_dump(*element_ptr);
15782             }
15783         }
15784     }
15785
15786     if (fence_stack_top < 0) {
15787         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
15788     }
15789     else {
15790         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
15791         for (i = fence_stack_top; i >= 0; i--) {
15792             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
15793             if (! element_ptr) {
15794             }
15795
15796             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
15797                                             (int) i, (int) SvIV(*element_ptr));
15798         }
15799     }
15800 }
15801
15802 #endif
15803
15804 #undef IS_OPERATOR
15805 #undef IS_OPERAND
15806
15807 STATIC void
15808 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
15809 {
15810     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
15811      * innocent-looking character class, like /[ks]/i won't have to go out to
15812      * disk to find the possible matches.
15813      *
15814      * This should be called only for a Latin1-range code points, cp, which is
15815      * known to be involved in a simple fold with other code points above
15816      * Latin1.  It would give false results if /aa has been specified.
15817      * Multi-char folds are outside the scope of this, and must be handled
15818      * specially.
15819      *
15820      * XXX It would be better to generate these via regen, in case a new
15821      * version of the Unicode standard adds new mappings, though that is not
15822      * really likely, and may be caught by the default: case of the switch
15823      * below. */
15824
15825     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
15826
15827     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
15828
15829     switch (cp) {
15830         case 'k':
15831         case 'K':
15832           *invlist =
15833              add_cp_to_invlist(*invlist, KELVIN_SIGN);
15834             break;
15835         case 's':
15836         case 'S':
15837           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
15838             break;
15839         case MICRO_SIGN:
15840           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
15841           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
15842             break;
15843         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
15844         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
15845           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
15846             break;
15847         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
15848           *invlist = add_cp_to_invlist(*invlist,
15849                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
15850             break;
15851
15852 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
15853
15854         case LATIN_SMALL_LETTER_SHARP_S:
15855           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
15856             break;
15857
15858 #endif
15859
15860 #if    UNICODE_MAJOR_VERSION < 3                                        \
15861    || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
15862
15863         /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
15864          * U+0131.  */
15865         case 'i':
15866         case 'I':
15867           *invlist =
15868              add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
15869 #   if UNICODE_DOT_DOT_VERSION == 1
15870           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
15871 #   endif
15872             break;
15873 #endif
15874
15875         default:
15876             /* Use deprecated warning to increase the chances of this being
15877              * output */
15878             if (PASS2) {
15879                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
15880             }
15881             break;
15882     }
15883 }
15884
15885 STATIC void
15886 S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
15887 {
15888     /* If the final parameter is NULL, output the elements of the array given
15889      * by '*posix_warnings' as REGEXP warnings.  Otherwise, the elements are
15890      * pushed onto it, (creating if necessary) */
15891
15892     SV * msg;
15893     const bool first_is_fatal =  ! return_posix_warnings
15894                                 && ckDEAD(packWARN(WARN_REGEXP));
15895
15896     PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
15897
15898     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
15899         if (return_posix_warnings) {
15900             if (! *return_posix_warnings) { /* mortalize to not leak if
15901                                                warnings are fatal */
15902                 *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
15903             }
15904             av_push(*return_posix_warnings, msg);
15905         }
15906         else {
15907             if (first_is_fatal) {           /* Avoid leaking this */
15908                 av_undef(posix_warnings);   /* This isn't necessary if the
15909                                                array is mortal, but is a
15910                                                fail-safe */
15911                 (void) sv_2mortal(msg);
15912                 if (PASS2) {
15913                     SAVEFREESV(RExC_rx_sv);
15914                 }
15915             }
15916             Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
15917             SvREFCNT_dec_NN(msg);
15918         }
15919     }
15920 }
15921
15922 STATIC AV *
15923 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
15924 {
15925     /* This adds the string scalar <multi_string> to the array
15926      * <multi_char_matches>.  <multi_string> is known to have exactly
15927      * <cp_count> code points in it.  This is used when constructing a
15928      * bracketed character class and we find something that needs to match more
15929      * than a single character.
15930      *
15931      * <multi_char_matches> is actually an array of arrays.  Each top-level
15932      * element is an array that contains all the strings known so far that are
15933      * the same length.  And that length (in number of code points) is the same
15934      * as the index of the top-level array.  Hence, the [2] element is an
15935      * array, each element thereof is a string containing TWO code points;
15936      * while element [3] is for strings of THREE characters, and so on.  Since
15937      * this is for multi-char strings there can never be a [0] nor [1] element.
15938      *
15939      * When we rewrite the character class below, we will do so such that the
15940      * longest strings are written first, so that it prefers the longest
15941      * matching strings first.  This is done even if it turns out that any
15942      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
15943      * Christiansen has agreed that this is ok.  This makes the test for the
15944      * ligature 'ffi' come before the test for 'ff', for example */
15945
15946     AV* this_array;
15947     AV** this_array_ptr;
15948
15949     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
15950
15951     if (! multi_char_matches) {
15952         multi_char_matches = newAV();
15953     }
15954
15955     if (av_exists(multi_char_matches, cp_count)) {
15956         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
15957         this_array = *this_array_ptr;
15958     }
15959     else {
15960         this_array = newAV();
15961         av_store(multi_char_matches, cp_count,
15962                  (SV*) this_array);
15963     }
15964     av_push(this_array, multi_string);
15965
15966     return multi_char_matches;
15967 }
15968
15969 /* The names of properties whose definitions are not known at compile time are
15970  * stored in this SV, after a constant heading.  So if the length has been
15971  * changed since initialization, then there is a run-time definition. */
15972 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
15973                                         (SvCUR(listsv) != initial_listsv_len)
15974
15975 /* There is a restricted set of white space characters that are legal when
15976  * ignoring white space in a bracketed character class.  This generates the
15977  * code to skip them.
15978  *
15979  * There is a line below that uses the same white space criteria but is outside
15980  * this macro.  Both here and there must use the same definition */
15981 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
15982     STMT_START {                                                        \
15983         if (do_skip) {                                                  \
15984             while (isBLANK_A(UCHARAT(p)))                               \
15985             {                                                           \
15986                 p++;                                                    \
15987             }                                                           \
15988         }                                                               \
15989     } STMT_END
15990
15991 STATIC regnode *
15992 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
15993                  const bool stop_at_1,  /* Just parse the next thing, don't
15994                                            look for a full character class */
15995                  bool allow_multi_folds,
15996                  const bool silence_non_portable,   /* Don't output warnings
15997                                                        about too large
15998                                                        characters */
15999                  const bool strict,
16000                  bool optimizable,                  /* ? Allow a non-ANYOF return
16001                                                        node */
16002                  SV** ret_invlist, /* Return an inversion list, not a node */
16003                  AV** return_posix_warnings
16004           )
16005 {
16006     /* parse a bracketed class specification.  Most of these will produce an
16007      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
16008      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
16009      * under /i with multi-character folds: it will be rewritten following the
16010      * paradigm of this example, where the <multi-fold>s are characters which
16011      * fold to multiple character sequences:
16012      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
16013      * gets effectively rewritten as:
16014      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
16015      * reg() gets called (recursively) on the rewritten version, and this
16016      * function will return what it constructs.  (Actually the <multi-fold>s
16017      * aren't physically removed from the [abcdefghi], it's just that they are
16018      * ignored in the recursion by means of a flag:
16019      * <RExC_in_multi_char_class>.)
16020      *
16021      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
16022      * characters, with the corresponding bit set if that character is in the
16023      * list.  For characters above this, a range list or swash is used.  There
16024      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
16025      * determinable at compile time
16026      *
16027      * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
16028      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
16029      * to UTF-8.  This can only happen if ret_invlist is non-NULL.
16030      */
16031
16032     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
16033     IV range = 0;
16034     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
16035     regnode *ret;
16036     STRLEN numlen;
16037     int namedclass = OOB_NAMEDCLASS;
16038     char *rangebegin = NULL;
16039     bool need_class = 0;
16040     SV *listsv = NULL;
16041     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
16042                                       than just initialized.  */
16043     SV* properties = NULL;    /* Code points that match \p{} \P{} */
16044     SV* posixes = NULL;     /* Code points that match classes like [:word:],
16045                                extended beyond the Latin1 range.  These have to
16046                                be kept separate from other code points for much
16047                                of this function because their handling  is
16048                                different under /i, and for most classes under
16049                                /d as well */
16050     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
16051                                separate for a while from the non-complemented
16052                                versions because of complications with /d
16053                                matching */
16054     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
16055                                   treated more simply than the general case,
16056                                   leading to less compilation and execution
16057                                   work */
16058     UV element_count = 0;   /* Number of distinct elements in the class.
16059                                Optimizations may be possible if this is tiny */
16060     AV * multi_char_matches = NULL; /* Code points that fold to more than one
16061                                        character; used under /i */
16062     UV n;
16063     char * stop_ptr = RExC_end;    /* where to stop parsing */
16064
16065     /* ignore unescaped whitespace? */
16066     const bool skip_white = cBOOL(   ret_invlist
16067                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
16068
16069     /* Unicode properties are stored in a swash; this holds the current one
16070      * being parsed.  If this swash is the only above-latin1 component of the
16071      * character class, an optimization is to pass it directly on to the
16072      * execution engine.  Otherwise, it is set to NULL to indicate that there
16073      * are other things in the class that have to be dealt with at execution
16074      * time */
16075     SV* swash = NULL;           /* Code points that match \p{} \P{} */
16076
16077     /* Set if a component of this character class is user-defined; just passed
16078      * on to the engine */
16079     bool has_user_defined_property = FALSE;
16080
16081     /* inversion list of code points this node matches only when the target
16082      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
16083      * /d) */
16084     SV* has_upper_latin1_only_utf8_matches = NULL;
16085
16086     /* Inversion list of code points this node matches regardless of things
16087      * like locale, folding, utf8ness of the target string */
16088     SV* cp_list = NULL;
16089
16090     /* Like cp_list, but code points on this list need to be checked for things
16091      * that fold to/from them under /i */
16092     SV* cp_foldable_list = NULL;
16093
16094     /* Like cp_list, but code points on this list are valid only when the
16095      * runtime locale is UTF-8 */
16096     SV* only_utf8_locale_list = NULL;
16097
16098     /* In a range, if one of the endpoints is non-character-set portable,
16099      * meaning that it hard-codes a code point that may mean a different
16100      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
16101      * mnemonic '\t' which each mean the same character no matter which
16102      * character set the platform is on. */
16103     unsigned int non_portable_endpoint = 0;
16104
16105     /* Is the range unicode? which means on a platform that isn't 1-1 native
16106      * to Unicode (i.e. non-ASCII), each code point in it should be considered
16107      * to be a Unicode value.  */
16108     bool unicode_range = FALSE;
16109     bool invert = FALSE;    /* Is this class to be complemented */
16110
16111     bool warn_super = ALWAYS_WARN_SUPER;
16112
16113     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
16114         case we need to change the emitted regop to an EXACT. */
16115     const char * orig_parse = RExC_parse;
16116     const SSize_t orig_size = RExC_size;
16117     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
16118
16119     /* This variable is used to mark where the end in the input is of something
16120      * that looks like a POSIX construct but isn't.  During the parse, when
16121      * something looks like it could be such a construct is encountered, it is
16122      * checked for being one, but not if we've already checked this area of the
16123      * input.  Only after this position is reached do we check again */
16124     char *not_posix_region_end = RExC_parse - 1;
16125
16126     AV* posix_warnings = NULL;
16127     const bool do_posix_warnings =     return_posix_warnings
16128                                    || (PASS2 && ckWARN(WARN_REGEXP));
16129
16130     GET_RE_DEBUG_FLAGS_DECL;
16131
16132     PERL_ARGS_ASSERT_REGCLASS;
16133 #ifndef DEBUGGING
16134     PERL_UNUSED_ARG(depth);
16135 #endif
16136
16137     DEBUG_PARSE("clas");
16138
16139 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
16140     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
16141                                    && UNICODE_DOT_DOT_VERSION == 0)
16142     allow_multi_folds = FALSE;
16143 #endif
16144
16145     /* Assume we are going to generate an ANYOF node. */
16146     ret = reganode(pRExC_state,
16147                    (LOC)
16148                     ? ANYOFL
16149                     : ANYOF,
16150                    0);
16151
16152     if (SIZE_ONLY) {
16153         RExC_size += ANYOF_SKIP;
16154         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
16155     }
16156     else {
16157         ANYOF_FLAGS(ret) = 0;
16158
16159         RExC_emit += ANYOF_SKIP;
16160         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
16161         initial_listsv_len = SvCUR(listsv);
16162         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
16163     }
16164
16165     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16166
16167     assert(RExC_parse <= RExC_end);
16168
16169     if (UCHARAT(RExC_parse) == '^') {   /* Complement the class */
16170         RExC_parse++;
16171         invert = TRUE;
16172         allow_multi_folds = FALSE;
16173         MARK_NAUGHTY(1);
16174         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16175     }
16176
16177     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
16178     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
16179         int maybe_class = handle_possible_posix(pRExC_state,
16180                                                 RExC_parse,
16181                                                 &not_posix_region_end,
16182                                                 NULL,
16183                                                 TRUE /* checking only */);
16184         if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
16185             SAVEFREESV(RExC_rx_sv);
16186             ckWARN4reg(not_posix_region_end,
16187                     "POSIX syntax [%c %c] belongs inside character classes%s",
16188                     *RExC_parse, *RExC_parse,
16189                     (maybe_class == OOB_NAMEDCLASS)
16190                     ? ((POSIXCC_NOTYET(*RExC_parse))
16191                         ? " (but this one isn't implemented)"
16192                         : " (but this one isn't fully valid)")
16193                     : ""
16194                     );
16195             (void)ReREFCNT_inc(RExC_rx_sv);
16196         }
16197     }
16198
16199     /* If the caller wants us to just parse a single element, accomplish this
16200      * by faking the loop ending condition */
16201     if (stop_at_1 && RExC_end > RExC_parse) {
16202         stop_ptr = RExC_parse + 1;
16203     }
16204
16205     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
16206     if (UCHARAT(RExC_parse) == ']')
16207         goto charclassloop;
16208
16209     while (1) {
16210
16211         if (   posix_warnings
16212             && av_tindex_skip_len_mg(posix_warnings) >= 0
16213             && RExC_parse > not_posix_region_end)
16214         {
16215             /* Warnings about posix class issues are considered tentative until
16216              * we are far enough along in the parse that we can no longer
16217              * change our mind, at which point we either output them or add
16218              * them, if it has so specified, to what gets returned to the
16219              * caller.  This is done each time through the loop so that a later
16220              * class won't zap them before they have been dealt with. */
16221             output_or_return_posix_warnings(pRExC_state, posix_warnings,
16222                                             return_posix_warnings);
16223         }
16224
16225         if  (RExC_parse >= stop_ptr) {
16226             break;
16227         }
16228
16229         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16230
16231         if  (UCHARAT(RExC_parse) == ']') {
16232             break;
16233         }
16234
16235       charclassloop:
16236
16237         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
16238         save_value = value;
16239         save_prevvalue = prevvalue;
16240
16241         if (!range) {
16242             rangebegin = RExC_parse;
16243             element_count++;
16244             non_portable_endpoint = 0;
16245         }
16246         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
16247             value = utf8n_to_uvchr((U8*)RExC_parse,
16248                                    RExC_end - RExC_parse,
16249                                    &numlen, UTF8_ALLOW_DEFAULT);
16250             RExC_parse += numlen;
16251         }
16252         else
16253             value = UCHARAT(RExC_parse++);
16254
16255         if (value == '[') {
16256             char * posix_class_end;
16257             namedclass = handle_possible_posix(pRExC_state,
16258                                                RExC_parse,
16259                                                &posix_class_end,
16260                                                do_posix_warnings ? &posix_warnings : NULL,
16261                                                FALSE    /* die if error */);
16262             if (namedclass > OOB_NAMEDCLASS) {
16263
16264                 /* If there was an earlier attempt to parse this particular
16265                  * posix class, and it failed, it was a false alarm, as this
16266                  * successful one proves */
16267                 if (   posix_warnings
16268                     && av_tindex_skip_len_mg(posix_warnings) >= 0
16269                     && not_posix_region_end >= RExC_parse
16270                     && not_posix_region_end <= posix_class_end)
16271                 {
16272                     av_undef(posix_warnings);
16273                 }
16274
16275                 RExC_parse = posix_class_end;
16276             }
16277             else if (namedclass == OOB_NAMEDCLASS) {
16278                 not_posix_region_end = posix_class_end;
16279             }
16280             else {
16281                 namedclass = OOB_NAMEDCLASS;
16282             }
16283         }
16284         else if (   RExC_parse - 1 > not_posix_region_end
16285                  && MAYBE_POSIXCC(value))
16286         {
16287             (void) handle_possible_posix(
16288                         pRExC_state,
16289                         RExC_parse - 1,  /* -1 because parse has already been
16290                                             advanced */
16291                         &not_posix_region_end,
16292                         do_posix_warnings ? &posix_warnings : NULL,
16293                         TRUE /* checking only */);
16294         }
16295         else if (  strict && ! skip_white
16296                  && (   _generic_isCC(value, _CC_VERTSPACE)
16297                      || is_VERTWS_cp_high(value)))
16298         {
16299             vFAIL("Literal vertical space in [] is illegal except under /x");
16300         }
16301         else if (value == '\\') {
16302             /* Is a backslash; get the code point of the char after it */
16303
16304             if (RExC_parse >= RExC_end) {
16305                 vFAIL("Unmatched [");
16306             }
16307
16308             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
16309                 value = utf8n_to_uvchr((U8*)RExC_parse,
16310                                    RExC_end - RExC_parse,
16311                                    &numlen, UTF8_ALLOW_DEFAULT);
16312                 RExC_parse += numlen;
16313             }
16314             else
16315                 value = UCHARAT(RExC_parse++);
16316
16317             /* Some compilers cannot handle switching on 64-bit integer
16318              * values, therefore value cannot be an UV.  Yes, this will
16319              * be a problem later if we want switch on Unicode.
16320              * A similar issue a little bit later when switching on
16321              * namedclass. --jhi */
16322
16323             /* If the \ is escaping white space when white space is being
16324              * skipped, it means that that white space is wanted literally, and
16325              * is already in 'value'.  Otherwise, need to translate the escape
16326              * into what it signifies. */
16327             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
16328
16329             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
16330             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
16331             case 's':   namedclass = ANYOF_SPACE;       break;
16332             case 'S':   namedclass = ANYOF_NSPACE;      break;
16333             case 'd':   namedclass = ANYOF_DIGIT;       break;
16334             case 'D':   namedclass = ANYOF_NDIGIT;      break;
16335             case 'v':   namedclass = ANYOF_VERTWS;      break;
16336             case 'V':   namedclass = ANYOF_NVERTWS;     break;
16337             case 'h':   namedclass = ANYOF_HORIZWS;     break;
16338             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
16339             case 'N':  /* Handle \N{NAME} in class */
16340                 {
16341                     const char * const backslash_N_beg = RExC_parse - 2;
16342                     int cp_count;
16343
16344                     if (! grok_bslash_N(pRExC_state,
16345                                         NULL,      /* No regnode */
16346                                         &value,    /* Yes single value */
16347                                         &cp_count, /* Multiple code pt count */
16348                                         flagp,
16349                                         strict,
16350                                         depth)
16351                     ) {
16352
16353                         if (*flagp & NEED_UTF8)
16354                             FAIL("panic: grok_bslash_N set NEED_UTF8");
16355                         if (*flagp & RESTART_PASS1)
16356                             return NULL;
16357
16358                         if (cp_count < 0) {
16359                             vFAIL("\\N in a character class must be a named character: \\N{...}");
16360                         }
16361                         else if (cp_count == 0) {
16362                             if (PASS2) {
16363                                 ckWARNreg(RExC_parse,
16364                                         "Ignoring zero length \\N{} in character class");
16365                             }
16366                         }
16367                         else { /* cp_count > 1 */
16368                             if (! RExC_in_multi_char_class) {
16369                                 if (invert || range || *RExC_parse == '-') {
16370                                     if (strict) {
16371                                         RExC_parse--;
16372                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
16373                                     }
16374                                     else if (PASS2) {
16375                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
16376                                     }
16377                                     break; /* <value> contains the first code
16378                                               point. Drop out of the switch to
16379                                               process it */
16380                                 }
16381                                 else {
16382                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
16383                                                  RExC_parse - backslash_N_beg);
16384                                     multi_char_matches
16385                                         = add_multi_match(multi_char_matches,
16386                                                           multi_char_N,
16387                                                           cp_count);
16388                                 }
16389                             }
16390                         } /* End of cp_count != 1 */
16391
16392                         /* This element should not be processed further in this
16393                          * class */
16394                         element_count--;
16395                         value = save_value;
16396                         prevvalue = save_prevvalue;
16397                         continue;   /* Back to top of loop to get next char */
16398                     }
16399
16400                     /* Here, is a single code point, and <value> contains it */
16401                     unicode_range = TRUE;   /* \N{} are Unicode */
16402                 }
16403                 break;
16404             case 'p':
16405             case 'P':
16406                 {
16407                 char *e;
16408
16409                 /* We will handle any undefined properties ourselves */
16410                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
16411                                        /* And we actually would prefer to get
16412                                         * the straight inversion list of the
16413                                         * swash, since we will be accessing it
16414                                         * anyway, to save a little time */
16415                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
16416
16417                 if (RExC_parse >= RExC_end)
16418                     vFAIL2("Empty \\%c", (U8)value);
16419                 if (*RExC_parse == '{') {
16420                     const U8 c = (U8)value;
16421                     e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
16422                     if (!e) {
16423                         RExC_parse++;
16424                         vFAIL2("Missing right brace on \\%c{}", c);
16425                     }
16426
16427                     RExC_parse++;
16428                     while (isSPACE(*RExC_parse)) {
16429                          RExC_parse++;
16430                     }
16431
16432                     if (UCHARAT(RExC_parse) == '^') {
16433
16434                         /* toggle.  (The rhs xor gets the single bit that
16435                          * differs between P and p; the other xor inverts just
16436                          * that bit) */
16437                         value ^= 'P' ^ 'p';
16438
16439                         RExC_parse++;
16440                         while (isSPACE(*RExC_parse)) {
16441                             RExC_parse++;
16442                         }
16443                     }
16444
16445                     if (e == RExC_parse)
16446                         vFAIL2("Empty \\%c{}", c);
16447
16448                     n = e - RExC_parse;
16449                     while (isSPACE(*(RExC_parse + n - 1)))
16450                         n--;
16451                 }   /* The \p isn't immediately followed by a '{' */
16452                 else if (! isALPHA(*RExC_parse)) {
16453                     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16454                     vFAIL2("Character following \\%c must be '{' or a "
16455                            "single-character Unicode property name",
16456                            (U8) value);
16457                 }
16458                 else {
16459                     e = RExC_parse;
16460                     n = 1;
16461                 }
16462                 if (!SIZE_ONLY) {
16463                     SV* invlist;
16464                     char* name;
16465                     char* base_name;    /* name after any packages are stripped */
16466                     char* lookup_name = NULL;
16467                     const char * const colon_colon = "::";
16468
16469                     /* Try to get the definition of the property into
16470                      * <invlist>.  If /i is in effect, the effective property
16471                      * will have its name be <__NAME_i>.  The design is
16472                      * discussed in commit
16473                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
16474                     name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
16475                     SAVEFREEPV(name);
16476                     if (FOLD) {
16477                         lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
16478
16479                         /* The function call just below that uses this can fail
16480                          * to return, leaking memory if we don't do this */
16481                         SAVEFREEPV(lookup_name);
16482                     }
16483
16484                     /* Look up the property name, and get its swash and
16485                      * inversion list, if the property is found  */
16486                     SvREFCNT_dec(swash); /* Free any left-overs */
16487                     swash = _core_swash_init("utf8",
16488                                              (lookup_name)
16489                                               ? lookup_name
16490                                               : name,
16491                                              &PL_sv_undef,
16492                                              1, /* binary */
16493                                              0, /* not tr/// */
16494                                              NULL, /* No inversion list */
16495                                              &swash_init_flags
16496                                             );
16497                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
16498                         HV* curpkg = (IN_PERL_COMPILETIME)
16499                                       ? PL_curstash
16500                                       : CopSTASH(PL_curcop);
16501                         UV final_n = n;
16502                         bool has_pkg;
16503
16504                         if (swash) {    /* Got a swash but no inversion list.
16505                                            Something is likely wrong that will
16506                                            be sorted-out later */
16507                             SvREFCNT_dec_NN(swash);
16508                             swash = NULL;
16509                         }
16510
16511                         /* Here didn't find it.  It could be a an error (like a
16512                          * typo) in specifying a Unicode property, or it could
16513                          * be a user-defined property that will be available at
16514                          * run-time.  The names of these must begin with 'In'
16515                          * or 'Is' (after any packages are stripped off).  So
16516                          * if not one of those, or if we accept only
16517                          * compile-time properties, is an error; otherwise add
16518                          * it to the list for run-time look up. */
16519                         if ((base_name = rninstr(name, name + n,
16520                                                  colon_colon, colon_colon + 2)))
16521                         { /* Has ::.  We know this must be a user-defined
16522                              property */
16523                             base_name += 2;
16524                             final_n -= base_name - name;
16525                             has_pkg = TRUE;
16526                         }
16527                         else {
16528                             base_name = name;
16529                             has_pkg = FALSE;
16530                         }
16531
16532                         if (   final_n < 3
16533                             || base_name[0] != 'I'
16534                             || (base_name[1] != 's' && base_name[1] != 'n')
16535                             || ret_invlist)
16536                         {
16537                             const char * const msg
16538                                 = (has_pkg)
16539                                   ? "Illegal user-defined property name"
16540                                   : "Can't find Unicode property definition";
16541                             RExC_parse = e + 1;
16542
16543                             /* diag_listed_as: Can't find Unicode property definition "%s" */
16544                             vFAIL3utf8f("%s \"%" UTF8f "\"",
16545                                 msg, UTF8fARG(UTF, n, name));
16546                         }
16547
16548                         /* If the property name doesn't already have a package
16549                          * name, add the current one to it so that it can be
16550                          * referred to outside it. [perl #121777] */
16551                         if (! has_pkg && curpkg) {
16552                             char* pkgname = HvNAME(curpkg);
16553                             if (memNEs(pkgname, HvNAMELEN(curpkg), "main")) {
16554                                 char* full_name = Perl_form(aTHX_
16555                                                             "%s::%s",
16556                                                             pkgname,
16557                                                             name);
16558                                 n = strlen(full_name);
16559                                 name = savepvn(full_name, n);
16560                                 SAVEFREEPV(name);
16561                             }
16562                         }
16563                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n",
16564                                         (value == 'p' ? '+' : '!'),
16565                                         (FOLD) ? "__" : "",
16566                                         UTF8fARG(UTF, n, name),
16567                                         (FOLD) ? "_i" : "");
16568                         has_user_defined_property = TRUE;
16569                         optimizable = FALSE;    /* Will have to leave this an
16570                                                    ANYOF node */
16571
16572                         /* We don't know yet what this matches, so have to flag
16573                          * it */
16574                         ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
16575                     }
16576                     else {
16577
16578                         /* Here, did get the swash and its inversion list.  If
16579                          * the swash is from a user-defined property, then this
16580                          * whole character class should be regarded as such */
16581                         if (swash_init_flags
16582                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
16583                         {
16584                             has_user_defined_property = TRUE;
16585                         }
16586                         else if
16587                             /* We warn on matching an above-Unicode code point
16588                              * if the match would return true, except don't
16589                              * warn for \p{All}, which has exactly one element
16590                              * = 0 */
16591                             (_invlist_contains_cp(invlist, 0x110000)
16592                                 && (! (_invlist_len(invlist) == 1
16593                                        && *invlist_array(invlist) == 0)))
16594                         {
16595                             warn_super = TRUE;
16596                         }
16597
16598
16599                         /* Invert if asking for the complement */
16600                         if (value == 'P') {
16601                             _invlist_union_complement_2nd(properties,
16602                                                           invlist,
16603                                                           &properties);
16604
16605                             /* The swash can't be used as-is, because we've
16606                              * inverted things; delay removing it to here after
16607                              * have copied its invlist above */
16608                             SvREFCNT_dec_NN(swash);
16609                             swash = NULL;
16610                         }
16611                         else {
16612                             _invlist_union(properties, invlist, &properties);
16613                         }
16614                     }
16615                 }
16616                 RExC_parse = e + 1;
16617                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
16618                                                 named */
16619
16620                 /* \p means they want Unicode semantics */
16621                 REQUIRE_UNI_RULES(flagp, NULL);
16622                 }
16623                 break;
16624             case 'n':   value = '\n';                   break;
16625             case 'r':   value = '\r';                   break;
16626             case 't':   value = '\t';                   break;
16627             case 'f':   value = '\f';                   break;
16628             case 'b':   value = '\b';                   break;
16629             case 'e':   value = ESC_NATIVE;             break;
16630             case 'a':   value = '\a';                   break;
16631             case 'o':
16632                 RExC_parse--;   /* function expects to be pointed at the 'o' */
16633                 {
16634                     const char* error_msg;
16635                     bool valid = grok_bslash_o(&RExC_parse,
16636                                                RExC_end,
16637                                                &value,
16638                                                &error_msg,
16639                                                PASS2,   /* warnings only in
16640                                                            pass 2 */
16641                                                strict,
16642                                                silence_non_portable,
16643                                                UTF);
16644                     if (! valid) {
16645                         vFAIL(error_msg);
16646                     }
16647                 }
16648                 non_portable_endpoint++;
16649                 break;
16650             case 'x':
16651                 RExC_parse--;   /* function expects to be pointed at the 'x' */
16652                 {
16653                     const char* error_msg;
16654                     bool valid = grok_bslash_x(&RExC_parse,
16655                                                RExC_end,
16656                                                &value,
16657                                                &error_msg,
16658                                                PASS2, /* Output warnings */
16659                                                strict,
16660                                                silence_non_portable,
16661                                                UTF);
16662                     if (! valid) {
16663                         vFAIL(error_msg);
16664                     }
16665                 }
16666                 non_portable_endpoint++;
16667                 break;
16668             case 'c':
16669                 value = grok_bslash_c(*RExC_parse++, PASS2);
16670                 non_portable_endpoint++;
16671                 break;
16672             case '0': case '1': case '2': case '3': case '4':
16673             case '5': case '6': case '7':
16674                 {
16675                     /* Take 1-3 octal digits */
16676                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
16677                     numlen = (strict) ? 4 : 3;
16678                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
16679                     RExC_parse += numlen;
16680                     if (numlen != 3) {
16681                         if (strict) {
16682                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16683                             vFAIL("Need exactly 3 octal digits");
16684                         }
16685                         else if (! SIZE_ONLY /* like \08, \178 */
16686                                  && numlen < 3
16687                                  && RExC_parse < RExC_end
16688                                  && isDIGIT(*RExC_parse)
16689                                  && ckWARN(WARN_REGEXP))
16690                         {
16691                             SAVEFREESV(RExC_rx_sv);
16692                             reg_warn_non_literal_string(
16693                                  RExC_parse + 1,
16694                                  form_short_octal_warning(RExC_parse, numlen));
16695                             (void)ReREFCNT_inc(RExC_rx_sv);
16696                         }
16697                     }
16698                     non_portable_endpoint++;
16699                     break;
16700                 }
16701             default:
16702                 /* Allow \_ to not give an error */
16703                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
16704                     if (strict) {
16705                         vFAIL2("Unrecognized escape \\%c in character class",
16706                                (int)value);
16707                     }
16708                     else {
16709                         SAVEFREESV(RExC_rx_sv);
16710                         ckWARN2reg(RExC_parse,
16711                             "Unrecognized escape \\%c in character class passed through",
16712                             (int)value);
16713                         (void)ReREFCNT_inc(RExC_rx_sv);
16714                     }
16715                 }
16716                 break;
16717             }   /* End of switch on char following backslash */
16718         } /* end of handling backslash escape sequences */
16719
16720         /* Here, we have the current token in 'value' */
16721
16722         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
16723             U8 classnum;
16724
16725             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
16726              * literal, as is the character that began the false range, i.e.
16727              * the 'a' in the examples */
16728             if (range) {
16729                 if (!SIZE_ONLY) {
16730                     const int w = (RExC_parse >= rangebegin)
16731                                   ? RExC_parse - rangebegin
16732                                   : 0;
16733                     if (strict) {
16734                         vFAIL2utf8f(
16735                             "False [] range \"%" UTF8f "\"",
16736                             UTF8fARG(UTF, w, rangebegin));
16737                     }
16738                     else {
16739                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
16740                         ckWARN2reg(RExC_parse,
16741                             "False [] range \"%" UTF8f "\"",
16742                             UTF8fARG(UTF, w, rangebegin));
16743                         (void)ReREFCNT_inc(RExC_rx_sv);
16744                         cp_list = add_cp_to_invlist(cp_list, '-');
16745                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
16746                                                              prevvalue);
16747                     }
16748                 }
16749
16750                 range = 0; /* this was not a true range */
16751                 element_count += 2; /* So counts for three values */
16752             }
16753
16754             classnum = namedclass_to_classnum(namedclass);
16755
16756             if (LOC && namedclass < ANYOF_POSIXL_MAX
16757 #ifndef HAS_ISASCII
16758                 && classnum != _CC_ASCII
16759 #endif
16760             ) {
16761                 /* What the Posix classes (like \w, [:space:]) match in locale
16762                  * isn't knowable under locale until actual match time.  Room
16763                  * must be reserved (one time per outer bracketed class) to
16764                  * store such classes.  The space will contain a bit for each
16765                  * named class that is to be matched against.  This isn't
16766                  * needed for \p{} and pseudo-classes, as they are not affected
16767                  * by locale, and hence are dealt with separately */
16768                 if (! need_class) {
16769                     need_class = 1;
16770                     if (SIZE_ONLY) {
16771                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16772                     }
16773                     else {
16774                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16775                     }
16776                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
16777                     ANYOF_POSIXL_ZERO(ret);
16778
16779                     /* We can't change this into some other type of node
16780                      * (unless this is the only element, in which case there
16781                      * are nodes that mean exactly this) as has runtime
16782                      * dependencies */
16783                     optimizable = FALSE;
16784                 }
16785
16786                 /* Coverity thinks it is possible for this to be negative; both
16787                  * jhi and khw think it's not, but be safer */
16788                 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16789                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
16790
16791                 /* See if it already matches the complement of this POSIX
16792                  * class */
16793                 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16794                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
16795                                                             ? -1
16796                                                             : 1)))
16797                 {
16798                     posixl_matches_all = TRUE;
16799                     break;  /* No need to continue.  Since it matches both
16800                                e.g., \w and \W, it matches everything, and the
16801                                bracketed class can be optimized into qr/./s */
16802                 }
16803
16804                 /* Add this class to those that should be checked at runtime */
16805                 ANYOF_POSIXL_SET(ret, namedclass);
16806
16807                 /* The above-Latin1 characters are not subject to locale rules.
16808                  * Just add them, in the second pass, to the
16809                  * unconditionally-matched list */
16810                 if (! SIZE_ONLY) {
16811                     SV* scratch_list = NULL;
16812
16813                     /* Get the list of the above-Latin1 code points this
16814                      * matches */
16815                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
16816                                           PL_XPosix_ptrs[classnum],
16817
16818                                           /* Odd numbers are complements, like
16819                                            * NDIGIT, NASCII, ... */
16820                                           namedclass % 2 != 0,
16821                                           &scratch_list);
16822                     /* Checking if 'cp_list' is NULL first saves an extra
16823                      * clone.  Its reference count will be decremented at the
16824                      * next union, etc, or if this is the only instance, at the
16825                      * end of the routine */
16826                     if (! cp_list) {
16827                         cp_list = scratch_list;
16828                     }
16829                     else {
16830                         _invlist_union(cp_list, scratch_list, &cp_list);
16831                         SvREFCNT_dec_NN(scratch_list);
16832                     }
16833                     continue;   /* Go get next character */
16834                 }
16835             }
16836             else if (! SIZE_ONLY) {
16837
16838                 /* Here, not in pass1 (in that pass we skip calculating the
16839                  * contents of this class), and is not /l, or is a POSIX class
16840                  * for which /l doesn't matter (or is a Unicode property, which
16841                  * is skipped here). */
16842                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
16843                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
16844
16845                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
16846                          * nor /l make a difference in what these match,
16847                          * therefore we just add what they match to cp_list. */
16848                         if (classnum != _CC_VERTSPACE) {
16849                             assert(   namedclass == ANYOF_HORIZWS
16850                                    || namedclass == ANYOF_NHORIZWS);
16851
16852                             /* It turns out that \h is just a synonym for
16853                              * XPosixBlank */
16854                             classnum = _CC_BLANK;
16855                         }
16856
16857                         _invlist_union_maybe_complement_2nd(
16858                                 cp_list,
16859                                 PL_XPosix_ptrs[classnum],
16860                                 namedclass % 2 != 0,    /* Complement if odd
16861                                                           (NHORIZWS, NVERTWS)
16862                                                         */
16863                                 &cp_list);
16864                     }
16865                 }
16866                 else if (  UNI_SEMANTICS
16867                         || classnum == _CC_ASCII
16868                         || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
16869                                                   || classnum == _CC_XDIGIT)))
16870                 {
16871                     /* We usually have to worry about /d and /a affecting what
16872                      * POSIX classes match, with special code needed for /d
16873                      * because we won't know until runtime what all matches.
16874                      * But there is no extra work needed under /u, and
16875                      * [:ascii:] is unaffected by /a and /d; and :digit: and
16876                      * :xdigit: don't have runtime differences under /d.  So we
16877                      * can special case these, and avoid some extra work below,
16878                      * and at runtime. */
16879                     _invlist_union_maybe_complement_2nd(
16880                                                      simple_posixes,
16881                                                      PL_XPosix_ptrs[classnum],
16882                                                      namedclass % 2 != 0,
16883                                                      &simple_posixes);
16884                 }
16885                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
16886                            complement and use nposixes */
16887                     SV** posixes_ptr = namedclass % 2 == 0
16888                                        ? &posixes
16889                                        : &nposixes;
16890                     _invlist_union_maybe_complement_2nd(
16891                                                      *posixes_ptr,
16892                                                      PL_XPosix_ptrs[classnum],
16893                                                      namedclass % 2 != 0,
16894                                                      posixes_ptr);
16895                 }
16896             }
16897         } /* end of namedclass \blah */
16898
16899         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16900
16901         /* If 'range' is set, 'value' is the ending of a range--check its
16902          * validity.  (If value isn't a single code point in the case of a
16903          * range, we should have figured that out above in the code that
16904          * catches false ranges).  Later, we will handle each individual code
16905          * point in the range.  If 'range' isn't set, this could be the
16906          * beginning of a range, so check for that by looking ahead to see if
16907          * the next real character to be processed is the range indicator--the
16908          * minus sign */
16909
16910         if (range) {
16911 #ifdef EBCDIC
16912             /* For unicode ranges, we have to test that the Unicode as opposed
16913              * to the native values are not decreasing.  (Above 255, there is
16914              * no difference between native and Unicode) */
16915             if (unicode_range && prevvalue < 255 && value < 255) {
16916                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
16917                     goto backwards_range;
16918                 }
16919             }
16920             else
16921 #endif
16922             if (prevvalue > value) /* b-a */ {
16923                 int w;
16924 #ifdef EBCDIC
16925               backwards_range:
16926 #endif
16927                 w = RExC_parse - rangebegin;
16928                 vFAIL2utf8f(
16929                     "Invalid [] range \"%" UTF8f "\"",
16930                     UTF8fARG(UTF, w, rangebegin));
16931                 NOT_REACHED; /* NOTREACHED */
16932             }
16933         }
16934         else {
16935             prevvalue = value; /* save the beginning of the potential range */
16936             if (! stop_at_1     /* Can't be a range if parsing just one thing */
16937                 && *RExC_parse == '-')
16938             {
16939                 char* next_char_ptr = RExC_parse + 1;
16940
16941                 /* Get the next real char after the '-' */
16942                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
16943
16944                 /* If the '-' is at the end of the class (just before the ']',
16945                  * it is a literal minus; otherwise it is a range */
16946                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
16947                     RExC_parse = next_char_ptr;
16948
16949                     /* a bad range like \w-, [:word:]- ? */
16950                     if (namedclass > OOB_NAMEDCLASS) {
16951                         if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
16952                             const int w = RExC_parse >= rangebegin
16953                                           ?  RExC_parse - rangebegin
16954                                           : 0;
16955                             if (strict) {
16956                                 vFAIL4("False [] range \"%*.*s\"",
16957                                     w, w, rangebegin);
16958                             }
16959                             else if (PASS2) {
16960                                 vWARN4(RExC_parse,
16961                                     "False [] range \"%*.*s\"",
16962                                     w, w, rangebegin);
16963                             }
16964                         }
16965                         if (!SIZE_ONLY) {
16966                             cp_list = add_cp_to_invlist(cp_list, '-');
16967                         }
16968                         element_count++;
16969                     } else
16970                         range = 1;      /* yeah, it's a range! */
16971                     continue;   /* but do it the next time */
16972                 }
16973             }
16974         }
16975
16976         if (namedclass > OOB_NAMEDCLASS) {
16977             continue;
16978         }
16979
16980         /* Here, we have a single value this time through the loop, and
16981          * <prevvalue> is the beginning of the range, if any; or <value> if
16982          * not. */
16983
16984         /* non-Latin1 code point implies unicode semantics.  Must be set in
16985          * pass1 so is there for the whole of pass 2 */
16986         if (value > 255) {
16987             REQUIRE_UNI_RULES(flagp, NULL);
16988         }
16989
16990         /* Ready to process either the single value, or the completed range.
16991          * For single-valued non-inverted ranges, we consider the possibility
16992          * of multi-char folds.  (We made a conscious decision to not do this
16993          * for the other cases because it can often lead to non-intuitive
16994          * results.  For example, you have the peculiar case that:
16995          *  "s s" =~ /^[^\xDF]+$/i => Y
16996          *  "ss"  =~ /^[^\xDF]+$/i => N
16997          *
16998          * See [perl #89750] */
16999         if (FOLD && allow_multi_folds && value == prevvalue) {
17000             if (value == LATIN_SMALL_LETTER_SHARP_S
17001                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17002                                                         value)))
17003             {
17004                 /* Here <value> is indeed a multi-char fold.  Get what it is */
17005
17006                 U8 foldbuf[UTF8_MAXBYTES_CASE];
17007                 STRLEN foldlen;
17008
17009                 UV folded = _to_uni_fold_flags(
17010                                 value,
17011                                 foldbuf,
17012                                 &foldlen,
17013                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17014                                                    ? FOLD_FLAGS_NOMIX_ASCII
17015                                                    : 0)
17016                                 );
17017
17018                 /* Here, <folded> should be the first character of the
17019                  * multi-char fold of <value>, with <foldbuf> containing the
17020                  * whole thing.  But, if this fold is not allowed (because of
17021                  * the flags), <fold> will be the same as <value>, and should
17022                  * be processed like any other character, so skip the special
17023                  * handling */
17024                 if (folded != value) {
17025
17026                     /* Skip if we are recursed, currently parsing the class
17027                      * again.  Otherwise add this character to the list of
17028                      * multi-char folds. */
17029                     if (! RExC_in_multi_char_class) {
17030                         STRLEN cp_count = utf8_length(foldbuf,
17031                                                       foldbuf + foldlen);
17032                         SV* multi_fold = sv_2mortal(newSVpvs(""));
17033
17034                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17035
17036                         multi_char_matches
17037                                         = add_multi_match(multi_char_matches,
17038                                                           multi_fold,
17039                                                           cp_count);
17040
17041                     }
17042
17043                     /* This element should not be processed further in this
17044                      * class */
17045                     element_count--;
17046                     value = save_value;
17047                     prevvalue = save_prevvalue;
17048                     continue;
17049                 }
17050             }
17051         }
17052
17053         if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
17054             if (range) {
17055
17056                 /* If the range starts above 255, everything is portable and
17057                  * likely to be so for any forseeable character set, so don't
17058                  * warn. */
17059                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
17060                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
17061                 }
17062                 else if (prevvalue != value) {
17063
17064                     /* Under strict, ranges that stop and/or end in an ASCII
17065                      * printable should have each end point be a portable value
17066                      * for it (preferably like 'A', but we don't warn if it is
17067                      * a (portable) Unicode name or code point), and the range
17068                      * must be be all digits or all letters of the same case.
17069                      * Otherwise, the range is non-portable and unclear as to
17070                      * what it contains */
17071                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
17072                         && (          non_portable_endpoint
17073                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
17074                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
17075                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
17076                     ))) {
17077                         vWARN(RExC_parse, "Ranges of ASCII printables should"
17078                                           " be some subset of \"0-9\","
17079                                           " \"A-Z\", or \"a-z\"");
17080                     }
17081                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
17082                         SSize_t index_start;
17083                         SSize_t index_final;
17084
17085                         /* But the nature of Unicode and languages mean we
17086                          * can't do the same checks for above-ASCII ranges,
17087                          * except in the case of digit ones.  These should
17088                          * contain only digits from the same group of 10.  The
17089                          * ASCII case is handled just above.  Hence here, the
17090                          * range could be a range of digits.  First some
17091                          * unlikely special cases.  Grandfather in that a range
17092                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
17093                          * if its starting value is one of the 10 digits prior
17094                          * to it.  This is because it is an alternate way of
17095                          * writing 19D1, and some people may expect it to be in
17096                          * that group.  But it is bad, because it won't give
17097                          * the expected results.  In Unicode 5.2 it was
17098                          * considered to be in that group (of 11, hence), but
17099                          * this was fixed in the next version */
17100
17101                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
17102                             goto warn_bad_digit_range;
17103                         }
17104                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
17105                                           &&     value <= 0x1D7FF))
17106                         {
17107                             /* This is the only other case currently in Unicode
17108                              * where the algorithm below fails.  The code
17109                              * points just above are the end points of a single
17110                              * range containing only decimal digits.  It is 5
17111                              * different series of 0-9.  All other ranges of
17112                              * digits currently in Unicode are just a single
17113                              * series.  (And mktables will notify us if a later
17114                              * Unicode version breaks this.)
17115                              *
17116                              * If the range being checked is at most 9 long,
17117                              * and the digit values represented are in
17118                              * numerical order, they are from the same series.
17119                              * */
17120                             if (         value - prevvalue > 9
17121                                 ||    (((    value - 0x1D7CE) % 10)
17122                                      <= (prevvalue - 0x1D7CE) % 10))
17123                             {
17124                                 goto warn_bad_digit_range;
17125                             }
17126                         }
17127                         else {
17128
17129                             /* For all other ranges of digits in Unicode, the
17130                              * algorithm is just to check if both end points
17131                              * are in the same series, which is the same range.
17132                              * */
17133                             index_start = _invlist_search(
17134                                                     PL_XPosix_ptrs[_CC_DIGIT],
17135                                                     prevvalue);
17136
17137                             /* Warn if the range starts and ends with a digit,
17138                              * and they are not in the same group of 10. */
17139                             if (   index_start >= 0
17140                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
17141                                 && (index_final =
17142                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
17143                                                     value)) != index_start
17144                                 && index_final >= 0
17145                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
17146                             {
17147                               warn_bad_digit_range:
17148                                 vWARN(RExC_parse, "Ranges of digits should be"
17149                                                   " from the same group of"
17150                                                   " 10");
17151                             }
17152                         }
17153                     }
17154                 }
17155             }
17156             if ((! range || prevvalue == value) && non_portable_endpoint) {
17157                 if (isPRINT_A(value)) {
17158                     char literal[3];
17159                     unsigned d = 0;
17160                     if (isBACKSLASHED_PUNCT(value)) {
17161                         literal[d++] = '\\';
17162                     }
17163                     literal[d++] = (char) value;
17164                     literal[d++] = '\0';
17165
17166                     vWARN4(RExC_parse,
17167                            "\"%.*s\" is more clearly written simply as \"%s\"",
17168                            (int) (RExC_parse - rangebegin),
17169                            rangebegin,
17170                            literal
17171                         );
17172                 }
17173                 else if isMNEMONIC_CNTRL(value) {
17174                     vWARN4(RExC_parse,
17175                            "\"%.*s\" is more clearly written simply as \"%s\"",
17176                            (int) (RExC_parse - rangebegin),
17177                            rangebegin,
17178                            cntrl_to_mnemonic((U8) value)
17179                         );
17180                 }
17181             }
17182         }
17183
17184         /* Deal with this element of the class */
17185         if (! SIZE_ONLY) {
17186
17187 #ifndef EBCDIC
17188             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17189                                                      prevvalue, value);
17190 #else
17191             /* On non-ASCII platforms, for ranges that span all of 0..255, and
17192              * ones that don't require special handling, we can just add the
17193              * range like we do for ASCII platforms */
17194             if ((UNLIKELY(prevvalue == 0) && value >= 255)
17195                 || ! (prevvalue < 256
17196                       && (unicode_range
17197                           || (! non_portable_endpoint
17198                               && ((isLOWER_A(prevvalue) && isLOWER_A(value))
17199                                   || (isUPPER_A(prevvalue)
17200                                       && isUPPER_A(value)))))))
17201             {
17202                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17203                                                          prevvalue, value);
17204             }
17205             else {
17206                 /* Here, requires special handling.  This can be because it is
17207                  * a range whose code points are considered to be Unicode, and
17208                  * so must be individually translated into native, or because
17209                  * its a subrange of 'A-Z' or 'a-z' which each aren't
17210                  * contiguous in EBCDIC, but we have defined them to include
17211                  * only the "expected" upper or lower case ASCII alphabetics.
17212                  * Subranges above 255 are the same in native and Unicode, so
17213                  * can be added as a range */
17214                 U8 start = NATIVE_TO_LATIN1(prevvalue);
17215                 unsigned j;
17216                 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
17217                 for (j = start; j <= end; j++) {
17218                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
17219                 }
17220                 if (value > 255) {
17221                     cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17222                                                              256, value);
17223                 }
17224             }
17225 #endif
17226         }
17227
17228         range = 0; /* this range (if it was one) is done now */
17229     } /* End of loop through all the text within the brackets */
17230
17231
17232     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
17233         output_or_return_posix_warnings(pRExC_state, posix_warnings,
17234                                         return_posix_warnings);
17235     }
17236
17237     /* If anything in the class expands to more than one character, we have to
17238      * deal with them by building up a substitute parse string, and recursively
17239      * calling reg() on it, instead of proceeding */
17240     if (multi_char_matches) {
17241         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
17242         I32 cp_count;
17243         STRLEN len;
17244         char *save_end = RExC_end;
17245         char *save_parse = RExC_parse;
17246         char *save_start = RExC_start;
17247         STRLEN prefix_end = 0;      /* We copy the character class after a
17248                                        prefix supplied here.  This is the size
17249                                        + 1 of that prefix */
17250         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
17251                                        a "|" */
17252         I32 reg_flags;
17253
17254         assert(! invert);
17255         assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
17256
17257 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
17258            because too confusing */
17259         if (invert) {
17260             sv_catpv(substitute_parse, "(?:");
17261         }
17262 #endif
17263
17264         /* Look at the longest folds first */
17265         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
17266                         cp_count > 0;
17267                         cp_count--)
17268         {
17269
17270             if (av_exists(multi_char_matches, cp_count)) {
17271                 AV** this_array_ptr;
17272                 SV* this_sequence;
17273
17274                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
17275                                                  cp_count, FALSE);
17276                 while ((this_sequence = av_pop(*this_array_ptr)) !=
17277                                                                 &PL_sv_undef)
17278                 {
17279                     if (! first_time) {
17280                         sv_catpv(substitute_parse, "|");
17281                     }
17282                     first_time = FALSE;
17283
17284                     sv_catpv(substitute_parse, SvPVX(this_sequence));
17285                 }
17286             }
17287         }
17288
17289         /* If the character class contains anything else besides these
17290          * multi-character folds, have to include it in recursive parsing */
17291         if (element_count) {
17292             sv_catpv(substitute_parse, "|[");
17293             prefix_end = SvCUR(substitute_parse);
17294             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
17295
17296             /* Put in a closing ']' only if not going off the end, as otherwise
17297              * we are adding something that really isn't there */
17298             if (RExC_parse < RExC_end) {
17299                 sv_catpv(substitute_parse, "]");
17300             }
17301         }
17302
17303         sv_catpv(substitute_parse, ")");
17304 #if 0
17305         if (invert) {
17306             /* This is a way to get the parse to skip forward a whole named
17307              * sequence instead of matching the 2nd character when it fails the
17308              * first */
17309             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
17310         }
17311 #endif
17312
17313         /* Set up the data structure so that any errors will be properly
17314          * reported.  See the comments at the definition of
17315          * REPORT_LOCATION_ARGS for details */
17316         RExC_precomp_adj = orig_parse - RExC_precomp;
17317         RExC_start =  RExC_parse = SvPV(substitute_parse, len);
17318         RExC_adjusted_start = RExC_start + prefix_end;
17319         RExC_end = RExC_parse + len;
17320         RExC_in_multi_char_class = 1;
17321         RExC_emit = (regnode *)orig_emit;
17322
17323         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
17324
17325         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
17326
17327         /* And restore so can parse the rest of the pattern */
17328         RExC_parse = save_parse;
17329         RExC_start = RExC_adjusted_start = save_start;
17330         RExC_precomp_adj = 0;
17331         RExC_end = save_end;
17332         RExC_in_multi_char_class = 0;
17333         SvREFCNT_dec_NN(multi_char_matches);
17334         return ret;
17335     }
17336
17337     /* Here, we've gone through the entire class and dealt with multi-char
17338      * folds.  We are now in a position that we can do some checks to see if we
17339      * can optimize this ANYOF node into a simpler one, even in Pass 1.
17340      * Currently we only do two checks:
17341      * 1) is in the unlikely event that the user has specified both, eg. \w and
17342      *    \W under /l, then the class matches everything.  (This optimization
17343      *    is done only to make the optimizer code run later work.)
17344      * 2) if the character class contains only a single element (including a
17345      *    single range), we see if there is an equivalent node for it.
17346      * Other checks are possible */
17347     if (   optimizable
17348         && ! ret_invlist   /* Can't optimize if returning the constructed
17349                               inversion list */
17350         && (UNLIKELY(posixl_matches_all) || element_count == 1))
17351     {
17352         U8 op = END;
17353         U8 arg = 0;
17354
17355         if (UNLIKELY(posixl_matches_all)) {
17356             op = SANY;
17357         }
17358         else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named
17359                                                    class, like \w or [:digit:]
17360                                                    or \p{foo} */
17361
17362             /* All named classes are mapped into POSIXish nodes, with its FLAG
17363              * argument giving which class it is */
17364             switch ((I32)namedclass) {
17365                 case ANYOF_UNIPROP:
17366                     break;
17367
17368                 /* These don't depend on the charset modifiers.  They always
17369                  * match under /u rules */
17370                 case ANYOF_NHORIZWS:
17371                 case ANYOF_HORIZWS:
17372                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
17373                     /* FALLTHROUGH */
17374
17375                 case ANYOF_NVERTWS:
17376                 case ANYOF_VERTWS:
17377                     op = POSIXU;
17378                     goto join_posix;
17379
17380                 /* The actual POSIXish node for all the rest depends on the
17381                  * charset modifier.  The ones in the first set depend only on
17382                  * ASCII or, if available on this platform, also locale */
17383
17384                 case ANYOF_ASCII:
17385                 case ANYOF_NASCII:
17386
17387 #ifdef HAS_ISASCII
17388                     if (LOC) {
17389                         op = POSIXL;
17390                         goto join_posix;
17391                     }
17392 #endif
17393                     /* (named_class - ANY_OF_ASCII) is 0 or 1. xor'ing with
17394                      * invert converts that to 1 or 0 */
17395                     op = ASCII + ((namedclass - ANYOF_ASCII) ^ invert);
17396                     break;
17397
17398                 /* The following don't have any matches in the upper Latin1
17399                  * range, hence /d is equivalent to /u for them.  Making it /u
17400                  * saves some branches at runtime */
17401                 case ANYOF_DIGIT:
17402                 case ANYOF_NDIGIT:
17403                 case ANYOF_XDIGIT:
17404                 case ANYOF_NXDIGIT:
17405                     if (! DEPENDS_SEMANTICS) {
17406                         goto treat_as_default;
17407                     }
17408
17409                     op = POSIXU;
17410                     goto join_posix;
17411
17412                 /* The following change to CASED under /i */
17413                 case ANYOF_LOWER:
17414                 case ANYOF_NLOWER:
17415                 case ANYOF_UPPER:
17416                 case ANYOF_NUPPER:
17417                     if (FOLD) {
17418                         namedclass = ANYOF_CASED + (namedclass % 2);
17419                     }
17420                     /* FALLTHROUGH */
17421
17422                 /* The rest have more possibilities depending on the charset.
17423                  * We take advantage of the enum ordering of the charset
17424                  * modifiers to get the exact node type, */
17425                 default:
17426                   treat_as_default:
17427                     op = POSIXD + get_regex_charset(RExC_flags);
17428                     if (op > POSIXA) { /* /aa is same as /a */
17429                         op = POSIXA;
17430                     }
17431
17432                   join_posix:
17433                     /* The odd numbered ones are the complements of the
17434                      * next-lower even number one */
17435                     if (namedclass % 2 == 1) {
17436                         invert = ! invert;
17437                         namedclass--;
17438                     }
17439                     arg = namedclass_to_classnum(namedclass);
17440                     break;
17441             }
17442         }
17443         else if (value == prevvalue) {
17444
17445             /* Here, the class consists of just a single code point */
17446
17447             if (invert) {
17448                 if (! LOC && value == '\n') {
17449                     op = REG_ANY; /* Optimize [^\n] */
17450                     *flagp |= HASWIDTH|SIMPLE;
17451                     MARK_NAUGHTY(1);
17452                 }
17453             }
17454             else if (value < 256 || UTF) {
17455
17456                 /* Optimize a single value into an EXACTish node, but not if it
17457                  * would require converting the pattern to UTF-8. */
17458                 op = compute_EXACTish(pRExC_state);
17459             }
17460         } /* Otherwise is a range */
17461         else if (! LOC) {   /* locale could vary these */
17462             if (prevvalue == '0') {
17463                 if (value == '9') {
17464                     arg = _CC_DIGIT;
17465                     op = POSIXA;
17466                 }
17467             }
17468             else if (! FOLD || ASCII_FOLD_RESTRICTED) {
17469                 /* We can optimize A-Z or a-z, but not if they could match
17470                  * something like the KELVIN SIGN under /i. */
17471                 if (prevvalue == 'A') {
17472                     if (value == 'Z'
17473 #ifdef EBCDIC
17474                         && ! non_portable_endpoint
17475 #endif
17476                     ) {
17477                         arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
17478                         op = POSIXA;
17479                     }
17480                 }
17481                 else if (prevvalue == 'a') {
17482                     if (value == 'z'
17483 #ifdef EBCDIC
17484                         && ! non_portable_endpoint
17485 #endif
17486                     ) {
17487                         arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
17488                         op = POSIXA;
17489                     }
17490                 }
17491             }
17492         }
17493
17494         /* Here, we have changed <op> away from its initial value iff we found
17495          * an optimization */
17496         if (op != END) {
17497
17498             /* Throw away this ANYOF regnode, and emit the calculated one,
17499              * which should correspond to the beginning, not current, state of
17500              * the parse */
17501             const char * cur_parse = RExC_parse;
17502             RExC_parse = (char *)orig_parse;
17503             if ( SIZE_ONLY) {
17504                 if (! LOC) {
17505
17506                     /* To get locale nodes to not use the full ANYOF size would
17507                      * require moving the code above that writes the portions
17508                      * of it that aren't in other nodes to after this point.
17509                      * e.g.  ANYOF_POSIXL_SET */
17510                     RExC_size = orig_size;
17511                 }
17512             }
17513             else {
17514                 RExC_emit = (regnode *)orig_emit;
17515                 if (PL_regkind[op] == POSIXD) {
17516                     if (op == POSIXL) {
17517                         RExC_contains_locale = 1;
17518                     }
17519                     if (invert) {
17520                         op += NPOSIXD - POSIXD;
17521                     }
17522                 }
17523             }
17524
17525             ret = reg_node(pRExC_state, op);
17526
17527             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
17528                 if (! SIZE_ONLY) {
17529                     FLAGS(ret) = arg;
17530                 }
17531                 *flagp |= HASWIDTH|SIMPLE;
17532             }
17533             else if (PL_regkind[op] == EXACT) {
17534                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
17535                                            TRUE /* downgradable to EXACT */
17536                                            );
17537             }
17538             else {
17539                 *flagp |= HASWIDTH|SIMPLE;
17540             }
17541
17542             RExC_parse = (char *) cur_parse;
17543
17544             SvREFCNT_dec(posixes);
17545             SvREFCNT_dec(nposixes);
17546             SvREFCNT_dec(simple_posixes);
17547             SvREFCNT_dec(cp_list);
17548             SvREFCNT_dec(cp_foldable_list);
17549             return ret;
17550         }
17551     }
17552
17553     if (SIZE_ONLY)
17554         return ret;
17555     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
17556
17557     /* If folding, we calculate all characters that could fold to or from the
17558      * ones already on the list */
17559     if (cp_foldable_list) {
17560         if (FOLD) {
17561             UV start, end;      /* End points of code point ranges */
17562
17563             SV* fold_intersection = NULL;
17564             SV** use_list;
17565
17566             /* Our calculated list will be for Unicode rules.  For locale
17567              * matching, we have to keep a separate list that is consulted at
17568              * runtime only when the locale indicates Unicode rules.  For
17569              * non-locale, we just use the general list */
17570             if (LOC) {
17571                 use_list = &only_utf8_locale_list;
17572             }
17573             else {
17574                 use_list = &cp_list;
17575             }
17576
17577             /* Only the characters in this class that participate in folds need
17578              * be checked.  Get the intersection of this class and all the
17579              * possible characters that are foldable.  This can quickly narrow
17580              * down a large class */
17581             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
17582                                   &fold_intersection);
17583
17584             /* The folds for all the Latin1 characters are hard-coded into this
17585              * program, but we have to go out to disk to get the others. */
17586             if (invlist_highest(cp_foldable_list) >= 256) {
17587
17588                 /* This is a hash that for a particular fold gives all
17589                  * characters that are involved in it */
17590                 if (! PL_utf8_foldclosures) {
17591                     _load_PL_utf8_foldclosures();
17592                 }
17593             }
17594
17595             /* Now look at the foldable characters in this class individually */
17596             invlist_iterinit(fold_intersection);
17597             while (invlist_iternext(fold_intersection, &start, &end)) {
17598                 UV j;
17599
17600                 /* Look at every character in the range */
17601                 for (j = start; j <= end; j++) {
17602                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17603                     STRLEN foldlen;
17604                     SV** listp;
17605
17606                     if (j < 256) {
17607
17608                         if (IS_IN_SOME_FOLD_L1(j)) {
17609
17610                             /* ASCII is always matched; non-ASCII is matched
17611                              * only under Unicode rules (which could happen
17612                              * under /l if the locale is a UTF-8 one */
17613                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
17614                                 *use_list = add_cp_to_invlist(*use_list,
17615                                                             PL_fold_latin1[j]);
17616                             }
17617                             else {
17618                                 has_upper_latin1_only_utf8_matches
17619                                     = add_cp_to_invlist(
17620                                             has_upper_latin1_only_utf8_matches,
17621                                             PL_fold_latin1[j]);
17622                             }
17623                         }
17624
17625                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
17626                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
17627                         {
17628                             add_above_Latin1_folds(pRExC_state,
17629                                                    (U8) j,
17630                                                    use_list);
17631                         }
17632                         continue;
17633                     }
17634
17635                     /* Here is an above Latin1 character.  We don't have the
17636                      * rules hard-coded for it.  First, get its fold.  This is
17637                      * the simple fold, as the multi-character folds have been
17638                      * handled earlier and separated out */
17639                     _to_uni_fold_flags(j, foldbuf, &foldlen,
17640                                                         (ASCII_FOLD_RESTRICTED)
17641                                                         ? FOLD_FLAGS_NOMIX_ASCII
17642                                                         : 0);
17643
17644                     /* Single character fold of above Latin1.  Add everything in
17645                     * its fold closure to the list that this node should match.
17646                     * The fold closures data structure is a hash with the keys
17647                     * being the UTF-8 of every character that is folded to, like
17648                     * 'k', and the values each an array of all code points that
17649                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
17650                     * Multi-character folds are not included */
17651                     if ((listp = hv_fetch(PL_utf8_foldclosures,
17652                                         (char *) foldbuf, foldlen, FALSE)))
17653                     {
17654                         AV* list = (AV*) *listp;
17655                         IV k;
17656                         for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
17657                             SV** c_p = av_fetch(list, k, FALSE);
17658                             UV c;
17659                             assert(c_p);
17660
17661                             c = SvUV(*c_p);
17662
17663                             /* /aa doesn't allow folds between ASCII and non- */
17664                             if ((ASCII_FOLD_RESTRICTED
17665                                 && (isASCII(c) != isASCII(j))))
17666                             {
17667                                 continue;
17668                             }
17669
17670                             /* Folds under /l which cross the 255/256 boundary
17671                              * are added to a separate list.  (These are valid
17672                              * only when the locale is UTF-8.) */
17673                             if (c < 256 && LOC) {
17674                                 *use_list = add_cp_to_invlist(*use_list, c);
17675                                 continue;
17676                             }
17677
17678                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
17679                             {
17680                                 cp_list = add_cp_to_invlist(cp_list, c);
17681                             }
17682                             else {
17683                                 /* Similarly folds involving non-ascii Latin1
17684                                 * characters under /d are added to their list */
17685                                 has_upper_latin1_only_utf8_matches
17686                                         = add_cp_to_invlist(
17687                                            has_upper_latin1_only_utf8_matches,
17688                                            c);
17689                             }
17690                         }
17691                     }
17692                 }
17693             }
17694             SvREFCNT_dec_NN(fold_intersection);
17695         }
17696
17697         /* Now that we have finished adding all the folds, there is no reason
17698          * to keep the foldable list separate */
17699         _invlist_union(cp_list, cp_foldable_list, &cp_list);
17700         SvREFCNT_dec_NN(cp_foldable_list);
17701     }
17702
17703     /* And combine the result (if any) with any inversion lists from posix
17704      * classes.  The lists are kept separate up to now because we don't want to
17705      * fold the classes (folding of those is automatically handled by the swash
17706      * fetching code) */
17707     if (simple_posixes) {   /* These are the classes known to be unaffected by
17708                                /a, /aa, and /d */
17709         if (cp_list) {
17710             _invlist_union(cp_list, simple_posixes, &cp_list);
17711             SvREFCNT_dec_NN(simple_posixes);
17712         }
17713         else {
17714             cp_list = simple_posixes;
17715         }
17716     }
17717     if (posixes || nposixes) {
17718
17719         /* We have to adjust /a and /aa */
17720         if (AT_LEAST_ASCII_RESTRICTED) {
17721
17722             /* Under /a and /aa, nothing above ASCII matches these */
17723             if (posixes) {
17724                 _invlist_intersection(posixes,
17725                                     PL_XPosix_ptrs[_CC_ASCII],
17726                                     &posixes);
17727             }
17728
17729             /* Under /a and /aa, everything above ASCII matches these
17730              * complements */
17731             if (nposixes) {
17732                 _invlist_union_complement_2nd(nposixes,
17733                                               PL_XPosix_ptrs[_CC_ASCII],
17734                                               &nposixes);
17735             }
17736         }
17737
17738         if (! DEPENDS_SEMANTICS) {
17739
17740             /* For everything but /d, we can just add the current 'posixes' and
17741              * 'nposixes' to the main list */
17742             if (posixes) {
17743                 if (cp_list) {
17744                     _invlist_union(cp_list, posixes, &cp_list);
17745                     SvREFCNT_dec_NN(posixes);
17746                 }
17747                 else {
17748                     cp_list = posixes;
17749                 }
17750             }
17751             if (nposixes) {
17752                 if (cp_list) {
17753                     _invlist_union(cp_list, nposixes, &cp_list);
17754                     SvREFCNT_dec_NN(nposixes);
17755                 }
17756                 else {
17757                     cp_list = nposixes;
17758                 }
17759             }
17760         }
17761         else {
17762             /* Under /d, things like \w match upper Latin1 characters only if
17763              * the target string is in UTF-8.  But things like \W match all the
17764              * upper Latin1 characters if the target string is not in UTF-8.
17765              *
17766              * Handle the case where there something like \W separately */
17767             if (nposixes) {
17768                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
17769
17770                 /* A complemented posix class matches all upper Latin1
17771                  * characters if not in UTF-8.  And it matches just certain
17772                  * ones when in UTF-8.  That means those certain ones are
17773                  * matched regardless, so can just be added to the
17774                  * unconditional list */
17775                 if (cp_list) {
17776                     _invlist_union(cp_list, nposixes, &cp_list);
17777                     SvREFCNT_dec_NN(nposixes);
17778                     nposixes = NULL;
17779                 }
17780                 else {
17781                     cp_list = nposixes;
17782                 }
17783
17784                 /* Likewise for 'posixes' */
17785                 _invlist_union(posixes, cp_list, &cp_list);
17786
17787                 /* Likewise for anything else in the range that matched only
17788                  * under UTF-8 */
17789                 if (has_upper_latin1_only_utf8_matches) {
17790                     _invlist_union(cp_list,
17791                                    has_upper_latin1_only_utf8_matches,
17792                                    &cp_list);
17793                     SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17794                     has_upper_latin1_only_utf8_matches = NULL;
17795                 }
17796
17797                 /* If we don't match all the upper Latin1 characters regardless
17798                  * of UTF-8ness, we have to set a flag to match the rest when
17799                  * not in UTF-8 */
17800                 _invlist_subtract(only_non_utf8_list, cp_list,
17801                                   &only_non_utf8_list);
17802                 if (_invlist_len(only_non_utf8_list) != 0) {
17803                     ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17804                 }
17805                 SvREFCNT_dec_NN(only_non_utf8_list);
17806             }
17807             else {
17808                 /* Here there were no complemented posix classes.  That means
17809                  * the upper Latin1 characters in 'posixes' match only when the
17810                  * target string is in UTF-8.  So we have to add them to the
17811                  * list of those types of code points, while adding the
17812                  * remainder to the unconditional list.
17813                  *
17814                  * First calculate what they are */
17815                 SV* nonascii_but_latin1_properties = NULL;
17816                 _invlist_intersection(posixes, PL_UpperLatin1,
17817                                       &nonascii_but_latin1_properties);
17818
17819                 /* And add them to the final list of such characters. */
17820                 _invlist_union(has_upper_latin1_only_utf8_matches,
17821                                nonascii_but_latin1_properties,
17822                                &has_upper_latin1_only_utf8_matches);
17823
17824                 /* Remove them from what now becomes the unconditional list */
17825                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
17826                                   &posixes);
17827
17828                 /* And add those unconditional ones to the final list */
17829                 if (cp_list) {
17830                     _invlist_union(cp_list, posixes, &cp_list);
17831                     SvREFCNT_dec_NN(posixes);
17832                     posixes = NULL;
17833                 }
17834                 else {
17835                     cp_list = posixes;
17836                 }
17837
17838                 SvREFCNT_dec(nonascii_but_latin1_properties);
17839
17840                 /* Get rid of any characters that we now know are matched
17841                  * unconditionally from the conditional list, which may make
17842                  * that list empty */
17843                 _invlist_subtract(has_upper_latin1_only_utf8_matches,
17844                                   cp_list,
17845                                   &has_upper_latin1_only_utf8_matches);
17846                 if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
17847                     SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17848                     has_upper_latin1_only_utf8_matches = NULL;
17849                 }
17850             }
17851         }
17852     }
17853
17854     /* And combine the result (if any) with any inversion list from properties.
17855      * The lists are kept separate up to now so that we can distinguish the two
17856      * in regards to matching above-Unicode.  A run-time warning is generated
17857      * if a Unicode property is matched against a non-Unicode code point. But,
17858      * we allow user-defined properties to match anything, without any warning,
17859      * and we also suppress the warning if there is a portion of the character
17860      * class that isn't a Unicode property, and which matches above Unicode, \W
17861      * or [\x{110000}] for example.
17862      * (Note that in this case, unlike the Posix one above, there is no
17863      * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
17864      * forces Unicode semantics */
17865     if (properties) {
17866         if (cp_list) {
17867
17868             /* If it matters to the final outcome, see if a non-property
17869              * component of the class matches above Unicode.  If so, the
17870              * warning gets suppressed.  This is true even if just a single
17871              * such code point is specified, as, though not strictly correct if
17872              * another such code point is matched against, the fact that they
17873              * are using above-Unicode code points indicates they should know
17874              * the issues involved */
17875             if (warn_super) {
17876                 warn_super = ! (invert
17877                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
17878             }
17879
17880             _invlist_union(properties, cp_list, &cp_list);
17881             SvREFCNT_dec_NN(properties);
17882         }
17883         else {
17884             cp_list = properties;
17885         }
17886
17887         if (warn_super) {
17888             ANYOF_FLAGS(ret)
17889              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17890
17891             /* Because an ANYOF node is the only one that warns, this node
17892              * can't be optimized into something else */
17893             optimizable = FALSE;
17894         }
17895     }
17896
17897     /* Here, we have calculated what code points should be in the character
17898      * class.
17899      *
17900      * Now we can see about various optimizations.  Fold calculation (which we
17901      * did above) needs to take place before inversion.  Otherwise /[^k]/i
17902      * would invert to include K, which under /i would match k, which it
17903      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
17904      * folded until runtime */
17905
17906     /* If we didn't do folding, it's because some information isn't available
17907      * until runtime; set the run-time fold flag for these.  (We don't have to
17908      * worry about properties folding, as that is taken care of by the swash
17909      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
17910      * locales, or the class matches at least one 0-255 range code point */
17911     if (LOC && FOLD) {
17912
17913         /* Some things on the list might be unconditionally included because of
17914          * other components.  Remove them, and clean up the list if it goes to
17915          * 0 elements */
17916         if (only_utf8_locale_list && cp_list) {
17917             _invlist_subtract(only_utf8_locale_list, cp_list,
17918                               &only_utf8_locale_list);
17919
17920             if (_invlist_len(only_utf8_locale_list) == 0) {
17921                 SvREFCNT_dec_NN(only_utf8_locale_list);
17922                 only_utf8_locale_list = NULL;
17923             }
17924         }
17925         if (only_utf8_locale_list) {
17926             ANYOF_FLAGS(ret)
17927                  |=  ANYOFL_FOLD
17928                     |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
17929         }
17930         else if (cp_list) { /* Look to see if a 0-255 code point is in list */
17931             UV start, end;
17932             invlist_iterinit(cp_list);
17933             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
17934                 ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
17935             }
17936             invlist_iterfinish(cp_list);
17937         }
17938     }
17939     else if (   DEPENDS_SEMANTICS
17940              && (    has_upper_latin1_only_utf8_matches
17941                  || (ANYOF_FLAGS(ret) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
17942     {
17943         OP(ret) = ANYOFD;
17944         optimizable = FALSE;
17945     }
17946
17947
17948     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
17949      * at compile time.  Besides not inverting folded locale now, we can't
17950      * invert if there are things such as \w, which aren't known until runtime
17951      * */
17952     if (cp_list
17953         && invert
17954         && OP(ret) != ANYOFD
17955         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
17956         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
17957     {
17958         _invlist_invert(cp_list);
17959
17960         /* Any swash can't be used as-is, because we've inverted things */
17961         if (swash) {
17962             SvREFCNT_dec_NN(swash);
17963             swash = NULL;
17964         }
17965
17966         /* Clear the invert flag since have just done it here */
17967         invert = FALSE;
17968     }
17969
17970     if (ret_invlist) {
17971         assert(cp_list);
17972
17973         *ret_invlist = cp_list;
17974         SvREFCNT_dec(swash);
17975
17976         /* Discard the generated node */
17977         if (SIZE_ONLY) {
17978             RExC_size = orig_size;
17979         }
17980         else {
17981             RExC_emit = orig_emit;
17982         }
17983         return orig_emit;
17984     }
17985
17986     /* Some character classes are equivalent to other nodes.  Such nodes take
17987      * up less room and generally fewer operations to execute than ANYOF nodes.
17988      * Above, we checked for and optimized into some such equivalents for
17989      * certain common classes that are easy to test.  Getting to this point in
17990      * the code means that the class didn't get optimized there.  Since this
17991      * code is only executed in Pass 2, it is too late to save space--it has
17992      * been allocated in Pass 1, and currently isn't given back.  But turning
17993      * things into an EXACTish node can allow the optimizer to join it to any
17994      * adjacent such nodes.  And if the class is equivalent to things like /./,
17995      * expensive run-time swashes can be avoided.  Now that we have more
17996      * complete information, we can find things necessarily missed by the
17997      * earlier code.  Another possible "optimization" that isn't done is that
17998      * something like [Ee] could be changed into an EXACTFU.  khw tried this
17999      * and found that the ANYOF is faster, including for code points not in the
18000      * bitmap.  This still might make sense to do, provided it got joined with
18001      * an adjacent node(s) to create a longer EXACTFU one.  This could be
18002      * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
18003      * routine would know is joinable.  If that didn't happen, the node type
18004      * could then be made a straight ANYOF */
18005
18006     if (optimizable && cp_list && ! invert) {
18007         UV start, end;
18008         U8 op = END;  /* The optimzation node-type */
18009         int posix_class = -1;   /* Illegal value */
18010         const char * cur_parse= RExC_parse;
18011
18012         invlist_iterinit(cp_list);
18013         if (! invlist_iternext(cp_list, &start, &end)) {
18014
18015             /* Here, the list is empty.  This happens, for example, when a
18016              * Unicode property that doesn't match anything is the only element
18017              * in the character class (perluniprops.pod notes such properties).
18018              * */
18019             op = OPFAIL;
18020             *flagp |= HASWIDTH|SIMPLE;
18021         }
18022         else if (start == end) {    /* The range is a single code point */
18023             if (! invlist_iternext(cp_list, &start, &end)
18024
18025                     /* Don't do this optimization if it would require changing
18026                      * the pattern to UTF-8 */
18027                 && (start < 256 || UTF))
18028             {
18029                 /* Here, the list contains a single code point.  Can optimize
18030                  * into an EXACTish node */
18031
18032                 value = start;
18033
18034                 if (! FOLD) {
18035                     op = (LOC)
18036                          ? EXACTL
18037                          : EXACT;
18038                 }
18039                 else if (LOC) {
18040
18041                     /* A locale node under folding with one code point can be
18042                      * an EXACTFL, as its fold won't be calculated until
18043                      * runtime */
18044                     op = EXACTFL;
18045                 }
18046                 else {
18047
18048                     /* Here, we are generally folding, but there is only one
18049                      * code point to match.  If we have to, we use an EXACT
18050                      * node, but it would be better for joining with adjacent
18051                      * nodes in the optimization pass if we used the same
18052                      * EXACTFish node that any such are likely to be.  We can
18053                      * do this iff the code point doesn't participate in any
18054                      * folds.  For example, an EXACTF of a colon is the same as
18055                      * an EXACT one, since nothing folds to or from a colon. */
18056                     if (value < 256) {
18057                         if (IS_IN_SOME_FOLD_L1(value)) {
18058                             op = EXACT;
18059                         }
18060                     }
18061                     else {
18062                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
18063                             op = EXACT;
18064                         }
18065                     }
18066
18067                     /* If we haven't found the node type, above, it means we
18068                      * can use the prevailing one */
18069                     if (op == END) {
18070                         op = compute_EXACTish(pRExC_state);
18071                     }
18072                 }
18073             }
18074         }   /* End of first range contains just a single code point */
18075         else if (start == 0) {
18076             if (end == UV_MAX) {
18077                 op = SANY;
18078                 *flagp |= HASWIDTH|SIMPLE;
18079                 MARK_NAUGHTY(1);
18080             }
18081             else if (end == '\n' - 1
18082                     && invlist_iternext(cp_list, &start, &end)
18083                     && start == '\n' + 1 && end == UV_MAX)
18084             {
18085                 op = REG_ANY;
18086                 *flagp |= HASWIDTH|SIMPLE;
18087                 MARK_NAUGHTY(1);
18088             }
18089         }
18090         invlist_iterfinish(cp_list);
18091
18092         if (op == END) {
18093             const UV cp_list_len = _invlist_len(cp_list);
18094             const UV* cp_list_array = invlist_array(cp_list);
18095
18096             /* Here, didn't find an optimization.  See if this matches any of
18097              * the POSIX classes.  First try ASCII */
18098
18099             if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) {
18100                 op = ASCII;
18101                 *flagp |= HASWIDTH|SIMPLE;
18102             }
18103             else if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) {
18104                 op = NASCII;
18105                 *flagp |= HASWIDTH|SIMPLE;
18106             }
18107             else if (cp_list_array[cp_list_len-1] >= 0x2029) {
18108
18109                 /* Then try the other POSIX classes.  The POSIXA ones are about
18110                  * the same speed as ANYOF ops, but the ones that have
18111                  * above-Latin1 code point matches are somewhat faster than
18112                  * ANYOF.  So optimize those, but don't bother with the POSIXA
18113                  * ones nor [:cntrl:] which has no above-Latin1 matches.  If
18114                  * this ANYOF node has a lower highest possible matching code
18115                  * point than any of the XPosix ones, we know that it can't
18116                  * possibly be the same as any of them, so we can avoid
18117                  * executing this code.  The 0x2029 above for the lowest max
18118                  * was determined by manual inspection of the classes, and
18119                  * comes from \v.  Suppose Unicode in a later version adds a
18120                  * higher code point to \v.  All that means is that this code
18121                  * can be executed unnecessarily.  It will still give the
18122                  * correct answer. */
18123
18124                 for (posix_class = 0;
18125                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
18126                      posix_class++)
18127                 {
18128                     int try_inverted;
18129
18130                     if (posix_class == _CC_CNTRL) {
18131                         continue;
18132                     }
18133
18134                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
18135
18136                         /* Check if matches normal or inverted */
18137                         if (_invlistEQ(cp_list,
18138                                        PL_XPosix_ptrs[posix_class],
18139                                        try_inverted))
18140                         {
18141                             op = (try_inverted)
18142                                  ? NPOSIXU
18143                                  : POSIXU;
18144                             *flagp |= HASWIDTH|SIMPLE;
18145                             goto found_posix;
18146                         }
18147                     }
18148                 }
18149               found_posix: ;
18150             }
18151         }
18152
18153         if (op != END) {
18154             RExC_parse = (char *)orig_parse;
18155             RExC_emit = (regnode *)orig_emit;
18156
18157             if (regarglen[op]) {
18158                 ret = reganode(pRExC_state, op, 0);
18159             } else {
18160                 ret = reg_node(pRExC_state, op);
18161             }
18162
18163             RExC_parse = (char *)cur_parse;
18164
18165             if (PL_regkind[op] == EXACT) {
18166                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
18167                                            TRUE /* downgradable to EXACT */
18168                                           );
18169             }
18170             else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
18171                 FLAGS(ret) = posix_class;
18172             }
18173
18174             SvREFCNT_dec_NN(cp_list);
18175             return ret;
18176         }
18177     }
18178
18179     /* Here, <cp_list> contains all the code points we can determine at
18180      * compile time that match under all conditions.  Go through it, and
18181      * for things that belong in the bitmap, put them there, and delete from
18182      * <cp_list>.  While we are at it, see if everything above 255 is in the
18183      * list, and if so, set a flag to speed up execution */
18184
18185     populate_ANYOF_from_invlist(ret, &cp_list);
18186
18187     if (invert) {
18188         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
18189     }
18190
18191     /* Here, the bitmap has been populated with all the Latin1 code points that
18192      * always match.  Can now add to the overall list those that match only
18193      * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
18194      * */
18195     if (has_upper_latin1_only_utf8_matches) {
18196         if (cp_list) {
18197             _invlist_union(cp_list,
18198                            has_upper_latin1_only_utf8_matches,
18199                            &cp_list);
18200             SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
18201         }
18202         else {
18203             cp_list = has_upper_latin1_only_utf8_matches;
18204         }
18205         ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
18206     }
18207
18208     /* If there is a swash and more than one element, we can't use the swash in
18209      * the optimization below. */
18210     if (swash && element_count > 1) {
18211         SvREFCNT_dec_NN(swash);
18212         swash = NULL;
18213     }
18214
18215     /* Note that the optimization of using 'swash' if it is the only thing in
18216      * the class doesn't have us change swash at all, so it can include things
18217      * that are also in the bitmap; otherwise we have purposely deleted that
18218      * duplicate information */
18219     set_ANYOF_arg(pRExC_state, ret, cp_list,
18220                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
18221                    ? listsv : NULL,
18222                   only_utf8_locale_list,
18223                   swash, has_user_defined_property);
18224
18225     *flagp |= HASWIDTH|SIMPLE;
18226
18227     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
18228         RExC_contains_locale = 1;
18229     }
18230
18231     return ret;
18232 }
18233
18234 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
18235
18236 STATIC void
18237 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
18238                 regnode* const node,
18239                 SV* const cp_list,
18240                 SV* const runtime_defns,
18241                 SV* const only_utf8_locale_list,
18242                 SV* const swash,
18243                 const bool has_user_defined_property)
18244 {
18245     /* Sets the arg field of an ANYOF-type node 'node', using information about
18246      * the node passed-in.  If there is nothing outside the node's bitmap, the
18247      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
18248      * the count returned by add_data(), having allocated and stored an array,
18249      * av, that that count references, as follows:
18250      *  av[0] stores the character class description in its textual form.
18251      *        This is used later (regexec.c:Perl_regclass_swash()) to
18252      *        initialize the appropriate swash, and is also useful for dumping
18253      *        the regnode.  This is set to &PL_sv_undef if the textual
18254      *        description is not needed at run-time (as happens if the other
18255      *        elements completely define the class)
18256      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
18257      *        computed from av[0].  But if no further computation need be done,
18258      *        the swash is stored here now (and av[0] is &PL_sv_undef).
18259      *  av[2] stores the inversion list of code points that match only if the
18260      *        current locale is UTF-8
18261      *  av[3] stores the cp_list inversion list for use in addition or instead
18262      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
18263      *        (Otherwise everything needed is already in av[0] and av[1])
18264      *  av[4] is set if any component of the class is from a user-defined
18265      *        property; used only if av[3] exists */
18266
18267     UV n;
18268
18269     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
18270
18271     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
18272         assert(! (ANYOF_FLAGS(node)
18273                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
18274         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
18275     }
18276     else {
18277         AV * const av = newAV();
18278         SV *rv;
18279
18280         av_store(av, 0, (runtime_defns)
18281                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
18282         if (swash) {
18283             assert(cp_list);
18284             av_store(av, 1, swash);
18285             SvREFCNT_dec_NN(cp_list);
18286         }
18287         else {
18288             av_store(av, 1, &PL_sv_undef);
18289             if (cp_list) {
18290                 av_store(av, 3, cp_list);
18291                 av_store(av, 4, newSVuv(has_user_defined_property));
18292             }
18293         }
18294
18295         if (only_utf8_locale_list) {
18296             av_store(av, 2, only_utf8_locale_list);
18297         }
18298         else {
18299             av_store(av, 2, &PL_sv_undef);
18300         }
18301
18302         rv = newRV_noinc(MUTABLE_SV(av));
18303         n = add_data(pRExC_state, STR_WITH_LEN("s"));
18304         RExC_rxi->data->data[n] = (void*)rv;
18305         ARG_SET(node, n);
18306     }
18307 }
18308
18309 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
18310 SV *
18311 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
18312                                         const regnode* node,
18313                                         bool doinit,
18314                                         SV** listsvp,
18315                                         SV** only_utf8_locale_ptr,
18316                                         SV** output_invlist)
18317
18318 {
18319     /* For internal core use only.
18320      * Returns the swash for the input 'node' in the regex 'prog'.
18321      * If <doinit> is 'true', will attempt to create the swash if not already
18322      *    done.
18323      * If <listsvp> is non-null, will return the printable contents of the
18324      *    swash.  This can be used to get debugging information even before the
18325      *    swash exists, by calling this function with 'doinit' set to false, in
18326      *    which case the components that will be used to eventually create the
18327      *    swash are returned  (in a printable form).
18328      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
18329      *    store an inversion list of code points that should match only if the
18330      *    execution-time locale is a UTF-8 one.
18331      * If <output_invlist> is not NULL, it is where this routine is to store an
18332      *    inversion list of the code points that would be instead returned in
18333      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
18334      *    when this parameter is used, is just the non-code point data that
18335      *    will go into creating the swash.  This currently should be just
18336      *    user-defined properties whose definitions were not known at compile
18337      *    time.  Using this parameter allows for easier manipulation of the
18338      *    swash's data by the caller.  It is illegal to call this function with
18339      *    this parameter set, but not <listsvp>
18340      *
18341      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
18342      * that, in spite of this function's name, the swash it returns may include
18343      * the bitmap data as well */
18344
18345     SV *sw  = NULL;
18346     SV *si  = NULL;         /* Input swash initialization string */
18347     SV* invlist = NULL;
18348
18349     RXi_GET_DECL(prog,progi);
18350     const struct reg_data * const data = prog ? progi->data : NULL;
18351
18352     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
18353     assert(! output_invlist || listsvp);
18354
18355     if (data && data->count) {
18356         const U32 n = ARG(node);
18357
18358         if (data->what[n] == 's') {
18359             SV * const rv = MUTABLE_SV(data->data[n]);
18360             AV * const av = MUTABLE_AV(SvRV(rv));
18361             SV **const ary = AvARRAY(av);
18362             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
18363
18364             si = *ary;  /* ary[0] = the string to initialize the swash with */
18365
18366             if (av_tindex_skip_len_mg(av) >= 2) {
18367                 if (only_utf8_locale_ptr
18368                     && ary[2]
18369                     && ary[2] != &PL_sv_undef)
18370                 {
18371                     *only_utf8_locale_ptr = ary[2];
18372                 }
18373                 else {
18374                     assert(only_utf8_locale_ptr);
18375                     *only_utf8_locale_ptr = NULL;
18376                 }
18377
18378                 /* Elements 3 and 4 are either both present or both absent. [3]
18379                  * is any inversion list generated at compile time; [4]
18380                  * indicates if that inversion list has any user-defined
18381                  * properties in it. */
18382                 if (av_tindex_skip_len_mg(av) >= 3) {
18383                     invlist = ary[3];
18384                     if (SvUV(ary[4])) {
18385                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
18386                     }
18387                 }
18388                 else {
18389                     invlist = NULL;
18390                 }
18391             }
18392
18393             /* Element [1] is reserved for the set-up swash.  If already there,
18394              * return it; if not, create it and store it there */
18395             if (ary[1] && SvROK(ary[1])) {
18396                 sw = ary[1];
18397             }
18398             else if (doinit && ((si && si != &PL_sv_undef)
18399                                  || (invlist && invlist != &PL_sv_undef))) {
18400                 assert(si);
18401                 sw = _core_swash_init("utf8", /* the utf8 package */
18402                                       "", /* nameless */
18403                                       si,
18404                                       1, /* binary */
18405                                       0, /* not from tr/// */
18406                                       invlist,
18407                                       &swash_init_flags);
18408                 (void)av_store(av, 1, sw);
18409             }
18410         }
18411     }
18412
18413     /* If requested, return a printable version of what this swash matches */
18414     if (listsvp) {
18415         SV* matches_string = NULL;
18416
18417         /* The swash should be used, if possible, to get the data, as it
18418          * contains the resolved data.  But this function can be called at
18419          * compile-time, before everything gets resolved, in which case we
18420          * return the currently best available information, which is the string
18421          * that will eventually be used to do that resolving, 'si' */
18422         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
18423             && (si && si != &PL_sv_undef))
18424         {
18425             /* Here, we only have 'si' (and possibly some passed-in data in
18426              * 'invlist', which is handled below)  If the caller only wants
18427              * 'si', use that.  */
18428             if (! output_invlist) {
18429                 matches_string = newSVsv(si);
18430             }
18431             else {
18432                 /* But if the caller wants an inversion list of the node, we
18433                  * need to parse 'si' and place as much as possible in the
18434                  * desired output inversion list, making 'matches_string' only
18435                  * contain the currently unresolvable things */
18436                 const char *si_string = SvPVX(si);
18437                 STRLEN remaining = SvCUR(si);
18438                 UV prev_cp = 0;
18439                 U8 count = 0;
18440
18441                 /* Ignore everything before the first new-line */
18442                 while (*si_string != '\n' && remaining > 0) {
18443                     si_string++;
18444                     remaining--;
18445                 }
18446                 assert(remaining > 0);
18447
18448                 si_string++;
18449                 remaining--;
18450
18451                 while (remaining > 0) {
18452
18453                     /* The data consists of just strings defining user-defined
18454                      * property names, but in prior incarnations, and perhaps
18455                      * somehow from pluggable regex engines, it could still
18456                      * hold hex code point definitions.  Each component of a
18457                      * range would be separated by a tab, and each range by a
18458                      * new-line.  If these are found, instead add them to the
18459                      * inversion list */
18460                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
18461                                      |PERL_SCAN_SILENT_NON_PORTABLE;
18462                     STRLEN len = remaining;
18463                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
18464
18465                     /* If the hex decode routine found something, it should go
18466                      * up to the next \n */
18467                     if (   *(si_string + len) == '\n') {
18468                         if (count) {    /* 2nd code point on line */
18469                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
18470                         }
18471                         else {
18472                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
18473                         }
18474                         count = 0;
18475                         goto prepare_for_next_iteration;
18476                     }
18477
18478                     /* If the hex decode was instead for the lower range limit,
18479                      * save it, and go parse the upper range limit */
18480                     if (*(si_string + len) == '\t') {
18481                         assert(count == 0);
18482
18483                         prev_cp = cp;
18484                         count = 1;
18485                       prepare_for_next_iteration:
18486                         si_string += len + 1;
18487                         remaining -= len + 1;
18488                         continue;
18489                     }
18490
18491                     /* Here, didn't find a legal hex number.  Just add it from
18492                      * here to the next \n */
18493
18494                     remaining -= len;
18495                     while (*(si_string + len) != '\n' && remaining > 0) {
18496                         remaining--;
18497                         len++;
18498                     }
18499                     if (*(si_string + len) == '\n') {
18500                         len++;
18501                         remaining--;
18502                     }
18503                     if (matches_string) {
18504                         sv_catpvn(matches_string, si_string, len - 1);
18505                     }
18506                     else {
18507                         matches_string = newSVpvn(si_string, len - 1);
18508                     }
18509                     si_string += len;
18510                     sv_catpvs(matches_string, " ");
18511                 } /* end of loop through the text */
18512
18513                 assert(matches_string);
18514                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
18515                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
18516                 }
18517             } /* end of has an 'si' but no swash */
18518         }
18519
18520         /* If we have a swash in place, its equivalent inversion list was above
18521          * placed into 'invlist'.  If not, this variable may contain a stored
18522          * inversion list which is information beyond what is in 'si' */
18523         if (invlist) {
18524
18525             /* Again, if the caller doesn't want the output inversion list, put
18526              * everything in 'matches-string' */
18527             if (! output_invlist) {
18528                 if ( ! matches_string) {
18529                     matches_string = newSVpvs("\n");
18530                 }
18531                 sv_catsv(matches_string, invlist_contents(invlist,
18532                                                   TRUE /* traditional style */
18533                                                   ));
18534             }
18535             else if (! *output_invlist) {
18536                 *output_invlist = invlist_clone(invlist);
18537             }
18538             else {
18539                 _invlist_union(*output_invlist, invlist, output_invlist);
18540             }
18541         }
18542
18543         *listsvp = matches_string;
18544     }
18545
18546     return sw;
18547 }
18548 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
18549
18550 /* reg_skipcomment()
18551
18552    Absorbs an /x style # comment from the input stream,
18553    returning a pointer to the first character beyond the comment, or if the
18554    comment terminates the pattern without anything following it, this returns
18555    one past the final character of the pattern (in other words, RExC_end) and
18556    sets the REG_RUN_ON_COMMENT_SEEN flag.
18557
18558    Note it's the callers responsibility to ensure that we are
18559    actually in /x mode
18560
18561 */
18562
18563 PERL_STATIC_INLINE char*
18564 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
18565 {
18566     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
18567
18568     assert(*p == '#');
18569
18570     while (p < RExC_end) {
18571         if (*(++p) == '\n') {
18572             return p+1;
18573         }
18574     }
18575
18576     /* we ran off the end of the pattern without ending the comment, so we have
18577      * to add an \n when wrapping */
18578     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
18579     return p;
18580 }
18581
18582 STATIC void
18583 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
18584                                 char ** p,
18585                                 const bool force_to_xmod
18586                          )
18587 {
18588     /* If the text at the current parse position '*p' is a '(?#...)' comment,
18589      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
18590      * is /x whitespace, advance '*p' so that on exit it points to the first
18591      * byte past all such white space and comments */
18592
18593     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
18594
18595     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
18596
18597     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
18598
18599     for (;;) {
18600         if (RExC_end - (*p) >= 3
18601             && *(*p)     == '('
18602             && *(*p + 1) == '?'
18603             && *(*p + 2) == '#')
18604         {
18605             while (*(*p) != ')') {
18606                 if ((*p) == RExC_end)
18607                     FAIL("Sequence (?#... not terminated");
18608                 (*p)++;
18609             }
18610             (*p)++;
18611             continue;
18612         }
18613
18614         if (use_xmod) {
18615             const char * save_p = *p;
18616             while ((*p) < RExC_end) {
18617                 STRLEN len;
18618                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
18619                     (*p) += len;
18620                 }
18621                 else if (*(*p) == '#') {
18622                     (*p) = reg_skipcomment(pRExC_state, (*p));
18623                 }
18624                 else {
18625                     break;
18626                 }
18627             }
18628             if (*p != save_p) {
18629                 continue;
18630             }
18631         }
18632
18633         break;
18634     }
18635
18636     return;
18637 }
18638
18639 /* nextchar()
18640
18641    Advances the parse position by one byte, unless that byte is the beginning
18642    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
18643    those two cases, the parse position is advanced beyond all such comments and
18644    white space.
18645
18646    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
18647 */
18648
18649 STATIC void
18650 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
18651 {
18652     PERL_ARGS_ASSERT_NEXTCHAR;
18653
18654     if (RExC_parse < RExC_end) {
18655         assert(   ! UTF
18656                || UTF8_IS_INVARIANT(*RExC_parse)
18657                || UTF8_IS_START(*RExC_parse));
18658
18659         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
18660
18661         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
18662                                 FALSE /* Don't force /x */ );
18663     }
18664 }
18665
18666 STATIC regnode *
18667 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
18668 {
18669     /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
18670      * space.  In pass1, it aligns and increments RExC_size; in pass2,
18671      * RExC_emit */
18672
18673     regnode * const ret = RExC_emit;
18674     GET_RE_DEBUG_FLAGS_DECL;
18675
18676     PERL_ARGS_ASSERT_REGNODE_GUTS;
18677
18678     assert(extra_size >= regarglen[op]);
18679
18680     if (SIZE_ONLY) {
18681         SIZE_ALIGN(RExC_size);
18682         RExC_size += 1 + extra_size;
18683         return(ret);
18684     }
18685     if (RExC_emit >= RExC_emit_bound)
18686         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
18687                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
18688
18689     NODE_ALIGN_FILL(ret);
18690 #ifndef RE_TRACK_PATTERN_OFFSETS
18691     PERL_UNUSED_ARG(name);
18692 #else
18693     if (RExC_offsets) {         /* MJD */
18694         MJD_OFFSET_DEBUG(
18695               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
18696               name, __LINE__,
18697               PL_reg_name[op],
18698               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
18699                 ? "Overwriting end of array!\n" : "OK",
18700               (UV)(RExC_emit - RExC_emit_start),
18701               (UV)(RExC_parse - RExC_start),
18702               (UV)RExC_offsets[0]));
18703         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
18704     }
18705 #endif
18706     return(ret);
18707 }
18708
18709 /*
18710 - reg_node - emit a node
18711 */
18712 STATIC regnode *                        /* Location. */
18713 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
18714 {
18715     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
18716
18717     PERL_ARGS_ASSERT_REG_NODE;
18718
18719     assert(regarglen[op] == 0);
18720
18721     if (PASS2) {
18722         regnode *ptr = ret;
18723         FILL_ADVANCE_NODE(ptr, op);
18724         RExC_emit = ptr;
18725     }
18726     return(ret);
18727 }
18728
18729 /*
18730 - reganode - emit a node with an argument
18731 */
18732 STATIC regnode *                        /* Location. */
18733 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
18734 {
18735     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
18736
18737     PERL_ARGS_ASSERT_REGANODE;
18738
18739     assert(regarglen[op] == 1);
18740
18741     if (PASS2) {
18742         regnode *ptr = ret;
18743         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
18744         RExC_emit = ptr;
18745     }
18746     return(ret);
18747 }
18748
18749 STATIC regnode *
18750 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
18751 {
18752     /* emit a node with U32 and I32 arguments */
18753
18754     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
18755
18756     PERL_ARGS_ASSERT_REG2LANODE;
18757
18758     assert(regarglen[op] == 2);
18759
18760     if (PASS2) {
18761         regnode *ptr = ret;
18762         FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
18763         RExC_emit = ptr;
18764     }
18765     return(ret);
18766 }
18767
18768 /*
18769 - reginsert - insert an operator in front of already-emitted operand
18770 *
18771 * Means relocating the operand.
18772 *
18773 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
18774 * set up NEXT_OFF() of the inserted node if needed. Something like this:
18775 *
18776 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
18777 * if (PASS2)
18778 *     NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
18779 *
18780 * ALSO NOTE - operand->flags will be set to 0 as well.
18781 */
18782 STATIC void
18783 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *operand, U32 depth)
18784 {
18785     regnode *src;
18786     regnode *dst;
18787     regnode *place;
18788     const int offset = regarglen[(U8)op];
18789     const int size = NODE_STEP_REGNODE + offset;
18790     GET_RE_DEBUG_FLAGS_DECL;
18791
18792     PERL_ARGS_ASSERT_REGINSERT;
18793     PERL_UNUSED_CONTEXT;
18794     PERL_UNUSED_ARG(depth);
18795 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
18796     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
18797     if (SIZE_ONLY) {
18798         RExC_size += size;
18799         return;
18800     }
18801     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
18802                                     studying. If this is wrong then we need to adjust RExC_recurse
18803                                     below like we do with RExC_open_parens/RExC_close_parens. */
18804     src = RExC_emit;
18805     RExC_emit += size;
18806     dst = RExC_emit;
18807     if (RExC_open_parens) {
18808         int paren;
18809         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
18810         /* remember that RExC_npar is rex->nparens + 1,
18811          * iow it is 1 more than the number of parens seen in
18812          * the pattern so far. */
18813         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
18814             /* note, RExC_open_parens[0] is the start of the
18815              * regex, it can't move. RExC_close_parens[0] is the end
18816              * of the regex, it *can* move. */
18817             if ( paren && RExC_open_parens[paren] >= operand ) {
18818                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
18819                 RExC_open_parens[paren] += size;
18820             } else {
18821                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
18822             }
18823             if ( RExC_close_parens[paren] >= operand ) {
18824                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
18825                 RExC_close_parens[paren] += size;
18826             } else {
18827                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
18828             }
18829         }
18830     }
18831     if (RExC_end_op)
18832         RExC_end_op += size;
18833
18834     while (src > operand) {
18835         StructCopy(--src, --dst, regnode);
18836 #ifdef RE_TRACK_PATTERN_OFFSETS
18837         if (RExC_offsets) {     /* MJD 20010112 */
18838             MJD_OFFSET_DEBUG(
18839                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
18840                   "reg_insert",
18841                   __LINE__,
18842                   PL_reg_name[op],
18843                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
18844                     ? "Overwriting end of array!\n" : "OK",
18845                   (UV)(src - RExC_emit_start),
18846                   (UV)(dst - RExC_emit_start),
18847                   (UV)RExC_offsets[0]));
18848             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
18849             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
18850         }
18851 #endif
18852     }
18853
18854     place = operand;            /* Op node, where operand used to be. */
18855 #ifdef RE_TRACK_PATTERN_OFFSETS
18856     if (RExC_offsets) {         /* MJD */
18857         MJD_OFFSET_DEBUG(
18858               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
18859               "reginsert",
18860               __LINE__,
18861               PL_reg_name[op],
18862               (UV)(place - RExC_emit_start) > RExC_offsets[0]
18863               ? "Overwriting end of array!\n" : "OK",
18864               (UV)(place - RExC_emit_start),
18865               (UV)(RExC_parse - RExC_start),
18866               (UV)RExC_offsets[0]));
18867         Set_Node_Offset(place, RExC_parse);
18868         Set_Node_Length(place, 1);
18869     }
18870 #endif
18871     src = NEXTOPER(place);
18872     place->flags = 0;
18873     FILL_ADVANCE_NODE(place, op);
18874     Zero(src, offset, regnode);
18875 }
18876
18877 /*
18878 - regtail - set the next-pointer at the end of a node chain of p to val.
18879 - SEE ALSO: regtail_study
18880 */
18881 STATIC void
18882 S_regtail(pTHX_ RExC_state_t * pRExC_state,
18883                 const regnode * const p,
18884                 const regnode * const val,
18885                 const U32 depth)
18886 {
18887     regnode *scan;
18888     GET_RE_DEBUG_FLAGS_DECL;
18889
18890     PERL_ARGS_ASSERT_REGTAIL;
18891 #ifndef DEBUGGING
18892     PERL_UNUSED_ARG(depth);
18893 #endif
18894
18895     if (SIZE_ONLY)
18896         return;
18897
18898     /* Find last node. */
18899     scan = (regnode *) p;
18900     for (;;) {
18901         regnode * const temp = regnext(scan);
18902         DEBUG_PARSE_r({
18903             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
18904             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18905             Perl_re_printf( aTHX_  "~ %s (%d) %s %s\n",
18906                 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
18907                     (temp == NULL ? "->" : ""),
18908                     (temp == NULL ? PL_reg_name[OP(val)] : "")
18909             );
18910         });
18911         if (temp == NULL)
18912             break;
18913         scan = temp;
18914     }
18915
18916     if (reg_off_by_arg[OP(scan)]) {
18917         ARG_SET(scan, val - scan);
18918     }
18919     else {
18920         NEXT_OFF(scan) = val - scan;
18921     }
18922 }
18923
18924 #ifdef DEBUGGING
18925 /*
18926 - regtail_study - set the next-pointer at the end of a node chain of p to val.
18927 - Look for optimizable sequences at the same time.
18928 - currently only looks for EXACT chains.
18929
18930 This is experimental code. The idea is to use this routine to perform
18931 in place optimizations on branches and groups as they are constructed,
18932 with the long term intention of removing optimization from study_chunk so
18933 that it is purely analytical.
18934
18935 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
18936 to control which is which.
18937
18938 */
18939 /* TODO: All four parms should be const */
18940
18941 STATIC U8
18942 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
18943                       const regnode *val,U32 depth)
18944 {
18945     regnode *scan;
18946     U8 exact = PSEUDO;
18947 #ifdef EXPERIMENTAL_INPLACESCAN
18948     I32 min = 0;
18949 #endif
18950     GET_RE_DEBUG_FLAGS_DECL;
18951
18952     PERL_ARGS_ASSERT_REGTAIL_STUDY;
18953
18954
18955     if (SIZE_ONLY)
18956         return exact;
18957
18958     /* Find last node. */
18959
18960     scan = p;
18961     for (;;) {
18962         regnode * const temp = regnext(scan);
18963 #ifdef EXPERIMENTAL_INPLACESCAN
18964         if (PL_regkind[OP(scan)] == EXACT) {
18965             bool unfolded_multi_char;   /* Unexamined in this routine */
18966             if (join_exact(pRExC_state, scan, &min,
18967                            &unfolded_multi_char, 1, val, depth+1))
18968                 return EXACT;
18969         }
18970 #endif
18971         if ( exact ) {
18972             switch (OP(scan)) {
18973                 case EXACT:
18974                 case EXACTL:
18975                 case EXACTF:
18976                 case EXACTFA_NO_TRIE:
18977                 case EXACTFA:
18978                 case EXACTFU:
18979                 case EXACTFLU8:
18980                 case EXACTFU_SS:
18981                 case EXACTFL:
18982                         if( exact == PSEUDO )
18983                             exact= OP(scan);
18984                         else if ( exact != OP(scan) )
18985                             exact= 0;
18986                 case NOTHING:
18987                     break;
18988                 default:
18989                     exact= 0;
18990             }
18991         }
18992         DEBUG_PARSE_r({
18993             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
18994             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18995             Perl_re_printf( aTHX_  "~ %s (%d) -> %s\n",
18996                 SvPV_nolen_const(RExC_mysv),
18997                 REG_NODE_NUM(scan),
18998                 PL_reg_name[exact]);
18999         });
19000         if (temp == NULL)
19001             break;
19002         scan = temp;
19003     }
19004     DEBUG_PARSE_r({
19005         DEBUG_PARSE_MSG("");
19006         regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
19007         Perl_re_printf( aTHX_
19008                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
19009                       SvPV_nolen_const(RExC_mysv),
19010                       (IV)REG_NODE_NUM(val),
19011                       (IV)(val - scan)
19012         );
19013     });
19014     if (reg_off_by_arg[OP(scan)]) {
19015         ARG_SET(scan, val - scan);
19016     }
19017     else {
19018         NEXT_OFF(scan) = val - scan;
19019     }
19020
19021     return exact;
19022 }
19023 #endif
19024
19025 /*
19026  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
19027  */
19028 #ifdef DEBUGGING
19029
19030 static void
19031 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
19032 {
19033     int bit;
19034     int set=0;
19035
19036     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19037
19038     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
19039         if (flags & (1<<bit)) {
19040             if (!set++ && lead)
19041                 Perl_re_printf( aTHX_  "%s",lead);
19042             Perl_re_printf( aTHX_  "%s ",PL_reg_intflags_name[bit]);
19043         }
19044     }
19045     if (lead)  {
19046         if (set)
19047             Perl_re_printf( aTHX_  "\n");
19048         else
19049             Perl_re_printf( aTHX_  "%s[none-set]\n",lead);
19050     }
19051 }
19052
19053 static void
19054 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
19055 {
19056     int bit;
19057     int set=0;
19058     regex_charset cs;
19059
19060     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19061
19062     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
19063         if (flags & (1<<bit)) {
19064             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
19065                 continue;
19066             }
19067             if (!set++ && lead)
19068                 Perl_re_printf( aTHX_  "%s",lead);
19069             Perl_re_printf( aTHX_  "%s ",PL_reg_extflags_name[bit]);
19070         }
19071     }
19072     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
19073             if (!set++ && lead) {
19074                 Perl_re_printf( aTHX_  "%s",lead);
19075             }
19076             switch (cs) {
19077                 case REGEX_UNICODE_CHARSET:
19078                     Perl_re_printf( aTHX_  "UNICODE");
19079                     break;
19080                 case REGEX_LOCALE_CHARSET:
19081                     Perl_re_printf( aTHX_  "LOCALE");
19082                     break;
19083                 case REGEX_ASCII_RESTRICTED_CHARSET:
19084                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
19085                     break;
19086                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
19087                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
19088                     break;
19089                 default:
19090                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
19091                     break;
19092             }
19093     }
19094     if (lead)  {
19095         if (set)
19096             Perl_re_printf( aTHX_  "\n");
19097         else
19098             Perl_re_printf( aTHX_  "%s[none-set]\n",lead);
19099     }
19100 }
19101 #endif
19102
19103 void
19104 Perl_regdump(pTHX_ const regexp *r)
19105 {
19106 #ifdef DEBUGGING
19107     int i;
19108     SV * const sv = sv_newmortal();
19109     SV *dsv= sv_newmortal();
19110     RXi_GET_DECL(r,ri);
19111     GET_RE_DEBUG_FLAGS_DECL;
19112
19113     PERL_ARGS_ASSERT_REGDUMP;
19114
19115     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
19116
19117     /* Header fields of interest. */
19118     for (i = 0; i < 2; i++) {
19119         if (r->substrs->data[i].substr) {
19120             RE_PV_QUOTED_DECL(s, 0, dsv,
19121                             SvPVX_const(r->substrs->data[i].substr),
19122                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
19123                             PL_dump_re_max_len);
19124             Perl_re_printf( aTHX_
19125                           "%s %s%s at %" IVdf "..%" UVuf " ",
19126                           i ? "floating" : "anchored",
19127                           s,
19128                           RE_SV_TAIL(r->substrs->data[i].substr),
19129                           (IV)r->substrs->data[i].min_offset,
19130                           (UV)r->substrs->data[i].max_offset);
19131         }
19132         else if (r->substrs->data[i].utf8_substr) {
19133             RE_PV_QUOTED_DECL(s, 1, dsv,
19134                             SvPVX_const(r->substrs->data[i].utf8_substr),
19135                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
19136                             30);
19137             Perl_re_printf( aTHX_
19138                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
19139                           i ? "floating" : "anchored",
19140                           s,
19141                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
19142                           (IV)r->substrs->data[i].min_offset,
19143                           (UV)r->substrs->data[i].max_offset);
19144         }
19145     }
19146
19147     if (r->check_substr || r->check_utf8)
19148         Perl_re_printf( aTHX_
19149                       (const char *)
19150                       (   r->check_substr == r->substrs->data[1].substr
19151                        && r->check_utf8   == r->substrs->data[1].utf8_substr
19152                        ? "(checking floating" : "(checking anchored"));
19153     if (r->intflags & PREGf_NOSCAN)
19154         Perl_re_printf( aTHX_  " noscan");
19155     if (r->extflags & RXf_CHECK_ALL)
19156         Perl_re_printf( aTHX_  " isall");
19157     if (r->check_substr || r->check_utf8)
19158         Perl_re_printf( aTHX_  ") ");
19159
19160     if (ri->regstclass) {
19161         regprop(r, sv, ri->regstclass, NULL, NULL);
19162         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
19163     }
19164     if (r->intflags & PREGf_ANCH) {
19165         Perl_re_printf( aTHX_  "anchored");
19166         if (r->intflags & PREGf_ANCH_MBOL)
19167             Perl_re_printf( aTHX_  "(MBOL)");
19168         if (r->intflags & PREGf_ANCH_SBOL)
19169             Perl_re_printf( aTHX_  "(SBOL)");
19170         if (r->intflags & PREGf_ANCH_GPOS)
19171             Perl_re_printf( aTHX_  "(GPOS)");
19172         Perl_re_printf( aTHX_ " ");
19173     }
19174     if (r->intflags & PREGf_GPOS_SEEN)
19175         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
19176     if (r->intflags & PREGf_SKIP)
19177         Perl_re_printf( aTHX_  "plus ");
19178     if (r->intflags & PREGf_IMPLICIT)
19179         Perl_re_printf( aTHX_  "implicit ");
19180     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
19181     if (r->extflags & RXf_EVAL_SEEN)
19182         Perl_re_printf( aTHX_  "with eval ");
19183     Perl_re_printf( aTHX_  "\n");
19184     DEBUG_FLAGS_r({
19185         regdump_extflags("r->extflags: ",r->extflags);
19186         regdump_intflags("r->intflags: ",r->intflags);
19187     });
19188 #else
19189     PERL_ARGS_ASSERT_REGDUMP;
19190     PERL_UNUSED_CONTEXT;
19191     PERL_UNUSED_ARG(r);
19192 #endif  /* DEBUGGING */
19193 }
19194
19195 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
19196 #ifdef DEBUGGING
19197
19198 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
19199      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
19200      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
19201      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
19202      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
19203      || _CC_VERTSPACE != 15
19204 #   error Need to adjust order of anyofs[]
19205 #  endif
19206 static const char * const anyofs[] = {
19207     "\\w",
19208     "\\W",
19209     "\\d",
19210     "\\D",
19211     "[:alpha:]",
19212     "[:^alpha:]",
19213     "[:lower:]",
19214     "[:^lower:]",
19215     "[:upper:]",
19216     "[:^upper:]",
19217     "[:punct:]",
19218     "[:^punct:]",
19219     "[:print:]",
19220     "[:^print:]",
19221     "[:alnum:]",
19222     "[:^alnum:]",
19223     "[:graph:]",
19224     "[:^graph:]",
19225     "[:cased:]",
19226     "[:^cased:]",
19227     "\\s",
19228     "\\S",
19229     "[:blank:]",
19230     "[:^blank:]",
19231     "[:xdigit:]",
19232     "[:^xdigit:]",
19233     "[:cntrl:]",
19234     "[:^cntrl:]",
19235     "[:ascii:]",
19236     "[:^ascii:]",
19237     "\\v",
19238     "\\V"
19239 };
19240 #endif
19241
19242 /*
19243 - regprop - printable representation of opcode, with run time support
19244 */
19245
19246 void
19247 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
19248 {
19249 #ifdef DEBUGGING
19250     int k;
19251     RXi_GET_DECL(prog,progi);
19252     GET_RE_DEBUG_FLAGS_DECL;
19253
19254     PERL_ARGS_ASSERT_REGPROP;
19255
19256     SvPVCLEAR(sv);
19257
19258     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
19259         /* It would be nice to FAIL() here, but this may be called from
19260            regexec.c, and it would be hard to supply pRExC_state. */
19261         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
19262                                               (int)OP(o), (int)REGNODE_MAX);
19263     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
19264
19265     k = PL_regkind[OP(o)];
19266
19267     if (k == EXACT) {
19268         sv_catpvs(sv, " ");
19269         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
19270          * is a crude hack but it may be the best for now since
19271          * we have no flag "this EXACTish node was UTF-8"
19272          * --jhi */
19273         pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
19274                   PL_colors[0], PL_colors[1],
19275                   PERL_PV_ESCAPE_UNI_DETECT |
19276                   PERL_PV_ESCAPE_NONASCII   |
19277                   PERL_PV_PRETTY_ELLIPSES   |
19278                   PERL_PV_PRETTY_LTGT       |
19279                   PERL_PV_PRETTY_NOCLEAR
19280                   );
19281     } else if (k == TRIE) {
19282         /* print the details of the trie in dumpuntil instead, as
19283          * progi->data isn't available here */
19284         const char op = OP(o);
19285         const U32 n = ARG(o);
19286         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
19287                (reg_ac_data *)progi->data->data[n] :
19288                NULL;
19289         const reg_trie_data * const trie
19290             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
19291
19292         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
19293         DEBUG_TRIE_COMPILE_r({
19294           if (trie->jump)
19295             sv_catpvs(sv, "(JUMP)");
19296           Perl_sv_catpvf(aTHX_ sv,
19297             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
19298             (UV)trie->startstate,
19299             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
19300             (UV)trie->wordcount,
19301             (UV)trie->minlen,
19302             (UV)trie->maxlen,
19303             (UV)TRIE_CHARCOUNT(trie),
19304             (UV)trie->uniquecharcount
19305           );
19306         });
19307         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
19308             sv_catpvs(sv, "[");
19309             (void) put_charclass_bitmap_innards(sv,
19310                                                 ((IS_ANYOF_TRIE(op))
19311                                                  ? ANYOF_BITMAP(o)
19312                                                  : TRIE_BITMAP(trie)),
19313                                                 NULL,
19314                                                 NULL,
19315                                                 NULL,
19316                                                 FALSE
19317                                                );
19318             sv_catpvs(sv, "]");
19319         }
19320     } else if (k == CURLY) {
19321         U32 lo = ARG1(o), hi = ARG2(o);
19322         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
19323             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
19324         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
19325         if (hi == REG_INFTY)
19326             sv_catpvs(sv, "INFTY");
19327         else
19328             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
19329         sv_catpvs(sv, "}");
19330     }
19331     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
19332         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
19333     else if (k == REF || k == OPEN || k == CLOSE
19334              || k == GROUPP || OP(o)==ACCEPT)
19335     {
19336         AV *name_list= NULL;
19337         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
19338         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
19339         if ( RXp_PAREN_NAMES(prog) ) {
19340             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
19341         } else if ( pRExC_state ) {
19342             name_list= RExC_paren_name_list;
19343         }
19344         if (name_list) {
19345             if ( k != REF || (OP(o) < NREF)) {
19346                 SV **name= av_fetch(name_list, parno, 0 );
19347                 if (name)
19348                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19349             }
19350             else {
19351                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
19352                 I32 *nums=(I32*)SvPVX(sv_dat);
19353                 SV **name= av_fetch(name_list, nums[0], 0 );
19354                 I32 n;
19355                 if (name) {
19356                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
19357                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
19358                                     (n ? "," : ""), (IV)nums[n]);
19359                     }
19360                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19361                 }
19362             }
19363         }
19364         if ( k == REF && reginfo) {
19365             U32 n = ARG(o);  /* which paren pair */
19366             I32 ln = prog->offs[n].start;
19367             if (prog->lastparen < n || ln == -1)
19368                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
19369             else if (ln == prog->offs[n].end)
19370                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
19371             else {
19372                 const char *s = reginfo->strbeg + ln;
19373                 Perl_sv_catpvf(aTHX_ sv, ": ");
19374                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
19375                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
19376             }
19377         }
19378     } else if (k == GOSUB) {
19379         AV *name_list= NULL;
19380         if ( RXp_PAREN_NAMES(prog) ) {
19381             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
19382         } else if ( pRExC_state ) {
19383             name_list= RExC_paren_name_list;
19384         }
19385
19386         /* Paren and offset */
19387         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
19388                 (int)((o + (int)ARG2L(o)) - progi->program) );
19389         if (name_list) {
19390             SV **name= av_fetch(name_list, ARG(o), 0 );
19391             if (name)
19392                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19393         }
19394     }
19395     else if (k == LOGICAL)
19396         /* 2: embedded, otherwise 1 */
19397         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
19398     else if (k == ANYOF) {
19399         const U8 flags = ANYOF_FLAGS(o);
19400         bool do_sep = FALSE;    /* Do we need to separate various components of
19401                                    the output? */
19402         /* Set if there is still an unresolved user-defined property */
19403         SV *unresolved                = NULL;
19404
19405         /* Things that are ignored except when the runtime locale is UTF-8 */
19406         SV *only_utf8_locale_invlist = NULL;
19407
19408         /* Code points that don't fit in the bitmap */
19409         SV *nonbitmap_invlist = NULL;
19410
19411         /* And things that aren't in the bitmap, but are small enough to be */
19412         SV* bitmap_range_not_in_bitmap = NULL;
19413
19414         const bool inverted = flags & ANYOF_INVERT;
19415
19416         if (OP(o) == ANYOFL) {
19417             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
19418                 sv_catpvs(sv, "{utf8-locale-reqd}");
19419             }
19420             if (flags & ANYOFL_FOLD) {
19421                 sv_catpvs(sv, "{i}");
19422             }
19423         }
19424
19425         /* If there is stuff outside the bitmap, get it */
19426         if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
19427             (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
19428                                                 &unresolved,
19429                                                 &only_utf8_locale_invlist,
19430                                                 &nonbitmap_invlist);
19431             /* The non-bitmap data may contain stuff that could fit in the
19432              * bitmap.  This could come from a user-defined property being
19433              * finally resolved when this call was done; or much more likely
19434              * because there are matches that require UTF-8 to be valid, and so
19435              * aren't in the bitmap.  This is teased apart later */
19436             _invlist_intersection(nonbitmap_invlist,
19437                                   PL_InBitmap,
19438                                   &bitmap_range_not_in_bitmap);
19439             /* Leave just the things that don't fit into the bitmap */
19440             _invlist_subtract(nonbitmap_invlist,
19441                               PL_InBitmap,
19442                               &nonbitmap_invlist);
19443         }
19444
19445         /* Obey this flag to add all above-the-bitmap code points */
19446         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
19447             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
19448                                                       NUM_ANYOF_CODE_POINTS,
19449                                                       UV_MAX);
19450         }
19451
19452         /* Ready to start outputting.  First, the initial left bracket */
19453         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19454
19455         /* Then all the things that could fit in the bitmap */
19456         do_sep = put_charclass_bitmap_innards(sv,
19457                                               ANYOF_BITMAP(o),
19458                                               bitmap_range_not_in_bitmap,
19459                                               only_utf8_locale_invlist,
19460                                               o,
19461
19462                                               /* Can't try inverting for a
19463                                                * better display if there are
19464                                                * things that haven't been
19465                                                * resolved */
19466                                               unresolved != NULL);
19467         SvREFCNT_dec(bitmap_range_not_in_bitmap);
19468
19469         /* If there are user-defined properties which haven't been defined yet,
19470          * output them.  If the result is not to be inverted, it is clearest to
19471          * output them in a separate [] from the bitmap range stuff.  If the
19472          * result is to be complemented, we have to show everything in one [],
19473          * as the inversion applies to the whole thing.  Use {braces} to
19474          * separate them from anything in the bitmap and anything above the
19475          * bitmap. */
19476         if (unresolved) {
19477             if (inverted) {
19478                 if (! do_sep) { /* If didn't output anything in the bitmap */
19479                     sv_catpvs(sv, "^");
19480                 }
19481                 sv_catpvs(sv, "{");
19482             }
19483             else if (do_sep) {
19484                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
19485             }
19486             sv_catsv(sv, unresolved);
19487             if (inverted) {
19488                 sv_catpvs(sv, "}");
19489             }
19490             do_sep = ! inverted;
19491         }
19492
19493         /* And, finally, add the above-the-bitmap stuff */
19494         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
19495             SV* contents;
19496
19497             /* See if truncation size is overridden */
19498             const STRLEN dump_len = (PL_dump_re_max_len > 256)
19499                                     ? PL_dump_re_max_len
19500                                     : 256;
19501
19502             /* This is output in a separate [] */
19503             if (do_sep) {
19504                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
19505             }
19506
19507             /* And, for easy of understanding, it is shown in the
19508              * uncomplemented form if possible.  The one exception being if
19509              * there are unresolved items, where the inversion has to be
19510              * delayed until runtime */
19511             if (inverted && ! unresolved) {
19512                 _invlist_invert(nonbitmap_invlist);
19513                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
19514             }
19515
19516             contents = invlist_contents(nonbitmap_invlist,
19517                                         FALSE /* output suitable for catsv */
19518                                        );
19519
19520             /* If the output is shorter than the permissible maximum, just do it. */
19521             if (SvCUR(contents) <= dump_len) {
19522                 sv_catsv(sv, contents);
19523             }
19524             else {
19525                 const char * contents_string = SvPVX(contents);
19526                 STRLEN i = dump_len;
19527
19528                 /* Otherwise, start at the permissible max and work back to the
19529                  * first break possibility */
19530                 while (i > 0 && contents_string[i] != ' ') {
19531                     i--;
19532                 }
19533                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
19534                                        find a legal break */
19535                     i = dump_len;
19536                 }
19537
19538                 sv_catpvn(sv, contents_string, i);
19539                 sv_catpvs(sv, "...");
19540             }
19541
19542             SvREFCNT_dec_NN(contents);
19543             SvREFCNT_dec_NN(nonbitmap_invlist);
19544         }
19545
19546         /* And finally the matching, closing ']' */
19547         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
19548
19549         SvREFCNT_dec(unresolved);
19550     }
19551     else if (k == POSIXD || k == NPOSIXD) {
19552         U8 index = FLAGS(o) * 2;
19553         if (index < C_ARRAY_LENGTH(anyofs)) {
19554             if (*anyofs[index] != '[')  {
19555                 sv_catpv(sv, "[");
19556             }
19557             sv_catpv(sv, anyofs[index]);
19558             if (*anyofs[index] != '[')  {
19559                 sv_catpv(sv, "]");
19560             }
19561         }
19562         else {
19563             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
19564         }
19565     }
19566     else if (k == BOUND || k == NBOUND) {
19567         /* Must be synced with order of 'bound_type' in regcomp.h */
19568         const char * const bounds[] = {
19569             "",      /* Traditional */
19570             "{gcb}",
19571             "{lb}",
19572             "{sb}",
19573             "{wb}"
19574         };
19575         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
19576         sv_catpv(sv, bounds[FLAGS(o)]);
19577     }
19578     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
19579         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
19580     else if (OP(o) == SBOL)
19581         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
19582
19583     /* add on the verb argument if there is one */
19584     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
19585         if ( ARG(o) )
19586             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
19587                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
19588         else
19589             sv_catpvs(sv, ":NULL");
19590     }
19591 #else
19592     PERL_UNUSED_CONTEXT;
19593     PERL_UNUSED_ARG(sv);
19594     PERL_UNUSED_ARG(o);
19595     PERL_UNUSED_ARG(prog);
19596     PERL_UNUSED_ARG(reginfo);
19597     PERL_UNUSED_ARG(pRExC_state);
19598 #endif  /* DEBUGGING */
19599 }
19600
19601
19602
19603 SV *
19604 Perl_re_intuit_string(pTHX_ REGEXP * const r)
19605 {                               /* Assume that RE_INTUIT is set */
19606     struct regexp *const prog = ReANY(r);
19607     GET_RE_DEBUG_FLAGS_DECL;
19608
19609     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
19610     PERL_UNUSED_CONTEXT;
19611
19612     DEBUG_COMPILE_r(
19613         {
19614             const char * const s = SvPV_nolen_const(RX_UTF8(r)
19615                       ? prog->check_utf8 : prog->check_substr);
19616
19617             if (!PL_colorset) reginitcolors();
19618             Perl_re_printf( aTHX_
19619                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
19620                       PL_colors[4],
19621                       RX_UTF8(r) ? "utf8 " : "",
19622                       PL_colors[5],PL_colors[0],
19623                       s,
19624                       PL_colors[1],
19625                       (strlen(s) > PL_dump_re_max_len ? "..." : ""));
19626         } );
19627
19628     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
19629     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
19630 }
19631
19632 /*
19633    pregfree()
19634
19635    handles refcounting and freeing the perl core regexp structure. When
19636    it is necessary to actually free the structure the first thing it
19637    does is call the 'free' method of the regexp_engine associated to
19638    the regexp, allowing the handling of the void *pprivate; member
19639    first. (This routine is not overridable by extensions, which is why
19640    the extensions free is called first.)
19641
19642    See regdupe and regdupe_internal if you change anything here.
19643 */
19644 #ifndef PERL_IN_XSUB_RE
19645 void
19646 Perl_pregfree(pTHX_ REGEXP *r)
19647 {
19648     SvREFCNT_dec(r);
19649 }
19650
19651 void
19652 Perl_pregfree2(pTHX_ REGEXP *rx)
19653 {
19654     struct regexp *const r = ReANY(rx);
19655     GET_RE_DEBUG_FLAGS_DECL;
19656
19657     PERL_ARGS_ASSERT_PREGFREE2;
19658
19659     if (r->mother_re) {
19660         ReREFCNT_dec(r->mother_re);
19661     } else {
19662         CALLREGFREE_PVT(rx); /* free the private data */
19663         SvREFCNT_dec(RXp_PAREN_NAMES(r));
19664     }
19665     if (r->substrs) {
19666         int i;
19667         for (i = 0; i < 2; i++) {
19668             SvREFCNT_dec(r->substrs->data[i].substr);
19669             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
19670         }
19671         Safefree(r->substrs);
19672     }
19673     RX_MATCH_COPY_FREE(rx);
19674 #ifdef PERL_ANY_COW
19675     SvREFCNT_dec(r->saved_copy);
19676 #endif
19677     Safefree(r->offs);
19678     SvREFCNT_dec(r->qr_anoncv);
19679     if (r->recurse_locinput)
19680         Safefree(r->recurse_locinput);
19681 }
19682
19683
19684 /*  reg_temp_copy()
19685
19686     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
19687     except that dsv will be created if NULL.
19688
19689     This function is used in two main ways. First to implement
19690         $r = qr/....; $s = $$r;
19691
19692     Secondly, it is used as a hacky workaround to the structural issue of
19693     match results
19694     being stored in the regexp structure which is in turn stored in
19695     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
19696     could be PL_curpm in multiple contexts, and could require multiple
19697     result sets being associated with the pattern simultaneously, such
19698     as when doing a recursive match with (??{$qr})
19699
19700     The solution is to make a lightweight copy of the regexp structure
19701     when a qr// is returned from the code executed by (??{$qr}) this
19702     lightweight copy doesn't actually own any of its data except for
19703     the starp/end and the actual regexp structure itself.
19704
19705 */
19706
19707
19708 REGEXP *
19709 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
19710 {
19711     struct regexp *drx;
19712     struct regexp *const srx = ReANY(ssv);
19713     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
19714
19715     PERL_ARGS_ASSERT_REG_TEMP_COPY;
19716
19717     if (!dsv)
19718         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
19719     else {
19720         SvOK_off((SV *)dsv);
19721         if (islv) {
19722             /* For PVLVs, the head (sv_any) points to an XPVLV, while
19723              * the LV's xpvlenu_rx will point to a regexp body, which
19724              * we allocate here */
19725             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
19726             assert(!SvPVX(dsv));
19727             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
19728             temp->sv_any = NULL;
19729             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
19730             SvREFCNT_dec_NN(temp);
19731             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
19732                ing below will not set it. */
19733             SvCUR_set(dsv, SvCUR(ssv));
19734         }
19735     }
19736     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
19737        sv_force_normal(sv) is called.  */
19738     SvFAKE_on(dsv);
19739     drx = ReANY(dsv);
19740
19741     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
19742     SvPV_set(dsv, RX_WRAPPED(ssv));
19743     /* We share the same string buffer as the original regexp, on which we
19744        hold a reference count, incremented when mother_re is set below.
19745        The string pointer is copied here, being part of the regexp struct.
19746      */
19747     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
19748            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
19749     if (!islv)
19750         SvLEN_set(dsv, 0);
19751     if (srx->offs) {
19752         const I32 npar = srx->nparens+1;
19753         Newx(drx->offs, npar, regexp_paren_pair);
19754         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
19755     }
19756     if (srx->substrs) {
19757         int i;
19758         Newx(drx->substrs, 1, struct reg_substr_data);
19759         StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
19760
19761         for (i = 0; i < 2; i++) {
19762             SvREFCNT_inc_void(drx->substrs->data[i].substr);
19763             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
19764         }
19765
19766         /* check_substr and check_utf8, if non-NULL, point to either their
19767            anchored or float namesakes, and don't hold a second reference.  */
19768     }
19769     RX_MATCH_COPIED_off(dsv);
19770 #ifdef PERL_ANY_COW
19771     drx->saved_copy = NULL;
19772 #endif
19773     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
19774     SvREFCNT_inc_void(drx->qr_anoncv);
19775     if (srx->recurse_locinput)
19776         Newx(drx->recurse_locinput,srx->nparens + 1,char *);
19777
19778     return dsv;
19779 }
19780 #endif
19781
19782
19783 /* regfree_internal()
19784
19785    Free the private data in a regexp. This is overloadable by
19786    extensions. Perl takes care of the regexp structure in pregfree(),
19787    this covers the *pprivate pointer which technically perl doesn't
19788    know about, however of course we have to handle the
19789    regexp_internal structure when no extension is in use.
19790
19791    Note this is called before freeing anything in the regexp
19792    structure.
19793  */
19794
19795 void
19796 Perl_regfree_internal(pTHX_ REGEXP * const rx)
19797 {
19798     struct regexp *const r = ReANY(rx);
19799     RXi_GET_DECL(r,ri);
19800     GET_RE_DEBUG_FLAGS_DECL;
19801
19802     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
19803
19804     DEBUG_COMPILE_r({
19805         if (!PL_colorset)
19806             reginitcolors();
19807         {
19808             SV *dsv= sv_newmortal();
19809             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
19810                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
19811             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
19812                 PL_colors[4],PL_colors[5],s);
19813         }
19814     });
19815 #ifdef RE_TRACK_PATTERN_OFFSETS
19816     if (ri->u.offsets)
19817         Safefree(ri->u.offsets);             /* 20010421 MJD */
19818 #endif
19819     if (ri->code_blocks)
19820         S_free_codeblocks(aTHX_ ri->code_blocks);
19821
19822     if (ri->data) {
19823         int n = ri->data->count;
19824
19825         while (--n >= 0) {
19826           /* If you add a ->what type here, update the comment in regcomp.h */
19827             switch (ri->data->what[n]) {
19828             case 'a':
19829             case 'r':
19830             case 's':
19831             case 'S':
19832             case 'u':
19833                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
19834                 break;
19835             case 'f':
19836                 Safefree(ri->data->data[n]);
19837                 break;
19838             case 'l':
19839             case 'L':
19840                 break;
19841             case 'T':
19842                 { /* Aho Corasick add-on structure for a trie node.
19843                      Used in stclass optimization only */
19844                     U32 refcount;
19845                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
19846 #ifdef USE_ITHREADS
19847                     dVAR;
19848 #endif
19849                     OP_REFCNT_LOCK;
19850                     refcount = --aho->refcount;
19851                     OP_REFCNT_UNLOCK;
19852                     if ( !refcount ) {
19853                         PerlMemShared_free(aho->states);
19854                         PerlMemShared_free(aho->fail);
19855                          /* do this last!!!! */
19856                         PerlMemShared_free(ri->data->data[n]);
19857                         /* we should only ever get called once, so
19858                          * assert as much, and also guard the free
19859                          * which /might/ happen twice. At the least
19860                          * it will make code anlyzers happy and it
19861                          * doesn't cost much. - Yves */
19862                         assert(ri->regstclass);
19863                         if (ri->regstclass) {
19864                             PerlMemShared_free(ri->regstclass);
19865                             ri->regstclass = 0;
19866                         }
19867                     }
19868                 }
19869                 break;
19870             case 't':
19871                 {
19872                     /* trie structure. */
19873                     U32 refcount;
19874                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
19875 #ifdef USE_ITHREADS
19876                     dVAR;
19877 #endif
19878                     OP_REFCNT_LOCK;
19879                     refcount = --trie->refcount;
19880                     OP_REFCNT_UNLOCK;
19881                     if ( !refcount ) {
19882                         PerlMemShared_free(trie->charmap);
19883                         PerlMemShared_free(trie->states);
19884                         PerlMemShared_free(trie->trans);
19885                         if (trie->bitmap)
19886                             PerlMemShared_free(trie->bitmap);
19887                         if (trie->jump)
19888                             PerlMemShared_free(trie->jump);
19889                         PerlMemShared_free(trie->wordinfo);
19890                         /* do this last!!!! */
19891                         PerlMemShared_free(ri->data->data[n]);
19892                     }
19893                 }
19894                 break;
19895             default:
19896                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
19897                                                     ri->data->what[n]);
19898             }
19899         }
19900         Safefree(ri->data->what);
19901         Safefree(ri->data);
19902     }
19903
19904     Safefree(ri);
19905 }
19906
19907 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
19908 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
19909 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
19910
19911 /*
19912    re_dup_guts - duplicate a regexp.
19913
19914    This routine is expected to clone a given regexp structure. It is only
19915    compiled under USE_ITHREADS.
19916
19917    After all of the core data stored in struct regexp is duplicated
19918    the regexp_engine.dupe method is used to copy any private data
19919    stored in the *pprivate pointer. This allows extensions to handle
19920    any duplication it needs to do.
19921
19922    See pregfree() and regfree_internal() if you change anything here.
19923 */
19924 #if defined(USE_ITHREADS)
19925 #ifndef PERL_IN_XSUB_RE
19926 void
19927 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
19928 {
19929     dVAR;
19930     I32 npar;
19931     const struct regexp *r = ReANY(sstr);
19932     struct regexp *ret = ReANY(dstr);
19933
19934     PERL_ARGS_ASSERT_RE_DUP_GUTS;
19935
19936     npar = r->nparens+1;
19937     Newx(ret->offs, npar, regexp_paren_pair);
19938     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
19939
19940     if (ret->substrs) {
19941         /* Do it this way to avoid reading from *r after the StructCopy().
19942            That way, if any of the sv_dup_inc()s dislodge *r from the L1
19943            cache, it doesn't matter.  */
19944         int i;
19945         const bool anchored = r->check_substr
19946             ? r->check_substr == r->substrs->data[0].substr
19947             : r->check_utf8   == r->substrs->data[0].utf8_substr;
19948         Newx(ret->substrs, 1, struct reg_substr_data);
19949         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
19950
19951         for (i = 0; i < 2; i++) {
19952             ret->substrs->data[i].substr =
19953                         sv_dup_inc(ret->substrs->data[i].substr, param);
19954             ret->substrs->data[i].utf8_substr =
19955                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
19956         }
19957
19958         /* check_substr and check_utf8, if non-NULL, point to either their
19959            anchored or float namesakes, and don't hold a second reference.  */
19960
19961         if (ret->check_substr) {
19962             if (anchored) {
19963                 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
19964
19965                 ret->check_substr = ret->substrs->data[0].substr;
19966                 ret->check_utf8   = ret->substrs->data[0].utf8_substr;
19967             } else {
19968                 assert(r->check_substr == r->substrs->data[1].substr);
19969                 assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
19970
19971                 ret->check_substr = ret->substrs->data[1].substr;
19972                 ret->check_utf8   = ret->substrs->data[1].utf8_substr;
19973             }
19974         } else if (ret->check_utf8) {
19975             if (anchored) {
19976                 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
19977             } else {
19978                 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
19979             }
19980         }
19981     }
19982
19983     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
19984     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
19985     if (r->recurse_locinput)
19986         Newx(ret->recurse_locinput,r->nparens + 1,char *);
19987
19988     if (ret->pprivate)
19989         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
19990
19991     if (RX_MATCH_COPIED(dstr))
19992         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
19993     else
19994         ret->subbeg = NULL;
19995 #ifdef PERL_ANY_COW
19996     ret->saved_copy = NULL;
19997 #endif
19998
19999     /* Whether mother_re be set or no, we need to copy the string.  We
20000        cannot refrain from copying it when the storage points directly to
20001        our mother regexp, because that's
20002                1: a buffer in a different thread
20003                2: something we no longer hold a reference on
20004                so we need to copy it locally.  */
20005     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
20006     ret->mother_re   = NULL;
20007 }
20008 #endif /* PERL_IN_XSUB_RE */
20009
20010 /*
20011    regdupe_internal()
20012
20013    This is the internal complement to regdupe() which is used to copy
20014    the structure pointed to by the *pprivate pointer in the regexp.
20015    This is the core version of the extension overridable cloning hook.
20016    The regexp structure being duplicated will be copied by perl prior
20017    to this and will be provided as the regexp *r argument, however
20018    with the /old/ structures pprivate pointer value. Thus this routine
20019    may override any copying normally done by perl.
20020
20021    It returns a pointer to the new regexp_internal structure.
20022 */
20023
20024 void *
20025 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
20026 {
20027     dVAR;
20028     struct regexp *const r = ReANY(rx);
20029     regexp_internal *reti;
20030     int len;
20031     RXi_GET_DECL(r,ri);
20032
20033     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
20034
20035     len = ProgLen(ri);
20036
20037     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
20038           char, regexp_internal);
20039     Copy(ri->program, reti->program, len+1, regnode);
20040
20041
20042     if (ri->code_blocks) {
20043         int n;
20044         Newx(reti->code_blocks, 1, struct reg_code_blocks);
20045         Newx(reti->code_blocks->cb, ri->code_blocks->count,
20046                     struct reg_code_block);
20047         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
20048              ri->code_blocks->count, struct reg_code_block);
20049         for (n = 0; n < ri->code_blocks->count; n++)
20050              reti->code_blocks->cb[n].src_regex = (REGEXP*)
20051                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
20052         reti->code_blocks->count = ri->code_blocks->count;
20053         reti->code_blocks->refcnt = 1;
20054     }
20055     else
20056         reti->code_blocks = NULL;
20057
20058     reti->regstclass = NULL;
20059
20060     if (ri->data) {
20061         struct reg_data *d;
20062         const int count = ri->data->count;
20063         int i;
20064
20065         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
20066                 char, struct reg_data);
20067         Newx(d->what, count, U8);
20068
20069         d->count = count;
20070         for (i = 0; i < count; i++) {
20071             d->what[i] = ri->data->what[i];
20072             switch (d->what[i]) {
20073                 /* see also regcomp.h and regfree_internal() */
20074             case 'a': /* actually an AV, but the dup function is identical.
20075                          values seem to be "plain sv's" generally. */
20076             case 'r': /* a compiled regex (but still just another SV) */
20077             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
20078                          this use case should go away, the code could have used
20079                          'a' instead - see S_set_ANYOF_arg() for array contents. */
20080             case 'S': /* actually an SV, but the dup function is identical.  */
20081             case 'u': /* actually an HV, but the dup function is identical.
20082                          values are "plain sv's" */
20083                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
20084                 break;
20085             case 'f':
20086                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
20087                  * patterns which could start with several different things. Pre-TRIE
20088                  * this was more important than it is now, however this still helps
20089                  * in some places, for instance /x?a+/ might produce a SSC equivalent
20090                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
20091                  * in regexec.c
20092                  */
20093                 /* This is cheating. */
20094                 Newx(d->data[i], 1, regnode_ssc);
20095                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
20096                 reti->regstclass = (regnode*)d->data[i];
20097                 break;
20098             case 'T':
20099                 /* AHO-CORASICK fail table */
20100                 /* Trie stclasses are readonly and can thus be shared
20101                  * without duplication. We free the stclass in pregfree
20102                  * when the corresponding reg_ac_data struct is freed.
20103                  */
20104                 reti->regstclass= ri->regstclass;
20105                 /* FALLTHROUGH */
20106             case 't':
20107                 /* TRIE transition table */
20108                 OP_REFCNT_LOCK;
20109                 ((reg_trie_data*)ri->data->data[i])->refcount++;
20110                 OP_REFCNT_UNLOCK;
20111                 /* FALLTHROUGH */
20112             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
20113             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
20114                          is not from another regexp */
20115                 d->data[i] = ri->data->data[i];
20116                 break;
20117             default:
20118                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
20119                                                            ri->data->what[i]);
20120             }
20121         }
20122
20123         reti->data = d;
20124     }
20125     else
20126         reti->data = NULL;
20127
20128     reti->name_list_idx = ri->name_list_idx;
20129
20130 #ifdef RE_TRACK_PATTERN_OFFSETS
20131     if (ri->u.offsets) {
20132         Newx(reti->u.offsets, 2*len+1, U32);
20133         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
20134     }
20135 #else
20136     SetProgLen(reti,len);
20137 #endif
20138
20139     return (void*)reti;
20140 }
20141
20142 #endif    /* USE_ITHREADS */
20143
20144 #ifndef PERL_IN_XSUB_RE
20145
20146 /*
20147  - regnext - dig the "next" pointer out of a node
20148  */
20149 regnode *
20150 Perl_regnext(pTHX_ regnode *p)
20151 {
20152     I32 offset;
20153
20154     if (!p)
20155         return(NULL);
20156
20157     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
20158         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
20159                                                 (int)OP(p), (int)REGNODE_MAX);
20160     }
20161
20162     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
20163     if (offset == 0)
20164         return(NULL);
20165
20166     return(p+offset);
20167 }
20168 #endif
20169
20170 STATIC void
20171 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
20172 {
20173     va_list args;
20174     STRLEN l1 = strlen(pat1);
20175     STRLEN l2 = strlen(pat2);
20176     char buf[512];
20177     SV *msv;
20178     const char *message;
20179
20180     PERL_ARGS_ASSERT_RE_CROAK2;
20181
20182     if (l1 > 510)
20183         l1 = 510;
20184     if (l1 + l2 > 510)
20185         l2 = 510 - l1;
20186     Copy(pat1, buf, l1 , char);
20187     Copy(pat2, buf + l1, l2 , char);
20188     buf[l1 + l2] = '\n';
20189     buf[l1 + l2 + 1] = '\0';
20190     va_start(args, pat2);
20191     msv = vmess(buf, &args);
20192     va_end(args);
20193     message = SvPV_const(msv,l1);
20194     if (l1 > 512)
20195         l1 = 512;
20196     Copy(message, buf, l1 , char);
20197     /* l1-1 to avoid \n */
20198     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
20199 }
20200
20201 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
20202
20203 #ifndef PERL_IN_XSUB_RE
20204 void
20205 Perl_save_re_context(pTHX)
20206 {
20207     I32 nparens = -1;
20208     I32 i;
20209
20210     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
20211
20212     if (PL_curpm) {
20213         const REGEXP * const rx = PM_GETRE(PL_curpm);
20214         if (rx)
20215             nparens = RX_NPARENS(rx);
20216     }
20217
20218     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
20219      * that PL_curpm will be null, but that utf8.pm and the modules it
20220      * loads will only use $1..$3.
20221      * The t/porting/re_context.t test file checks this assumption.
20222      */
20223     if (nparens == -1)
20224         nparens = 3;
20225
20226     for (i = 1; i <= nparens; i++) {
20227         char digits[TYPE_CHARS(long)];
20228         const STRLEN len = my_snprintf(digits, sizeof(digits),
20229                                        "%lu", (long)i);
20230         GV *const *const gvp
20231             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
20232
20233         if (gvp) {
20234             GV * const gv = *gvp;
20235             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
20236                 save_scalar(gv);
20237         }
20238     }
20239 }
20240 #endif
20241
20242 #ifdef DEBUGGING
20243
20244 STATIC void
20245 S_put_code_point(pTHX_ SV *sv, UV c)
20246 {
20247     PERL_ARGS_ASSERT_PUT_CODE_POINT;
20248
20249     if (c > 255) {
20250         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
20251     }
20252     else if (isPRINT(c)) {
20253         const char string = (char) c;
20254
20255         /* We use {phrase} as metanotation in the class, so also escape literal
20256          * braces */
20257         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
20258             sv_catpvs(sv, "\\");
20259         sv_catpvn(sv, &string, 1);
20260     }
20261     else if (isMNEMONIC_CNTRL(c)) {
20262         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
20263     }
20264     else {
20265         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
20266     }
20267 }
20268
20269 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
20270
20271 STATIC void
20272 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
20273 {
20274     /* Appends to 'sv' a displayable version of the range of code points from
20275      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
20276      * that have them, when they occur at the beginning or end of the range.
20277      * It uses hex to output the remaining code points, unless 'allow_literals'
20278      * is true, in which case the printable ASCII ones are output as-is (though
20279      * some of these will be escaped by put_code_point()).
20280      *
20281      * NOTE:  This is designed only for printing ranges of code points that fit
20282      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
20283      */
20284
20285     const unsigned int min_range_count = 3;
20286
20287     assert(start <= end);
20288
20289     PERL_ARGS_ASSERT_PUT_RANGE;
20290
20291     while (start <= end) {
20292         UV this_end;
20293         const char * format;
20294
20295         if (end - start < min_range_count) {
20296
20297             /* Output chars individually when they occur in short ranges */
20298             for (; start <= end; start++) {
20299                 put_code_point(sv, start);
20300             }
20301             break;
20302         }
20303
20304         /* If permitted by the input options, and there is a possibility that
20305          * this range contains a printable literal, look to see if there is
20306          * one. */
20307         if (allow_literals && start <= MAX_PRINT_A) {
20308
20309             /* If the character at the beginning of the range isn't an ASCII
20310              * printable, effectively split the range into two parts:
20311              *  1) the portion before the first such printable,
20312              *  2) the rest
20313              * and output them separately. */
20314             if (! isPRINT_A(start)) {
20315                 UV temp_end = start + 1;
20316
20317                 /* There is no point looking beyond the final possible
20318                  * printable, in MAX_PRINT_A */
20319                 UV max = MIN(end, MAX_PRINT_A);
20320
20321                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
20322                     temp_end++;
20323                 }
20324
20325                 /* Here, temp_end points to one beyond the first printable if
20326                  * found, or to one beyond 'max' if not.  If none found, make
20327                  * sure that we use the entire range */
20328                 if (temp_end > MAX_PRINT_A) {
20329                     temp_end = end + 1;
20330                 }
20331
20332                 /* Output the first part of the split range: the part that
20333                  * doesn't have printables, with the parameter set to not look
20334                  * for literals (otherwise we would infinitely recurse) */
20335                 put_range(sv, start, temp_end - 1, FALSE);
20336
20337                 /* The 2nd part of the range (if any) starts here. */
20338                 start = temp_end;
20339
20340                 /* We do a continue, instead of dropping down, because even if
20341                  * the 2nd part is non-empty, it could be so short that we want
20342                  * to output it as individual characters, as tested for at the
20343                  * top of this loop.  */
20344                 continue;
20345             }
20346
20347             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
20348              * output a sub-range of just the digits or letters, then process
20349              * the remaining portion as usual. */
20350             if (isALPHANUMERIC_A(start)) {
20351                 UV mask = (isDIGIT_A(start))
20352                            ? _CC_DIGIT
20353                              : isUPPER_A(start)
20354                                ? _CC_UPPER
20355                                : _CC_LOWER;
20356                 UV temp_end = start + 1;
20357
20358                 /* Find the end of the sub-range that includes just the
20359                  * characters in the same class as the first character in it */
20360                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
20361                     temp_end++;
20362                 }
20363                 temp_end--;
20364
20365                 /* For short ranges, don't duplicate the code above to output
20366                  * them; just call recursively */
20367                 if (temp_end - start < min_range_count) {
20368                     put_range(sv, start, temp_end, FALSE);
20369                 }
20370                 else {  /* Output as a range */
20371                     put_code_point(sv, start);
20372                     sv_catpvs(sv, "-");
20373                     put_code_point(sv, temp_end);
20374                 }
20375                 start = temp_end + 1;
20376                 continue;
20377             }
20378
20379             /* We output any other printables as individual characters */
20380             if (isPUNCT_A(start) || isSPACE_A(start)) {
20381                 while (start <= end && (isPUNCT_A(start)
20382                                         || isSPACE_A(start)))
20383                 {
20384                     put_code_point(sv, start);
20385                     start++;
20386                 }
20387                 continue;
20388             }
20389         } /* End of looking for literals */
20390
20391         /* Here is not to output as a literal.  Some control characters have
20392          * mnemonic names.  Split off any of those at the beginning and end of
20393          * the range to print mnemonically.  It isn't possible for many of
20394          * these to be in a row, so this won't overwhelm with output */
20395         if (   start <= end
20396             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
20397         {
20398             while (isMNEMONIC_CNTRL(start) && start <= end) {
20399                 put_code_point(sv, start);
20400                 start++;
20401             }
20402
20403             /* If this didn't take care of the whole range ... */
20404             if (start <= end) {
20405
20406                 /* Look backwards from the end to find the final non-mnemonic
20407                  * */
20408                 UV temp_end = end;
20409                 while (isMNEMONIC_CNTRL(temp_end)) {
20410                     temp_end--;
20411                 }
20412
20413                 /* And separately output the interior range that doesn't start
20414                  * or end with mnemonics */
20415                 put_range(sv, start, temp_end, FALSE);
20416
20417                 /* Then output the mnemonic trailing controls */
20418                 start = temp_end + 1;
20419                 while (start <= end) {
20420                     put_code_point(sv, start);
20421                     start++;
20422                 }
20423                 break;
20424             }
20425         }
20426
20427         /* As a final resort, output the range or subrange as hex. */
20428
20429         this_end = (end < NUM_ANYOF_CODE_POINTS)
20430                     ? end
20431                     : NUM_ANYOF_CODE_POINTS - 1;
20432 #if NUM_ANYOF_CODE_POINTS > 256
20433         format = (this_end < 256)
20434                  ? "\\x%02" UVXf "-\\x%02" UVXf
20435                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
20436 #else
20437         format = "\\x%02" UVXf "-\\x%02" UVXf;
20438 #endif
20439         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
20440         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
20441         GCC_DIAG_RESTORE_STMT;
20442         break;
20443     }
20444 }
20445
20446 STATIC void
20447 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
20448 {
20449     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
20450      * 'invlist' */
20451
20452     UV start, end;
20453     bool allow_literals = TRUE;
20454
20455     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
20456
20457     /* Generally, it is more readable if printable characters are output as
20458      * literals, but if a range (nearly) spans all of them, it's best to output
20459      * it as a single range.  This code will use a single range if all but 2
20460      * ASCII printables are in it */
20461     invlist_iterinit(invlist);
20462     while (invlist_iternext(invlist, &start, &end)) {
20463
20464         /* If the range starts beyond the final printable, it doesn't have any
20465          * in it */
20466         if (start > MAX_PRINT_A) {
20467             break;
20468         }
20469
20470         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
20471          * all but two, the range must start and end no later than 2 from
20472          * either end */
20473         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
20474             if (end > MAX_PRINT_A) {
20475                 end = MAX_PRINT_A;
20476             }
20477             if (start < ' ') {
20478                 start = ' ';
20479             }
20480             if (end - start >= MAX_PRINT_A - ' ' - 2) {
20481                 allow_literals = FALSE;
20482             }
20483             break;
20484         }
20485     }
20486     invlist_iterfinish(invlist);
20487
20488     /* Here we have figured things out.  Output each range */
20489     invlist_iterinit(invlist);
20490     while (invlist_iternext(invlist, &start, &end)) {
20491         if (start >= NUM_ANYOF_CODE_POINTS) {
20492             break;
20493         }
20494         put_range(sv, start, end, allow_literals);
20495     }
20496     invlist_iterfinish(invlist);
20497
20498     return;
20499 }
20500
20501 STATIC SV*
20502 S_put_charclass_bitmap_innards_common(pTHX_
20503         SV* invlist,            /* The bitmap */
20504         SV* posixes,            /* Under /l, things like [:word:], \S */
20505         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
20506         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
20507         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
20508         const bool invert       /* Is the result to be inverted? */
20509 )
20510 {
20511     /* Create and return an SV containing a displayable version of the bitmap
20512      * and associated information determined by the input parameters.  If the
20513      * output would have been only the inversion indicator '^', NULL is instead
20514      * returned. */
20515
20516     SV * output;
20517
20518     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
20519
20520     if (invert) {
20521         output = newSVpvs("^");
20522     }
20523     else {
20524         output = newSVpvs("");
20525     }
20526
20527     /* First, the code points in the bitmap that are unconditionally there */
20528     put_charclass_bitmap_innards_invlist(output, invlist);
20529
20530     /* Traditionally, these have been placed after the main code points */
20531     if (posixes) {
20532         sv_catsv(output, posixes);
20533     }
20534
20535     if (only_utf8 && _invlist_len(only_utf8)) {
20536         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
20537         put_charclass_bitmap_innards_invlist(output, only_utf8);
20538     }
20539
20540     if (not_utf8 && _invlist_len(not_utf8)) {
20541         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
20542         put_charclass_bitmap_innards_invlist(output, not_utf8);
20543     }
20544
20545     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
20546         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
20547         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
20548
20549         /* This is the only list in this routine that can legally contain code
20550          * points outside the bitmap range.  The call just above to
20551          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
20552          * output them here.  There's about a half-dozen possible, and none in
20553          * contiguous ranges longer than 2 */
20554         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
20555             UV start, end;
20556             SV* above_bitmap = NULL;
20557
20558             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
20559
20560             invlist_iterinit(above_bitmap);
20561             while (invlist_iternext(above_bitmap, &start, &end)) {
20562                 UV i;
20563
20564                 for (i = start; i <= end; i++) {
20565                     put_code_point(output, i);
20566                 }
20567             }
20568             invlist_iterfinish(above_bitmap);
20569             SvREFCNT_dec_NN(above_bitmap);
20570         }
20571     }
20572
20573     if (invert && SvCUR(output) == 1) {
20574         return NULL;
20575     }
20576
20577     return output;
20578 }
20579
20580 STATIC bool
20581 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
20582                                      char *bitmap,
20583                                      SV *nonbitmap_invlist,
20584                                      SV *only_utf8_locale_invlist,
20585                                      const regnode * const node,
20586                                      const bool force_as_is_display)
20587 {
20588     /* Appends to 'sv' a displayable version of the innards of the bracketed
20589      * character class defined by the other arguments:
20590      *  'bitmap' points to the bitmap.
20591      *  'nonbitmap_invlist' is an inversion list of the code points that are in
20592      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
20593      *      none.  The reasons for this could be that they require some
20594      *      condition such as the target string being or not being in UTF-8
20595      *      (under /d), or because they came from a user-defined property that
20596      *      was not resolved at the time of the regex compilation (under /u)
20597      *  'only_utf8_locale_invlist' is an inversion list of the code points that
20598      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
20599      *  'node' is the regex pattern node.  It is needed only when the above two
20600      *      parameters are not null, and is passed so that this routine can
20601      *      tease apart the various reasons for them.
20602      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
20603      *      to invert things to see if that leads to a cleaner display.  If
20604      *      FALSE, this routine is free to use its judgment about doing this.
20605      *
20606      * It returns TRUE if there was actually something output.  (It may be that
20607      * the bitmap, etc is empty.)
20608      *
20609      * When called for outputting the bitmap of a non-ANYOF node, just pass the
20610      * bitmap, with the succeeding parameters set to NULL, and the final one to
20611      * FALSE.
20612      */
20613
20614     /* In general, it tries to display the 'cleanest' representation of the
20615      * innards, choosing whether to display them inverted or not, regardless of
20616      * whether the class itself is to be inverted.  However,  there are some
20617      * cases where it can't try inverting, as what actually matches isn't known
20618      * until runtime, and hence the inversion isn't either. */
20619     bool inverting_allowed = ! force_as_is_display;
20620
20621     int i;
20622     STRLEN orig_sv_cur = SvCUR(sv);
20623
20624     SV* invlist;            /* Inversion list we accumulate of code points that
20625                                are unconditionally matched */
20626     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
20627                                UTF-8 */
20628     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
20629                              */
20630     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
20631     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
20632                                        is UTF-8 */
20633
20634     SV* as_is_display;      /* The output string when we take the inputs
20635                                literally */
20636     SV* inverted_display;   /* The output string when we invert the inputs */
20637
20638     U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
20639
20640     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
20641                                                    to match? */
20642     /* We are biased in favor of displaying things without them being inverted,
20643      * as that is generally easier to understand */
20644     const int bias = 5;
20645
20646     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
20647
20648     /* Start off with whatever code points are passed in.  (We clone, so we
20649      * don't change the caller's list) */
20650     if (nonbitmap_invlist) {
20651         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
20652         invlist = invlist_clone(nonbitmap_invlist);
20653     }
20654     else {  /* Worst case size is every other code point is matched */
20655         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
20656     }
20657
20658     if (flags) {
20659         if (OP(node) == ANYOFD) {
20660
20661             /* This flag indicates that the code points below 0x100 in the
20662              * nonbitmap list are precisely the ones that match only when the
20663              * target is UTF-8 (they should all be non-ASCII). */
20664             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
20665             {
20666                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
20667                 _invlist_subtract(invlist, only_utf8, &invlist);
20668             }
20669
20670             /* And this flag for matching all non-ASCII 0xFF and below */
20671             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
20672             {
20673                 not_utf8 = invlist_clone(PL_UpperLatin1);
20674             }
20675         }
20676         else if (OP(node) == ANYOFL) {
20677
20678             /* If either of these flags are set, what matches isn't
20679              * determinable except during execution, so don't know enough here
20680              * to invert */
20681             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
20682                 inverting_allowed = FALSE;
20683             }
20684
20685             /* What the posix classes match also varies at runtime, so these
20686              * will be output symbolically. */
20687             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
20688                 int i;
20689
20690                 posixes = newSVpvs("");
20691                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
20692                     if (ANYOF_POSIXL_TEST(node,i)) {
20693                         sv_catpv(posixes, anyofs[i]);
20694                     }
20695                 }
20696             }
20697         }
20698     }
20699
20700     /* Accumulate the bit map into the unconditional match list */
20701     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
20702         if (BITMAP_TEST(bitmap, i)) {
20703             int start = i++;
20704             for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
20705                 /* empty */
20706             }
20707             invlist = _add_range_to_invlist(invlist, start, i-1);
20708         }
20709     }
20710
20711     /* Make sure that the conditional match lists don't have anything in them
20712      * that match unconditionally; otherwise the output is quite confusing.
20713      * This could happen if the code that populates these misses some
20714      * duplication. */
20715     if (only_utf8) {
20716         _invlist_subtract(only_utf8, invlist, &only_utf8);
20717     }
20718     if (not_utf8) {
20719         _invlist_subtract(not_utf8, invlist, &not_utf8);
20720     }
20721
20722     if (only_utf8_locale_invlist) {
20723
20724         /* Since this list is passed in, we have to make a copy before
20725          * modifying it */
20726         only_utf8_locale = invlist_clone(only_utf8_locale_invlist);
20727
20728         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
20729
20730         /* And, it can get really weird for us to try outputting an inverted
20731          * form of this list when it has things above the bitmap, so don't even
20732          * try */
20733         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
20734             inverting_allowed = FALSE;
20735         }
20736     }
20737
20738     /* Calculate what the output would be if we take the input as-is */
20739     as_is_display = put_charclass_bitmap_innards_common(invlist,
20740                                                     posixes,
20741                                                     only_utf8,
20742                                                     not_utf8,
20743                                                     only_utf8_locale,
20744                                                     invert);
20745
20746     /* If have to take the output as-is, just do that */
20747     if (! inverting_allowed) {
20748         if (as_is_display) {
20749             sv_catsv(sv, as_is_display);
20750             SvREFCNT_dec_NN(as_is_display);
20751         }
20752     }
20753     else { /* But otherwise, create the output again on the inverted input, and
20754               use whichever version is shorter */
20755
20756         int inverted_bias, as_is_bias;
20757
20758         /* We will apply our bias to whichever of the the results doesn't have
20759          * the '^' */
20760         if (invert) {
20761             invert = FALSE;
20762             as_is_bias = bias;
20763             inverted_bias = 0;
20764         }
20765         else {
20766             invert = TRUE;
20767             as_is_bias = 0;
20768             inverted_bias = bias;
20769         }
20770
20771         /* Now invert each of the lists that contribute to the output,
20772          * excluding from the result things outside the possible range */
20773
20774         /* For the unconditional inversion list, we have to add in all the
20775          * conditional code points, so that when inverted, they will be gone
20776          * from it */
20777         _invlist_union(only_utf8, invlist, &invlist);
20778         _invlist_union(not_utf8, invlist, &invlist);
20779         _invlist_union(only_utf8_locale, invlist, &invlist);
20780         _invlist_invert(invlist);
20781         _invlist_intersection(invlist, PL_InBitmap, &invlist);
20782
20783         if (only_utf8) {
20784             _invlist_invert(only_utf8);
20785             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
20786         }
20787         else if (not_utf8) {
20788
20789             /* If a code point matches iff the target string is not in UTF-8,
20790              * then complementing the result has it not match iff not in UTF-8,
20791              * which is the same thing as matching iff it is UTF-8. */
20792             only_utf8 = not_utf8;
20793             not_utf8 = NULL;
20794         }
20795
20796         if (only_utf8_locale) {
20797             _invlist_invert(only_utf8_locale);
20798             _invlist_intersection(only_utf8_locale,
20799                                   PL_InBitmap,
20800                                   &only_utf8_locale);
20801         }
20802
20803         inverted_display = put_charclass_bitmap_innards_common(
20804                                             invlist,
20805                                             posixes,
20806                                             only_utf8,
20807                                             not_utf8,
20808                                             only_utf8_locale, invert);
20809
20810         /* Use the shortest representation, taking into account our bias
20811          * against showing it inverted */
20812         if (   inverted_display
20813             && (   ! as_is_display
20814                 || (  SvCUR(inverted_display) + inverted_bias
20815                     < SvCUR(as_is_display)    + as_is_bias)))
20816         {
20817             sv_catsv(sv, inverted_display);
20818         }
20819         else if (as_is_display) {
20820             sv_catsv(sv, as_is_display);
20821         }
20822
20823         SvREFCNT_dec(as_is_display);
20824         SvREFCNT_dec(inverted_display);
20825     }
20826
20827     SvREFCNT_dec_NN(invlist);
20828     SvREFCNT_dec(only_utf8);
20829     SvREFCNT_dec(not_utf8);
20830     SvREFCNT_dec(posixes);
20831     SvREFCNT_dec(only_utf8_locale);
20832
20833     return SvCUR(sv) > orig_sv_cur;
20834 }
20835
20836 #define CLEAR_OPTSTART                                                       \
20837     if (optstart) STMT_START {                                               \
20838         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
20839                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
20840         optstart=NULL;                                                       \
20841     } STMT_END
20842
20843 #define DUMPUNTIL(b,e)                                                       \
20844                     CLEAR_OPTSTART;                                          \
20845                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
20846
20847 STATIC const regnode *
20848 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
20849             const regnode *last, const regnode *plast,
20850             SV* sv, I32 indent, U32 depth)
20851 {
20852     U8 op = PSEUDO;     /* Arbitrary non-END op. */
20853     const regnode *next;
20854     const regnode *optstart= NULL;
20855
20856     RXi_GET_DECL(r,ri);
20857     GET_RE_DEBUG_FLAGS_DECL;
20858
20859     PERL_ARGS_ASSERT_DUMPUNTIL;
20860
20861 #ifdef DEBUG_DUMPUNTIL
20862     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n",indent,node-start,
20863         last ? last-start : 0,plast ? plast-start : 0);
20864 #endif
20865
20866     if (plast && plast < last)
20867         last= plast;
20868
20869     while (PL_regkind[op] != END && (!last || node < last)) {
20870         assert(node);
20871         /* While that wasn't END last time... */
20872         NODE_ALIGN(node);
20873         op = OP(node);
20874         if (op == CLOSE || op == SRCLOSE || op == WHILEM)
20875             indent--;
20876         next = regnext((regnode *)node);
20877
20878         /* Where, what. */
20879         if (OP(node) == OPTIMIZED) {
20880             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
20881                 optstart = node;
20882             else
20883                 goto after_print;
20884         } else
20885             CLEAR_OPTSTART;
20886
20887         regprop(r, sv, node, NULL, NULL);
20888         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
20889                       (int)(2*indent + 1), "", SvPVX_const(sv));
20890
20891         if (OP(node) != OPTIMIZED) {
20892             if (next == NULL)           /* Next ptr. */
20893                 Perl_re_printf( aTHX_  " (0)");
20894             else if (PL_regkind[(U8)op] == BRANCH
20895                      && PL_regkind[OP(next)] != BRANCH )
20896                 Perl_re_printf( aTHX_  " (FAIL)");
20897             else
20898                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
20899             Perl_re_printf( aTHX_ "\n");
20900         }
20901
20902       after_print:
20903         if (PL_regkind[(U8)op] == BRANCHJ) {
20904             assert(next);
20905             {
20906                 const regnode *nnode = (OP(next) == LONGJMP
20907                                        ? regnext((regnode *)next)
20908                                        : next);
20909                 if (last && nnode > last)
20910                     nnode = last;
20911                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
20912             }
20913         }
20914         else if (PL_regkind[(U8)op] == BRANCH) {
20915             assert(next);
20916             DUMPUNTIL(NEXTOPER(node), next);
20917         }
20918         else if ( PL_regkind[(U8)op]  == TRIE ) {
20919             const regnode *this_trie = node;
20920             const char op = OP(node);
20921             const U32 n = ARG(node);
20922             const reg_ac_data * const ac = op>=AHOCORASICK ?
20923                (reg_ac_data *)ri->data->data[n] :
20924                NULL;
20925             const reg_trie_data * const trie =
20926                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
20927 #ifdef DEBUGGING
20928             AV *const trie_words
20929                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
20930 #endif
20931             const regnode *nextbranch= NULL;
20932             I32 word_idx;
20933             SvPVCLEAR(sv);
20934             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
20935                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
20936
20937                 Perl_re_indentf( aTHX_  "%s ",
20938                     indent+3,
20939                     elem_ptr
20940                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
20941                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
20942                                 PL_colors[0], PL_colors[1],
20943                                 (SvUTF8(*elem_ptr)
20944                                  ? PERL_PV_ESCAPE_UNI
20945                                  : 0)
20946                                 | PERL_PV_PRETTY_ELLIPSES
20947                                 | PERL_PV_PRETTY_LTGT
20948                             )
20949                     : "???"
20950                 );
20951                 if (trie->jump) {
20952                     U16 dist= trie->jump[word_idx+1];
20953                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
20954                                (UV)((dist ? this_trie + dist : next) - start));
20955                     if (dist) {
20956                         if (!nextbranch)
20957                             nextbranch= this_trie + trie->jump[0];
20958                         DUMPUNTIL(this_trie + dist, nextbranch);
20959                     }
20960                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
20961                         nextbranch= regnext((regnode *)nextbranch);
20962                 } else {
20963                     Perl_re_printf( aTHX_  "\n");
20964                 }
20965             }
20966             if (last && next > last)
20967                 node= last;
20968             else
20969                 node= next;
20970         }
20971         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
20972             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
20973                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
20974         }
20975         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
20976             assert(next);
20977             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
20978         }
20979         else if ( op == PLUS || op == STAR) {
20980             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
20981         }
20982         else if (PL_regkind[(U8)op] == ANYOF) {
20983             /* arglen 1 + class block */
20984             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
20985                           ? ANYOF_POSIXL_SKIP
20986                           : ANYOF_SKIP);
20987             node = NEXTOPER(node);
20988         }
20989         else if (PL_regkind[(U8)op] == EXACT) {
20990             /* Literal string, where present. */
20991             node += NODE_SZ_STR(node) - 1;
20992             node = NEXTOPER(node);
20993         }
20994         else {
20995             node = NEXTOPER(node);
20996             node += regarglen[(U8)op];
20997         }
20998         if (op == CURLYX || op == OPEN || op == SROPEN)
20999             indent++;
21000     }
21001     CLEAR_OPTSTART;
21002 #ifdef DEBUG_DUMPUNTIL
21003     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
21004 #endif
21005     return node;
21006 }
21007
21008 #endif  /* DEBUGGING */
21009
21010 /*
21011  * ex: set ts=8 sts=4 sw=4 et:
21012  */