regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_inline.h"
  90 #include "invlist_inline.h"
  91 #include "unicode_constants.h"
  92
  93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  94  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  96  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99
 100 #ifndef STATIC
 101 #define STATIC  static
 102 #endif
 103
 104 #ifndef MIN
 105 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 106 #endif
 107
 108 #ifndef MAX
 109 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 110 #endif
 111
 112 /* this is a chain of data about sub patterns we are processing that
 113    need to be handled separately/specially in study_chunk. Its so
 114    we can simulate recursion without losing state.  */
 115 struct scan_frame;
 116 typedef struct scan_frame {
 117     regnode *last_regnode;      /* last node to process in this frame */
 118     regnode *next_regnode;      /* next node to process when last is reached */
 119     U32 prev_recursed_depth;
 120     I32 stopparen;              /* what stopparen do we use */
 121     U32 is_top_frame;           /* what flags do we use? */
 122
 123     struct scan_frame *this_prev_frame; /* this previous frame */
 124     struct scan_frame *prev_frame;      /* previous frame */
 125     struct scan_frame *next_frame;      /* next frame */
 126 } scan_frame;
 127
 128 /* Certain characters are output as a sequence with the first being a
 129  * backslash. */
 130 #define isBACKSLASHED_PUNCT(c)                                              \
 131                     ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
 132
 133
 134 struct RExC_state_t {
 135     U32         flags;                  /* RXf_* are we folding, multilining? */
 136     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 137     char        *precomp;               /* uncompiled string. */
 138     char        *precomp_end;           /* pointer to end of uncompiled string. */
 139     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 140     regexp      *rx;                    /* perl core regexp structure */
 141     regexp_internal     *rxi;           /* internal data for regexp object
 142                                            pprivate field */
 143     char        *start;                 /* Start of input for compile */
 144     char        *end;                   /* End of input for compile */
 145     char        *parse;                 /* Input-scan pointer. */
 146     char        *adjusted_start;        /* 'start', adjusted.  See code use */
 147     STRLEN      precomp_adj;            /* an offset beyond precomp.  See code use */
 148     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 149     regnode     *emit_start;            /* Start of emitted-code area */
 150     regnode     *emit_bound;            /* First regnode outside of the
 151                                            allocated space */
 152     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 153                                            implies compiling, so don't emit */
 154     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 155                                            large enough for the largest
 156                                            non-EXACTish node, so can use it as
 157                                            scratch in pass1 */
 158     I32         naughty;                /* How bad is this pattern? */
 159     I32         sawback;                /* Did we see \1, ...? */
 160     U32         seen;
 161     SSize_t     size;                   /* Code size. */
 162     I32                npar;            /* Capture buffer count, (OPEN) plus
 163                                            one. ("par" 0 is the whole
 164                                            pattern)*/
 165     I32         nestroot;               /* root parens we are in - used by
 166                                            accept */
 167     I32         extralen;
 168     I32         seen_zerolen;
 169     regnode     **open_parens;          /* pointers to open parens */
 170     regnode     **close_parens;         /* pointers to close parens */
 171     regnode     *end_op;                /* END node in program */
 172     I32         utf8;           /* whether the pattern is utf8 or not */
 173     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 174                                 /* XXX use this for future optimisation of case
 175                                  * where pattern must be upgraded to utf8. */
 176     I32         uni_semantics;  /* If a d charset modifier should use unicode
 177                                    rules, even if the pattern is not in
 178                                    utf8 */
 179     HV          *paren_names;           /* Paren names */
 180
 181     regnode     **recurse;              /* Recurse regops */
 182     I32                recurse_count;                /* Number of recurse regops we have generated */
 183     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 184                                            through */
 185     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 186     I32         in_lookbehind;
 187     I32         contains_locale;
 188     I32         contains_i;
 189     I32         override_recoding;
 190 #ifdef EBCDIC
 191     I32         recode_x_to_native;
 192 #endif
 193     I32         in_multi_char_class;
 194     struct reg_code_block *code_blocks; /* positions of literal (?{})
 195                                             within pattern */
 196     int         num_code_blocks;        /* size of code_blocks[] */
 197     int         code_index;             /* next code_blocks[] slot */
 198     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 199     scan_frame *frame_head;
 200     scan_frame *frame_last;
 201     U32         frame_count;
 202     AV         *warn_text;
 203 #ifdef ADD_TO_REGEXEC
 204     char        *starttry;              /* -Dr: where regtry was called. */
 205 #define RExC_starttry   (pRExC_state->starttry)
 206 #endif
 207     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 208 #ifdef DEBUGGING
 209     const char  *lastparse;
 210     I32         lastnum;
 211     AV          *paren_name_list;       /* idx -> name */
 212     U32         study_chunk_recursed_count;
 213     SV          *mysv1;
 214     SV          *mysv2;
 215 #define RExC_lastparse  (pRExC_state->lastparse)
 216 #define RExC_lastnum    (pRExC_state->lastnum)
 217 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 218 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 219 #define RExC_mysv       (pRExC_state->mysv1)
 220 #define RExC_mysv1      (pRExC_state->mysv1)
 221 #define RExC_mysv2      (pRExC_state->mysv2)
 222
 223 #endif
 224     bool        seen_unfolded_sharp_s;
 225     bool        strict;
 226     bool        study_started;
 227 };
 228
 229 #define RExC_flags      (pRExC_state->flags)
 230 #define RExC_pm_flags   (pRExC_state->pm_flags)
 231 #define RExC_precomp    (pRExC_state->precomp)
 232 #define RExC_precomp_adj (pRExC_state->precomp_adj)
 233 #define RExC_adjusted_start  (pRExC_state->adjusted_start)
 234 #define RExC_precomp_end (pRExC_state->precomp_end)
 235 #define RExC_rx_sv      (pRExC_state->rx_sv)
 236 #define RExC_rx         (pRExC_state->rx)
 237 #define RExC_rxi        (pRExC_state->rxi)
 238 #define RExC_start      (pRExC_state->start)
 239 #define RExC_end        (pRExC_state->end)
 240 #define RExC_parse      (pRExC_state->parse)
 241 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 242
 243 /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
 244  * EXACTF node, hence was parsed under /di rules.  If later in the parse,
 245  * something forces the pattern into using /ui rules, the sharp s should be
 246  * folded into the sequence 'ss', which takes up more space than previously
 247  * calculated.  This means that the sizing pass needs to be restarted.  (The
 248  * node also becomes an EXACTFU_SS.)  For all other characters, an EXACTF node
 249  * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
 250  * so there is no need to resize [perl #125990]. */
 251 #define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
 252
 253 #ifdef RE_TRACK_PATTERN_OFFSETS
 254 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 255                                                          others */
 256 #endif
 257 #define RExC_emit       (pRExC_state->emit)
 258 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 259 #define RExC_emit_start (pRExC_state->emit_start)
 260 #define RExC_emit_bound (pRExC_state->emit_bound)
 261 #define RExC_sawback    (pRExC_state->sawback)
 262 #define RExC_seen       (pRExC_state->seen)
 263 #define RExC_size       (pRExC_state->size)
 264 #define RExC_maxlen        (pRExC_state->maxlen)
 265 #define RExC_npar       (pRExC_state->npar)
 266 #define RExC_nestroot   (pRExC_state->nestroot)
 267 #define RExC_extralen   (pRExC_state->extralen)
 268 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 269 #define RExC_utf8       (pRExC_state->utf8)
 270 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 271 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 272 #define RExC_open_parens        (pRExC_state->open_parens)
 273 #define RExC_close_parens       (pRExC_state->close_parens)
 274 #define RExC_end_op     (pRExC_state->end_op)
 275 #define RExC_paren_names        (pRExC_state->paren_names)
 276 #define RExC_recurse    (pRExC_state->recurse)
 277 #define RExC_recurse_count      (pRExC_state->recurse_count)
 278 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 279 #define RExC_study_chunk_recursed_bytes  \
 280                                    (pRExC_state->study_chunk_recursed_bytes)
 281 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 282 #define RExC_contains_locale    (pRExC_state->contains_locale)
 283 #define RExC_contains_i (pRExC_state->contains_i)
 284 #define RExC_override_recoding (pRExC_state->override_recoding)
 285 #ifdef EBCDIC
 286 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 287 #endif
 288 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 289 #define RExC_frame_head (pRExC_state->frame_head)
 290 #define RExC_frame_last (pRExC_state->frame_last)
 291 #define RExC_frame_count (pRExC_state->frame_count)
 292 #define RExC_strict (pRExC_state->strict)
 293 #define RExC_study_started      (pRExC_state->study_started)
 294 #define RExC_warn_text (pRExC_state->warn_text)
 295
 296 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 297  * a flag to disable back-off on the fixed/floating substrings - if it's
 298  * a high complexity pattern we assume the benefit of avoiding a full match
 299  * is worth the cost of checking for the substrings even if they rarely help.
 300  */
 301 #define RExC_naughty    (pRExC_state->naughty)
 302 #define TOO_NAUGHTY (10)
 303 #define MARK_NAUGHTY(add) \
 304     if (RExC_naughty < TOO_NAUGHTY) \
 305         RExC_naughty += (add)
 306 #define MARK_NAUGHTY_EXP(exp, add) \
 307     if (RExC_naughty < TOO_NAUGHTY) \
 308         RExC_naughty += RExC_naughty / (exp) + (add)
 309
 310 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 311 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 312         ((*s) == '{' && regcurly(s)))
 313
 314 /*
 315  * Flags to be passed up and down.
 316  */
 317 #define WORST           0       /* Worst case. */
 318 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 319
 320 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 321  * character.  (There needs to be a case: in the switch statement in regexec.c
 322  * for any node marked SIMPLE.)  Note that this is not the same thing as
 323  * REGNODE_SIMPLE */
 324 #define SIMPLE          0x02
 325 #define SPSTART         0x04    /* Starts with * or + */
 326 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 327 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 328 #define RESTART_PASS1   0x20    /* Need to restart sizing pass */
 329 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PASS1, need to
 330                                    calcuate sizes as UTF-8 */
 331
 332 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 333
 334 /* whether trie related optimizations are enabled */
 335 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 336 #define TRIE_STUDY_OPT
 337 #define FULL_TRIE_STUDY
 338 #define TRIE_STCLASS
 339 #endif
 340
 341
 342
 343 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 344 #define PBITVAL(paren) (1 << ((paren) & 7))
 345 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 346 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 347 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 348
 349 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 350                                      if (!UTF) {                           \
 351                                          assert(PASS1);                    \
 352                                          *flagp = RESTART_PASS1|NEED_UTF8; \
 353                                          return NULL;                      \
 354                                      }                                     \
 355                              } STMT_END
 356
 357 /* Change from /d into /u rules, and restart the parse if we've already seen
 358  * something whose size would increase as a result, by setting *flagp and
 359  * returning 'restart_retval'.  RExC_uni_semantics is a flag that indicates
 360  * we've change to /u during the parse.  */
 361 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 362     STMT_START {                                                            \
 363             if (DEPENDS_SEMANTICS) {                                        \
 364                 assert(PASS1);                                              \
 365                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 366                 RExC_uni_semantics = 1;                                     \
 367                 if (RExC_seen_unfolded_sharp_s) {                           \
 368                     *flagp |= RESTART_PASS1;                                \
 369                     return restart_retval;                                  \
 370                 }                                                           \
 371             }                                                               \
 372     } STMT_END
 373
 374 /* This converts the named class defined in regcomp.h to its equivalent class
 375  * number defined in handy.h. */
 376 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 377 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 378
 379 #define _invlist_union_complement_2nd(a, b, output) \
 380                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 381 #define _invlist_intersection_complement_2nd(a, b, output) \
 382                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 383
 384 /* About scan_data_t.
 385
 386   During optimisation we recurse through the regexp program performing
 387   various inplace (keyhole style) optimisations. In addition study_chunk
 388   and scan_commit populate this data structure with information about
 389   what strings MUST appear in the pattern. We look for the longest
 390   string that must appear at a fixed location, and we look for the
 391   longest string that may appear at a floating location. So for instance
 392   in the pattern:
 393
 394     /FOO[xX]A.*B[xX]BAR/
 395
 396   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 397   strings (because they follow a .* construct). study_chunk will identify
 398   both FOO and BAR as being the longest fixed and floating strings respectively.
 399
 400   The strings can be composites, for instance
 401
 402      /(f)(o)(o)/
 403
 404   will result in a composite fixed substring 'foo'.
 405
 406   For each string some basic information is maintained:
 407
 408   - offset or min_offset
 409     This is the position the string must appear at, or not before.
 410     It also implicitly (when combined with minlenp) tells us how many
 411     characters must match before the string we are searching for.
 412     Likewise when combined with minlenp and the length of the string it
 413     tells us how many characters must appear after the string we have
 414     found.
 415
 416   - max_offset
 417     Only used for floating strings. This is the rightmost point that
 418     the string can appear at. If set to SSize_t_MAX it indicates that the
 419     string can occur infinitely far to the right.
 420
 421   - minlenp
 422     A pointer to the minimum number of characters of the pattern that the
 423     string was found inside. This is important as in the case of positive
 424     lookahead or positive lookbehind we can have multiple patterns
 425     involved. Consider
 426
 427     /(?=FOO).*F/
 428
 429     The minimum length of the pattern overall is 3, the minimum length
 430     of the lookahead part is 3, but the minimum length of the part that
 431     will actually match is 1. So 'FOO's minimum length is 3, but the
 432     minimum length for the F is 1. This is important as the minimum length
 433     is used to determine offsets in front of and behind the string being
 434     looked for.  Since strings can be composites this is the length of the
 435     pattern at the time it was committed with a scan_commit. Note that
 436     the length is calculated by study_chunk, so that the minimum lengths
 437     are not known until the full pattern has been compiled, thus the
 438     pointer to the value.
 439
 440   - lookbehind
 441
 442     In the case of lookbehind the string being searched for can be
 443     offset past the start point of the final matching string.
 444     If this value was just blithely removed from the min_offset it would
 445     invalidate some of the calculations for how many chars must match
 446     before or after (as they are derived from min_offset and minlen and
 447     the length of the string being searched for).
 448     When the final pattern is compiled and the data is moved from the
 449     scan_data_t structure into the regexp structure the information
 450     about lookbehind is factored in, with the information that would
 451     have been lost precalculated in the end_shift field for the
 452     associated string.
 453
 454   The fields pos_min and pos_delta are used to store the minimum offset
 455   and the delta to the maximum offset at the current point in the pattern.
 456
 457 */
 458
 459 typedef struct scan_data_t {
 460     /*I32 len_min;      unused */
 461     /*I32 len_delta;    unused */
 462     SSize_t pos_min;
 463     SSize_t pos_delta;
 464     SV *last_found;
 465     SSize_t last_end;       /* min value, <0 unless valid. */
 466     SSize_t last_start_min;
 467     SSize_t last_start_max;
 468     SV **longest;           /* Either &l_fixed, or &l_float. */
 469     SV *longest_fixed;      /* longest fixed string found in pattern */
 470     SSize_t offset_fixed;   /* offset where it starts */
 471     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 472     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 473     SV *longest_float;      /* longest floating string found in pattern */
 474     SSize_t offset_float_min; /* earliest point in string it can appear */
 475     SSize_t offset_float_max; /* latest point in string it can appear */
 476     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 477     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 478     I32 flags;
 479     I32 whilem_c;
 480     SSize_t *last_closep;
 481     regnode_ssc *start_class;
 482 } scan_data_t;
 483
 484 /*
 485  * Forward declarations for pregcomp()'s friends.
 486  */
 487
 488 static const scan_data_t zero_scan_data =
 489   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 490
 491 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 492 #define SF_BEFORE_SEOL          0x0001
 493 #define SF_BEFORE_MEOL          0x0002
 494 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 495 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 496
 497 #define SF_FIX_SHIFT_EOL        (+2)
 498 #define SF_FL_SHIFT_EOL         (+4)
 499
 500 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 501 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 502
 503 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 504 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 505 #define SF_IS_INF               0x0040
 506 #define SF_HAS_PAR              0x0080
 507 #define SF_IN_PAR               0x0100
 508 #define SF_HAS_EVAL             0x0200
 509
 510
 511 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 512  * longest substring in the pattern. When it is not set the optimiser keeps
 513  * track of position, but does not keep track of the actual strings seen,
 514  *
 515  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 516  * /foo/i will not.
 517  *
 518  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 519  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 520  * turned off because of the alternation (BRANCH). */
 521 #define SCF_DO_SUBSTR           0x0400
 522
 523 #define SCF_DO_STCLASS_AND      0x0800
 524 #define SCF_DO_STCLASS_OR       0x1000
 525 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 526 #define SCF_WHILEM_VISITED_POS  0x2000
 527
 528 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 529 #define SCF_SEEN_ACCEPT         0x8000
 530 #define SCF_TRIE_DOING_RESTUDY 0x10000
 531 #define SCF_IN_DEFINE          0x20000
 532
 533
 534
 535
 536 #define UTF cBOOL(RExC_utf8)
 537
 538 /* The enums for all these are ordered so things work out correctly */
 539 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 540 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 541                                                      == REGEX_DEPENDS_CHARSET)
 542 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 543 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 544                                                      >= REGEX_UNICODE_CHARSET)
 545 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 546                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 547 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 548                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 549 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 550                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 551
 552 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 553
 554 /* For programs that want to be strictly Unicode compatible by dying if any
 555  * attempt is made to match a non-Unicode code point against a Unicode
 556  * property.  */
 557 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 558
 559 #define OOB_NAMEDCLASS          -1
 560
 561 /* There is no code point that is out-of-bounds, so this is problematic.  But
 562  * its only current use is to initialize a variable that is always set before
 563  * looked at. */
 564 #define OOB_UNICODE             0xDEADBEEF
 565
 566 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 567 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 568
 569
 570 /* length of regex to show in messages that don't mark a position within */
 571 #define RegexLengthToShowInErrorMessages 127
 572
 573 /*
 574  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 575  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 576  * op/pragma/warn/regcomp.
 577  */
 578 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 579 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 580
 581 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 582                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 583
 584 /* The code in this file in places uses one level of recursion with parsing
 585  * rebased to an alternate string constructed by us in memory.  This can take
 586  * the form of something that is completely different from the input, or
 587  * something that uses the input as part of the alternate.  In the first case,
 588  * there should be no possibility of an error, as we are in complete control of
 589  * the alternate string.  But in the second case we don't control the input
 590  * portion, so there may be errors in that.  Here's an example:
 591  *      /[abc\x{DF}def]/ui
 592  * is handled specially because \x{df} folds to a sequence of more than one
 593  * character, 'ss'.  What is done is to create and parse an alternate string,
 594  * which looks like this:
 595  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 596  * where it uses the input unchanged in the middle of something it constructs,
 597  * which is a branch for the DF outside the character class, and clustering
 598  * parens around the whole thing. (It knows enough to skip the DF inside the
 599  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 600  * need to be reported.  The general situation looks like this:
 601  *
 602  *              sI                       tI               xI       eI
 603  * Input:       ----------------------------------------------------
 604  * Constructed:         ---------------------------------------------------
 605  *                      sC               tC               xC       eC     EC
 606  *
 607  * The input string sI..eI is the input pattern.  The string sC..EC is the
 608  * constructed substitute parse string.  The portions sC..tC and eC..EC are
 609  * constructed by us.  The portion tC..eC is an exact duplicate of the input
 610  * pattern tI..eI.  In the diagram, these are vertically aligned.  Suppose that
 611  * while parsing, we find an error at xC.  We want to display a message showing
 612  * the real input string.  Thus we need to find the point xI in it which
 613  * corresponds to xC.  xC >= tC, since the portion of the string sC..tC has
 614  * been constructed by us, and so shouldn't have errors.  We get:
 615  *
 616  *      xI = sI + (tI - sI) + (xC - tC)
 617  *
 618  * and, the offset into sI is:
 619  *
 620  *      (xI - sI) = (tI - sI) + (xC - tC)
 621  *
 622  * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
 623  * and we save tC as RExC_adjusted_start.
 624  *
 625  * During normal processing of the input pattern, everything points to that,
 626  * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
 627  */
 628
 629 #define tI_sI           RExC_precomp_adj
 630 #define tC              RExC_adjusted_start
 631 #define sC              RExC_precomp
 632 #define xI_offset(xC)   ((IV) (tI_sI + (xC - tC)))
 633 #define xI(xC)          (sC + xI_offset(xC))
 634 #define eC              RExC_precomp_end
 635
 636 #define REPORT_LOCATION_ARGS(xC)                                            \
 637     UTF8fARG(UTF,                                                           \
 638              (xI(xC) > eC) /* Don't run off end */                          \
 639               ? eC - sC   /* Length before the <--HERE */                   \
 640               : xI_offset(xC),                                              \
 641              sC),         /* The input pattern printed up to the <--HERE */ \
 642     UTF8fARG(UTF,                                                           \
 643              (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */    \
 644              (xI(xC) > eC) ? eC : xI(xC))     /* pattern after <--HERE */
 645
 646 /* Used to point after bad bytes for an error message, but avoid skipping
 647  * past a nul byte. */
 648 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
 649
 650 /*
 651  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 652  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 653  * "...".
 654  */
 655 #define _FAIL(code) STMT_START {                                        \
 656     const char *ellipses = "";                                          \
 657     IV len = RExC_precomp_end - RExC_precomp;                                   \
 658                                                                         \
 659     if (!SIZE_ONLY)                                                     \
 660         SAVEFREESV(RExC_rx_sv);                                         \
 661     if (len > RegexLengthToShowInErrorMessages) {                       \
 662         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 663         len = RegexLengthToShowInErrorMessages - 10;                    \
 664         ellipses = "...";                                               \
 665     }                                                                   \
 666     code;                                                               \
 667 } STMT_END
 668
 669 #define FAIL(msg) _FAIL(                            \
 670     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 671             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 672
 673 #define FAIL2(msg,arg) _FAIL(                       \
 674     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 675             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 676
 677 /*
 678  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 679  */
 680 #define Simple_vFAIL(m) STMT_START {                                    \
 681     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 682             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 683 } STMT_END
 684
 685 /*
 686  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 687  */
 688 #define vFAIL(m) STMT_START {                           \
 689     if (!SIZE_ONLY)                                     \
 690         SAVEFREESV(RExC_rx_sv);                         \
 691     Simple_vFAIL(m);                                    \
 692 } STMT_END
 693
 694 /*
 695  * Like Simple_vFAIL(), but accepts two arguments.
 696  */
 697 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 698     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,              \
 699                       REPORT_LOCATION_ARGS(RExC_parse));        \
 700 } STMT_END
 701
 702 /*
 703  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 704  */
 705 #define vFAIL2(m,a1) STMT_START {                       \
 706     if (!SIZE_ONLY)                                     \
 707         SAVEFREESV(RExC_rx_sv);                         \
 708     Simple_vFAIL2(m, a1);                               \
 709 } STMT_END
 710
 711
 712 /*
 713  * Like Simple_vFAIL(), but accepts three arguments.
 714  */
 715 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 716     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 717             REPORT_LOCATION_ARGS(RExC_parse));                  \
 718 } STMT_END
 719
 720 /*
 721  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 722  */
 723 #define vFAIL3(m,a1,a2) STMT_START {                    \
 724     if (!SIZE_ONLY)                                     \
 725         SAVEFREESV(RExC_rx_sv);                         \
 726     Simple_vFAIL3(m, a1, a2);                           \
 727 } STMT_END
 728
 729 /*
 730  * Like Simple_vFAIL(), but accepts four arguments.
 731  */
 732 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 733     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,      \
 734             REPORT_LOCATION_ARGS(RExC_parse));                  \
 735 } STMT_END
 736
 737 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 738     if (!SIZE_ONLY)                                     \
 739         SAVEFREESV(RExC_rx_sv);                         \
 740     Simple_vFAIL4(m, a1, a2, a3);                       \
 741 } STMT_END
 742
 743 /* A specialized version of vFAIL2 that works with UTF8f */
 744 #define vFAIL2utf8f(m, a1) STMT_START {             \
 745     if (!SIZE_ONLY)                                 \
 746         SAVEFREESV(RExC_rx_sv);                     \
 747     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
 748             REPORT_LOCATION_ARGS(RExC_parse));      \
 749 } STMT_END
 750
 751 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 752     if (!SIZE_ONLY)                                     \
 753         SAVEFREESV(RExC_rx_sv);                         \
 754     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 755             REPORT_LOCATION_ARGS(RExC_parse));          \
 756 } STMT_END
 757
 758 /* These have asserts in them because of [perl #122671] Many warnings in
 759  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 760  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 761  * would get output again.  So they should be output in pass2, and these
 762  * asserts make sure new warnings follow that paradigm. */
 763
 764 /* m is not necessarily a "literal string", in this macro */
 765 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 766     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 767                                        "%s" REPORT_LOCATION,            \
 768                                   m, REPORT_LOCATION_ARGS(loc));        \
 769 } STMT_END
 770
 771 #define ckWARNreg(loc,m) STMT_START {                                   \
 772     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),        \
 773                                           m REPORT_LOCATION,            \
 774                                           REPORT_LOCATION_ARGS(loc));   \
 775 } STMT_END
 776
 777 #define vWARN(loc, m) STMT_START {                                      \
 778     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 779                                        m REPORT_LOCATION,               \
 780                                        REPORT_LOCATION_ARGS(loc));      \
 781 } STMT_END
 782
 783 #define vWARN_dep(loc, m) STMT_START {                                  \
 784     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),       \
 785                                        m REPORT_LOCATION,               \
 786                                        REPORT_LOCATION_ARGS(loc));      \
 787 } STMT_END
 788
 789 #define ckWARNdep(loc,m) STMT_START {                                   \
 790     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),  \
 791                                             m REPORT_LOCATION,          \
 792                                             REPORT_LOCATION_ARGS(loc)); \
 793 } STMT_END
 794
 795 #define ckWARNregdep(loc,m) STMT_START {                                    \
 796     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,      \
 797                                                       WARN_REGEXP),         \
 798                                              m REPORT_LOCATION,             \
 799                                              REPORT_LOCATION_ARGS(loc));    \
 800 } STMT_END
 801
 802 #define ckWARN2reg_d(loc,m, a1) STMT_START {                                \
 803     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),          \
 804                                             m REPORT_LOCATION,              \
 805                                             a1, REPORT_LOCATION_ARGS(loc)); \
 806 } STMT_END
 807
 808 #define ckWARN2reg(loc, m, a1) STMT_START {                                 \
 809     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
 810                                           m REPORT_LOCATION,                \
 811                                           a1, REPORT_LOCATION_ARGS(loc));   \
 812 } STMT_END
 813
 814 #define vWARN3(loc, m, a1, a2) STMT_START {                                 \
 815     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),               \
 816                                        m REPORT_LOCATION,                   \
 817                                        a1, a2, REPORT_LOCATION_ARGS(loc));  \
 818 } STMT_END
 819
 820 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                             \
 821     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
 822                                           m REPORT_LOCATION,                \
 823                                           a1, a2,                           \
 824                                           REPORT_LOCATION_ARGS(loc));       \
 825 } STMT_END
 826
 827 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 828     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 829                                        m REPORT_LOCATION,               \
 830                                        a1, a2, a3,                      \
 831                                        REPORT_LOCATION_ARGS(loc));      \
 832 } STMT_END
 833
 834 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 835     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),        \
 836                                           m REPORT_LOCATION,            \
 837                                           a1, a2, a3,                   \
 838                                           REPORT_LOCATION_ARGS(loc));   \
 839 } STMT_END
 840
 841 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 842     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 843                                        m REPORT_LOCATION,               \
 844                                        a1, a2, a3, a4,                  \
 845                                        REPORT_LOCATION_ARGS(loc));      \
 846 } STMT_END
 847
 848 /* Macros for recording node offsets.   20001227 mjd@plover.com
 849  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 850  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 851  * Element 0 holds the number n.
 852  * Position is 1 indexed.
 853  */
 854 #ifndef RE_TRACK_PATTERN_OFFSETS
 855 #define Set_Node_Offset_To_R(node,byte)
 856 #define Set_Node_Offset(node,byte)
 857 #define Set_Cur_Node_Offset
 858 #define Set_Node_Length_To_R(node,len)
 859 #define Set_Node_Length(node,len)
 860 #define Set_Node_Cur_Length(node,start)
 861 #define Node_Offset(n)
 862 #define Node_Length(n)
 863 #define Set_Node_Offset_Length(node,offset,len)
 864 #define ProgLen(ri) ri->u.proglen
 865 #define SetProgLen(ri,x) ri->u.proglen = x
 866 #else
 867 #define ProgLen(ri) ri->u.offsets[0]
 868 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 869 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 870     if (! SIZE_ONLY) {                                                  \
 871         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 872                     __LINE__, (int)(node), (int)(byte)));               \
 873         if((node) < 0) {                                                \
 874             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 875                                          (int)(node));                  \
 876         } else {                                                        \
 877             RExC_offsets[2*(node)-1] = (byte);                          \
 878         }                                                               \
 879     }                                                                   \
 880 } STMT_END
 881
 882 #define Set_Node_Offset(node,byte) \
 883     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 884 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 885
 886 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 887     if (! SIZE_ONLY) {                                                  \
 888         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 889                 __LINE__, (int)(node), (int)(len)));                    \
 890         if((node) < 0) {                                                \
 891             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 892                                          (int)(node));                  \
 893         } else {                                                        \
 894             RExC_offsets[2*(node)] = (len);                             \
 895         }                                                               \
 896     }                                                                   \
 897 } STMT_END
 898
 899 #define Set_Node_Length(node,len) \
 900     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 901 #define Set_Node_Cur_Length(node, start)                \
 902     Set_Node_Length(node, RExC_parse - start)
 903
 904 /* Get offsets and lengths */
 905 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 906 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 907
 908 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 909     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 910     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 911 } STMT_END
 912 #endif
 913
 914 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 915 #define EXPERIMENTAL_INPLACESCAN
 916 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 917
 918 #ifdef DEBUGGING
 919 int
 920 Perl_re_printf(pTHX_ const char *fmt, ...)
 921 {
 922     va_list ap;
 923     int result;
 924     PerlIO *f= Perl_debug_log;
 925     PERL_ARGS_ASSERT_RE_PRINTF;
 926     va_start(ap, fmt);
 927     result = PerlIO_vprintf(f, fmt, ap);
 928     va_end(ap);
 929     return result;
 930 }
 931
 932 int
 933 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
 934 {
 935     va_list ap;
 936     int result;
 937     PerlIO *f= Perl_debug_log;
 938     PERL_ARGS_ASSERT_RE_INDENTF;
 939     va_start(ap, depth);
 940     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
 941     result = PerlIO_vprintf(f, fmt, ap);
 942     va_end(ap);
 943     return result;
 944 }
 945 #endif /* DEBUGGING */
 946
 947 #define DEBUG_RExC_seen()                                                   \
 948         DEBUG_OPTIMISE_MORE_r({                                             \
 949             Perl_re_printf( aTHX_ "RExC_seen: ");                                       \
 950                                                                             \
 951             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 952                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                            \
 953                                                                             \
 954             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 955                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");                          \
 956                                                                             \
 957             if (RExC_seen & REG_GPOS_SEEN)                                  \
 958                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                                \
 959                                                                             \
 960             if (RExC_seen & REG_RECURSE_SEEN)                               \
 961                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                             \
 962                                                                             \
 963             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
 964                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");                  \
 965                                                                             \
 966             if (RExC_seen & REG_VERBARG_SEEN)                               \
 967                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                             \
 968                                                                             \
 969             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 970                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                            \
 971                                                                             \
 972             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 973                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");                      \
 974                                                                             \
 975             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 976                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");                      \
 977                                                                             \
 978             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
 979                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");                \
 980                                                                             \
 981             Perl_re_printf( aTHX_ "\n");                                                \
 982         });
 983
 984 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 985   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
 986
 987 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str)                    \
 988     if ( ( flags ) ) {                                                      \
 989         Perl_re_printf( aTHX_  "%s", open_str);                                         \
 990         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL);                     \
 991         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL);                     \
 992         DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF);                             \
 993         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR);                            \
 994         DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR);                             \
 995         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL);                           \
 996         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR);                         \
 997         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND);                    \
 998         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR);                     \
 999         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS);                        \
1000         DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS);                \
1001         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY);                      \
1002         DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT);                       \
1003         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY);                \
1004         DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE);                         \
1005         Perl_re_printf( aTHX_  "%s", close_str);                                        \
1006     }
1007
1008
1009 #define DEBUG_STUDYDATA(str,data,depth)                              \
1010 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
1011     Perl_re_indentf( aTHX_  "" str "Pos:%"IVdf"/%"IVdf                           \
1012         " Flags: 0x%"UVXf,                                           \
1013         depth,                                                       \
1014         (IV)((data)->pos_min),                                       \
1015         (IV)((data)->pos_delta),                                     \
1016         (UV)((data)->flags)                                          \
1017     );                                                               \
1018     DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]");                 \
1019     Perl_re_printf( aTHX_                                                        \
1020         " Whilem_c: %"IVdf" Lcp: %"IVdf" %s",                        \
1021         (IV)((data)->whilem_c),                                      \
1022         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
1023         is_inf ? "INF " : ""                                         \
1024     );                                                               \
1025     if ((data)->last_found)                                          \
1026         Perl_re_printf( aTHX_                                                    \
1027             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
1028             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
1029             SvPVX_const((data)->last_found),                         \
1030             (IV)((data)->last_end),                                  \
1031             (IV)((data)->last_start_min),                            \
1032             (IV)((data)->last_start_max),                            \
1033             ((data)->longest &&                                      \
1034              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
1035             SvPVX_const((data)->longest_fixed),                      \
1036             (IV)((data)->offset_fixed),                              \
1037             ((data)->longest &&                                      \
1038              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
1039             SvPVX_const((data)->longest_float),                      \
1040             (IV)((data)->offset_float_min),                          \
1041             (IV)((data)->offset_float_max)                           \
1042         );                                                           \
1043     Perl_re_printf( aTHX_ "\n");                                                 \
1044 });
1045
1046
1047 /* =========================================================
1048  * BEGIN edit_distance stuff.
1049  *
1050  * This calculates how many single character changes of any type are needed to
1051  * transform a string into another one.  It is taken from version 3.1 of
1052  *
1053  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1054  */
1055
1056 /* Our unsorted dictionary linked list.   */
1057 /* Note we use UVs, not chars. */
1058
1059 struct dictionary{
1060   UV key;
1061   UV value;
1062   struct dictionary* next;
1063 };
1064 typedef struct dictionary item;
1065
1066
1067 PERL_STATIC_INLINE item*
1068 push(UV key,item* curr)
1069 {
1070     item* head;
1071     Newxz(head, 1, item);
1072     head->key = key;
1073     head->value = 0;
1074     head->next = curr;
1075     return head;
1076 }
1077
1078
1079 PERL_STATIC_INLINE item*
1080 find(item* head, UV key)
1081 {
1082     item* iterator = head;
1083     while (iterator){
1084         if (iterator->key == key){
1085             return iterator;
1086         }
1087         iterator = iterator->next;
1088     }
1089
1090     return NULL;
1091 }
1092
1093 PERL_STATIC_INLINE item*
1094 uniquePush(item* head,UV key)
1095 {
1096     item* iterator = head;
1097
1098     while (iterator){
1099         if (iterator->key == key) {
1100             return head;
1101         }
1102         iterator = iterator->next;
1103     }
1104
1105     return push(key,head);
1106 }
1107
1108 PERL_STATIC_INLINE void
1109 dict_free(item* head)
1110 {
1111     item* iterator = head;
1112
1113     while (iterator) {
1114         item* temp = iterator;
1115         iterator = iterator->next;
1116         Safefree(temp);
1117     }
1118
1119     head = NULL;
1120 }
1121
1122 /* End of Dictionary Stuff */
1123
1124 /* All calculations/work are done here */
1125 STATIC int
1126 S_edit_distance(const UV* src,
1127                 const UV* tgt,
1128                 const STRLEN x,             /* length of src[] */
1129                 const STRLEN y,             /* length of tgt[] */
1130                 const SSize_t maxDistance
1131 )
1132 {
1133     item *head = NULL;
1134     UV swapCount,swapScore,targetCharCount,i,j;
1135     UV *scores;
1136     UV score_ceil = x + y;
1137
1138     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1139
1140     /* intialize matrix start values */
1141     Newxz(scores, ( (x + 2) * (y + 2)), UV);
1142     scores[0] = score_ceil;
1143     scores[1 * (y + 2) + 0] = score_ceil;
1144     scores[0 * (y + 2) + 1] = score_ceil;
1145     scores[1 * (y + 2) + 1] = 0;
1146     head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1147
1148     /* work loops    */
1149     /* i = src index */
1150     /* j = tgt index */
1151     for (i=1;i<=x;i++) {
1152         if (i < x)
1153             head = uniquePush(head,src[i]);
1154         scores[(i+1) * (y + 2) + 1] = i;
1155         scores[(i+1) * (y + 2) + 0] = score_ceil;
1156         swapCount = 0;
1157
1158         for (j=1;j<=y;j++) {
1159             if (i == 1) {
1160                 if(j < y)
1161                 head = uniquePush(head,tgt[j]);
1162                 scores[1 * (y + 2) + (j + 1)] = j;
1163                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1164             }
1165
1166             targetCharCount = find(head,tgt[j-1])->value;
1167             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1168
1169             if (src[i-1] != tgt[j-1]){
1170                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1171             }
1172             else {
1173                 swapCount = j;
1174                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1175             }
1176         }
1177
1178         find(head,src[i-1])->value = i;
1179     }
1180
1181     {
1182         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1183         dict_free(head);
1184         Safefree(scores);
1185         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1186     }
1187 }
1188
1189 /* END of edit_distance() stuff
1190  * ========================================================= */
1191
1192 /* is c a control character for which we have a mnemonic? */
1193 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1194
1195 STATIC const char *
1196 S_cntrl_to_mnemonic(const U8 c)
1197 {
1198     /* Returns the mnemonic string that represents character 'c', if one
1199      * exists; NULL otherwise.  The only ones that exist for the purposes of
1200      * this routine are a few control characters */
1201
1202     switch (c) {
1203         case '\a':       return "\\a";
1204         case '\b':       return "\\b";
1205         case ESC_NATIVE: return "\\e";
1206         case '\f':       return "\\f";
1207         case '\n':       return "\\n";
1208         case '\r':       return "\\r";
1209         case '\t':       return "\\t";
1210     }
1211
1212     return NULL;
1213 }
1214
1215 /* Mark that we cannot extend a found fixed substring at this point.
1216    Update the longest found anchored substring and the longest found
1217    floating substrings if needed. */
1218
1219 STATIC void
1220 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1221                     SSize_t *minlenp, int is_inf)
1222 {
1223     const STRLEN l = CHR_SVLEN(data->last_found);
1224     const STRLEN old_l = CHR_SVLEN(*data->longest);
1225     GET_RE_DEBUG_FLAGS_DECL;
1226
1227     PERL_ARGS_ASSERT_SCAN_COMMIT;
1228
1229     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1230         SvSetMagicSV(*data->longest, data->last_found);
1231         if (*data->longest == data->longest_fixed) {
1232             data->offset_fixed = l ? data->last_start_min : data->pos_min;
1233             if (data->flags & SF_BEFORE_EOL)
1234                 data->flags
1235                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
1236             else
1237                 data->flags &= ~SF_FIX_BEFORE_EOL;
1238             data->minlen_fixed=minlenp;
1239             data->lookbehind_fixed=0;
1240         }
1241         else { /* *data->longest == data->longest_float */
1242             data->offset_float_min = l ? data->last_start_min : data->pos_min;
1243             data->offset_float_max = (l
1244                           ? data->last_start_max
1245                           : (data->pos_delta > SSize_t_MAX - data->pos_min
1246                                          ? SSize_t_MAX
1247                                          : data->pos_min + data->pos_delta));
1248             if (is_inf
1249                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1250                 data->offset_float_max = SSize_t_MAX;
1251             if (data->flags & SF_BEFORE_EOL)
1252                 data->flags
1253                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1254             else
1255                 data->flags &= ~SF_FL_BEFORE_EOL;
1256             data->minlen_float=minlenp;
1257             data->lookbehind_float=0;
1258         }
1259     }
1260     SvCUR_set(data->last_found, 0);
1261     {
1262         SV * const sv = data->last_found;
1263         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1264             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1265             if (mg)
1266                 mg->mg_len = 0;
1267         }
1268     }
1269     data->last_end = -1;
1270     data->flags &= ~SF_BEFORE_EOL;
1271     DEBUG_STUDYDATA("commit: ",data,0);
1272 }
1273
1274 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1275  * list that describes which code points it matches */
1276
1277 STATIC void
1278 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1279 {
1280     /* Set the SSC 'ssc' to match an empty string or any code point */
1281
1282     PERL_ARGS_ASSERT_SSC_ANYTHING;
1283
1284     assert(is_ANYOF_SYNTHETIC(ssc));
1285
1286     /* mortalize so won't leak */
1287     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1288     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1289 }
1290
1291 STATIC int
1292 S_ssc_is_anything(const regnode_ssc *ssc)
1293 {
1294     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1295      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1296      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1297      * in any way, so there's no point in using it */
1298
1299     UV start, end;
1300     bool ret;
1301
1302     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1303
1304     assert(is_ANYOF_SYNTHETIC(ssc));
1305
1306     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1307         return FALSE;
1308     }
1309
1310     /* See if the list consists solely of the range 0 - Infinity */
1311     invlist_iterinit(ssc->invlist);
1312     ret = invlist_iternext(ssc->invlist, &start, &end)
1313           && start == 0
1314           && end == UV_MAX;
1315
1316     invlist_iterfinish(ssc->invlist);
1317
1318     if (ret) {
1319         return TRUE;
1320     }
1321
1322     /* If e.g., both \w and \W are set, matches everything */
1323     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1324         int i;
1325         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1326             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1327                 return TRUE;
1328             }
1329         }
1330     }
1331
1332     return FALSE;
1333 }
1334
1335 STATIC void
1336 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1337 {
1338     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1339      * string, any code point, or any posix class under locale */
1340
1341     PERL_ARGS_ASSERT_SSC_INIT;
1342
1343     Zero(ssc, 1, regnode_ssc);
1344     set_ANYOF_SYNTHETIC(ssc);
1345     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1346     ssc_anything(ssc);
1347
1348     /* If any portion of the regex is to operate under locale rules that aren't
1349      * fully known at compile time, initialization includes it.  The reason
1350      * this isn't done for all regexes is that the optimizer was written under
1351      * the assumption that locale was all-or-nothing.  Given the complexity and
1352      * lack of documentation in the optimizer, and that there are inadequate
1353      * test cases for locale, many parts of it may not work properly, it is
1354      * safest to avoid locale unless necessary. */
1355     if (RExC_contains_locale) {
1356         ANYOF_POSIXL_SETALL(ssc);
1357     }
1358     else {
1359         ANYOF_POSIXL_ZERO(ssc);
1360     }
1361 }
1362
1363 STATIC int
1364 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1365                         const regnode_ssc *ssc)
1366 {
1367     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1368      * to the list of code points matched, and locale posix classes; hence does
1369      * not check its flags) */
1370
1371     UV start, end;
1372     bool ret;
1373
1374     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1375
1376     assert(is_ANYOF_SYNTHETIC(ssc));
1377
1378     invlist_iterinit(ssc->invlist);
1379     ret = invlist_iternext(ssc->invlist, &start, &end)
1380           && start == 0
1381           && end == UV_MAX;
1382
1383     invlist_iterfinish(ssc->invlist);
1384
1385     if (! ret) {
1386         return FALSE;
1387     }
1388
1389     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1390         return FALSE;
1391     }
1392
1393     return TRUE;
1394 }
1395
1396 STATIC SV*
1397 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1398                                const regnode_charclass* const node)
1399 {
1400     /* Returns a mortal inversion list defining which code points are matched
1401      * by 'node', which is of type ANYOF.  Handles complementing the result if
1402      * appropriate.  If some code points aren't knowable at this time, the
1403      * returned list must, and will, contain every code point that is a
1404      * possibility. */
1405
1406     SV* invlist = NULL;
1407     SV* only_utf8_locale_invlist = NULL;
1408     unsigned int i;
1409     const U32 n = ARG(node);
1410     bool new_node_has_latin1 = FALSE;
1411
1412     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1413
1414     /* Look at the data structure created by S_set_ANYOF_arg() */
1415     if (n != ANYOF_ONLY_HAS_BITMAP) {
1416         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1417         AV * const av = MUTABLE_AV(SvRV(rv));
1418         SV **const ary = AvARRAY(av);
1419         assert(RExC_rxi->data->what[n] == 's');
1420
1421         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1422             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1423         }
1424         else if (ary[0] && ary[0] != &PL_sv_undef) {
1425
1426             /* Here, no compile-time swash, and there are things that won't be
1427              * known until runtime -- we have to assume it could be anything */
1428             invlist = sv_2mortal(_new_invlist(1));
1429             return _add_range_to_invlist(invlist, 0, UV_MAX);
1430         }
1431         else if (ary[3] && ary[3] != &PL_sv_undef) {
1432
1433             /* Here no compile-time swash, and no run-time only data.  Use the
1434              * node's inversion list */
1435             invlist = sv_2mortal(invlist_clone(ary[3]));
1436         }
1437
1438         /* Get the code points valid only under UTF-8 locales */
1439         if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1440             && ary[2] && ary[2] != &PL_sv_undef)
1441         {
1442             only_utf8_locale_invlist = ary[2];
1443         }
1444     }
1445
1446     if (! invlist) {
1447         invlist = sv_2mortal(_new_invlist(0));
1448     }
1449
1450     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1451      * code points, and an inversion list for the others, but if there are code
1452      * points that should match only conditionally on the target string being
1453      * UTF-8, those are placed in the inversion list, and not the bitmap.
1454      * Since there are circumstances under which they could match, they are
1455      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1456      * to exclude them here, so that when we invert below, the end result
1457      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1458      * have to do this here before we add the unconditionally matched code
1459      * points */
1460     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1461         _invlist_intersection_complement_2nd(invlist,
1462                                              PL_UpperLatin1,
1463                                              &invlist);
1464     }
1465
1466     /* Add in the points from the bit map */
1467     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1468         if (ANYOF_BITMAP_TEST(node, i)) {
1469             unsigned int start = i++;
1470
1471             for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1472                 /* empty */
1473             }
1474             invlist = _add_range_to_invlist(invlist, start, i-1);
1475             new_node_has_latin1 = TRUE;
1476         }
1477     }
1478
1479     /* If this can match all upper Latin1 code points, have to add them
1480      * as well.  But don't add them if inverting, as when that gets done below,
1481      * it would exclude all these characters, including the ones it shouldn't
1482      * that were added just above */
1483     if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
1484         && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1485     {
1486         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1487     }
1488
1489     /* Similarly for these */
1490     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1491         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1492     }
1493
1494     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1495         _invlist_invert(invlist);
1496     }
1497     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1498
1499         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1500          * locale.  We can skip this if there are no 0-255 at all. */
1501         _invlist_union(invlist, PL_Latin1, &invlist);
1502     }
1503
1504     /* Similarly add the UTF-8 locale possible matches.  These have to be
1505      * deferred until after the non-UTF-8 locale ones are taken care of just
1506      * above, or it leads to wrong results under ANYOF_INVERT */
1507     if (only_utf8_locale_invlist) {
1508         _invlist_union_maybe_complement_2nd(invlist,
1509                                             only_utf8_locale_invlist,
1510                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1511                                             &invlist);
1512     }
1513
1514     return invlist;
1515 }
1516
1517 /* These two functions currently do the exact same thing */
1518 #define ssc_init_zero           ssc_init
1519
1520 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1521 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1522
1523 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1524  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1525  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1526
1527 STATIC void
1528 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1529                 const regnode_charclass *and_with)
1530 {
1531     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1532      * another SSC or a regular ANYOF class.  Can create false positives. */
1533
1534     SV* anded_cp_list;
1535     U8  anded_flags;
1536
1537     PERL_ARGS_ASSERT_SSC_AND;
1538
1539     assert(is_ANYOF_SYNTHETIC(ssc));
1540
1541     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1542      * the code point inversion list and just the relevant flags */
1543     if (is_ANYOF_SYNTHETIC(and_with)) {
1544         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1545         anded_flags = ANYOF_FLAGS(and_with);
1546
1547         /* XXX This is a kludge around what appears to be deficiencies in the
1548          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1549          * there are paths through the optimizer where it doesn't get weeded
1550          * out when it should.  And if we don't make some extra provision for
1551          * it like the code just below, it doesn't get added when it should.
1552          * This solution is to add it only when AND'ing, which is here, and
1553          * only when what is being AND'ed is the pristine, original node
1554          * matching anything.  Thus it is like adding it to ssc_anything() but
1555          * only when the result is to be AND'ed.  Probably the same solution
1556          * could be adopted for the same problem we have with /l matching,
1557          * which is solved differently in S_ssc_init(), and that would lead to
1558          * fewer false positives than that solution has.  But if this solution
1559          * creates bugs, the consequences are only that a warning isn't raised
1560          * that should be; while the consequences for having /l bugs is
1561          * incorrect matches */
1562         if (ssc_is_anything((regnode_ssc *)and_with)) {
1563             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1564         }
1565     }
1566     else {
1567         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1568         if (OP(and_with) == ANYOFD) {
1569             anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1570         }
1571         else {
1572             anded_flags = ANYOF_FLAGS(and_with)
1573             &( ANYOF_COMMON_FLAGS
1574               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1575               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1576             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1577                 anded_flags &=
1578                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1579             }
1580         }
1581     }
1582
1583     ANYOF_FLAGS(ssc) &= anded_flags;
1584
1585     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1586      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1587      * 'and_with' may be inverted.  When not inverted, we have the situation of
1588      * computing:
1589      *  (C1 | P1) & (C2 | P2)
1590      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1591      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1592      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1593      *                    <=  ((C1 & C2) | P1 | P2)
1594      * Alternatively, the last few steps could be:
1595      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1596      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1597      *                    <=  (C1 | C2 | (P1 & P2))
1598      * We favor the second approach if either P1 or P2 is non-empty.  This is
1599      * because these components are a barrier to doing optimizations, as what
1600      * they match cannot be known until the moment of matching as they are
1601      * dependent on the current locale, 'AND"ing them likely will reduce or
1602      * eliminate them.
1603      * But we can do better if we know that C1,P1 are in their initial state (a
1604      * frequent occurrence), each matching everything:
1605      *  (<everything>) & (C2 | P2) =  C2 | P2
1606      * Similarly, if C2,P2 are in their initial state (again a frequent
1607      * occurrence), the result is a no-op
1608      *  (C1 | P1) & (<everything>) =  C1 | P1
1609      *
1610      * Inverted, we have
1611      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1612      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1613      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1614      * */
1615
1616     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1617         && ! is_ANYOF_SYNTHETIC(and_with))
1618     {
1619         unsigned int i;
1620
1621         ssc_intersection(ssc,
1622                          anded_cp_list,
1623                          FALSE /* Has already been inverted */
1624                          );
1625
1626         /* If either P1 or P2 is empty, the intersection will be also; can skip
1627          * the loop */
1628         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1629             ANYOF_POSIXL_ZERO(ssc);
1630         }
1631         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1632
1633             /* Note that the Posix class component P from 'and_with' actually
1634              * looks like:
1635              *      P = Pa | Pb | ... | Pn
1636              * where each component is one posix class, such as in [\w\s].
1637              * Thus
1638              *      ~P = ~(Pa | Pb | ... | Pn)
1639              *         = ~Pa & ~Pb & ... & ~Pn
1640              *        <= ~Pa | ~Pb | ... | ~Pn
1641              * The last is something we can easily calculate, but unfortunately
1642              * is likely to have many false positives.  We could do better
1643              * in some (but certainly not all) instances if two classes in
1644              * P have known relationships.  For example
1645              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1646              * So
1647              *      :lower: & :print: = :lower:
1648              * And similarly for classes that must be disjoint.  For example,
1649              * since \s and \w can have no elements in common based on rules in
1650              * the POSIX standard,
1651              *      \w & ^\S = nothing
1652              * Unfortunately, some vendor locales do not meet the Posix
1653              * standard, in particular almost everything by Microsoft.
1654              * The loop below just changes e.g., \w into \W and vice versa */
1655
1656             regnode_charclass_posixl temp;
1657             int add = 1;    /* To calculate the index of the complement */
1658
1659             ANYOF_POSIXL_ZERO(&temp);
1660             for (i = 0; i < ANYOF_MAX; i++) {
1661                 assert(i % 2 != 0
1662                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1663                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1664
1665                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1666                     ANYOF_POSIXL_SET(&temp, i + add);
1667                 }
1668                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1669             }
1670             ANYOF_POSIXL_AND(&temp, ssc);
1671
1672         } /* else ssc already has no posixes */
1673     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1674          in its initial state */
1675     else if (! is_ANYOF_SYNTHETIC(and_with)
1676              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1677     {
1678         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1679          * copy it over 'ssc' */
1680         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1681             if (is_ANYOF_SYNTHETIC(and_with)) {
1682                 StructCopy(and_with, ssc, regnode_ssc);
1683             }
1684             else {
1685                 ssc->invlist = anded_cp_list;
1686                 ANYOF_POSIXL_ZERO(ssc);
1687                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1688                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1689                 }
1690             }
1691         }
1692         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1693                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1694         {
1695             /* One or the other of P1, P2 is non-empty. */
1696             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1697                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1698             }
1699             ssc_union(ssc, anded_cp_list, FALSE);
1700         }
1701         else { /* P1 = P2 = empty */
1702             ssc_intersection(ssc, anded_cp_list, FALSE);
1703         }
1704     }
1705 }
1706
1707 STATIC void
1708 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1709                const regnode_charclass *or_with)
1710 {
1711     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1712      * another SSC or a regular ANYOF class.  Can create false positives if
1713      * 'or_with' is to be inverted. */
1714
1715     SV* ored_cp_list;
1716     U8 ored_flags;
1717
1718     PERL_ARGS_ASSERT_SSC_OR;
1719
1720     assert(is_ANYOF_SYNTHETIC(ssc));
1721
1722     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1723      * the code point inversion list and just the relevant flags */
1724     if (is_ANYOF_SYNTHETIC(or_with)) {
1725         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1726         ored_flags = ANYOF_FLAGS(or_with);
1727     }
1728     else {
1729         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1730         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1731         if (OP(or_with) != ANYOFD) {
1732             ored_flags
1733             |= ANYOF_FLAGS(or_with)
1734              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1735                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1736             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1737                 ored_flags |=
1738                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1739             }
1740         }
1741     }
1742
1743     ANYOF_FLAGS(ssc) |= ored_flags;
1744
1745     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1746      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1747      * 'or_with' may be inverted.  When not inverted, we have the simple
1748      * situation of computing:
1749      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1750      * If P1|P2 yields a situation with both a class and its complement are
1751      * set, like having both \w and \W, this matches all code points, and we
1752      * can delete these from the P component of the ssc going forward.  XXX We
1753      * might be able to delete all the P components, but I (khw) am not certain
1754      * about this, and it is better to be safe.
1755      *
1756      * Inverted, we have
1757      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1758      *                         <=  (C1 | P1) | ~C2
1759      *                         <=  (C1 | ~C2) | P1
1760      * (which results in actually simpler code than the non-inverted case)
1761      * */
1762
1763     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1764         && ! is_ANYOF_SYNTHETIC(or_with))
1765     {
1766         /* We ignore P2, leaving P1 going forward */
1767     }   /* else  Not inverted */
1768     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1769         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1770         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1771             unsigned int i;
1772             for (i = 0; i < ANYOF_MAX; i += 2) {
1773                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1774                 {
1775                     ssc_match_all_cp(ssc);
1776                     ANYOF_POSIXL_CLEAR(ssc, i);
1777                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1778                 }
1779             }
1780         }
1781     }
1782
1783     ssc_union(ssc,
1784               ored_cp_list,
1785               FALSE /* Already has been inverted */
1786               );
1787 }
1788
1789 PERL_STATIC_INLINE void
1790 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1791 {
1792     PERL_ARGS_ASSERT_SSC_UNION;
1793
1794     assert(is_ANYOF_SYNTHETIC(ssc));
1795
1796     _invlist_union_maybe_complement_2nd(ssc->invlist,
1797                                         invlist,
1798                                         invert2nd,
1799                                         &ssc->invlist);
1800 }
1801
1802 PERL_STATIC_INLINE void
1803 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1804                          SV* const invlist,
1805                          const bool invert2nd)
1806 {
1807     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1808
1809     assert(is_ANYOF_SYNTHETIC(ssc));
1810
1811     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1812                                                invlist,
1813                                                invert2nd,
1814                                                &ssc->invlist);
1815 }
1816
1817 PERL_STATIC_INLINE void
1818 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1819 {
1820     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1821
1822     assert(is_ANYOF_SYNTHETIC(ssc));
1823
1824     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1825 }
1826
1827 PERL_STATIC_INLINE void
1828 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1829 {
1830     /* AND just the single code point 'cp' into the SSC 'ssc' */
1831
1832     SV* cp_list = _new_invlist(2);
1833
1834     PERL_ARGS_ASSERT_SSC_CP_AND;
1835
1836     assert(is_ANYOF_SYNTHETIC(ssc));
1837
1838     cp_list = add_cp_to_invlist(cp_list, cp);
1839     ssc_intersection(ssc, cp_list,
1840                      FALSE /* Not inverted */
1841                      );
1842     SvREFCNT_dec_NN(cp_list);
1843 }
1844
1845 PERL_STATIC_INLINE void
1846 S_ssc_clear_locale(regnode_ssc *ssc)
1847 {
1848     /* Set the SSC 'ssc' to not match any locale things */
1849     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1850
1851     assert(is_ANYOF_SYNTHETIC(ssc));
1852
1853     ANYOF_POSIXL_ZERO(ssc);
1854     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1855 }
1856
1857 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1858
1859 STATIC bool
1860 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1861 {
1862     /* The synthetic start class is used to hopefully quickly winnow down
1863      * places where a pattern could start a match in the target string.  If it
1864      * doesn't really narrow things down that much, there isn't much point to
1865      * having the overhead of using it.  This function uses some very crude
1866      * heuristics to decide if to use the ssc or not.
1867      *
1868      * It returns TRUE if 'ssc' rules out more than half what it considers to
1869      * be the "likely" possible matches, but of course it doesn't know what the
1870      * actual things being matched are going to be; these are only guesses
1871      *
1872      * For /l matches, it assumes that the only likely matches are going to be
1873      *      in the 0-255 range, uniformly distributed, so half of that is 127
1874      * For /a and /d matches, it assumes that the likely matches will be just
1875      *      the ASCII range, so half of that is 63
1876      * For /u and there isn't anything matching above the Latin1 range, it
1877      *      assumes that that is the only range likely to be matched, and uses
1878      *      half that as the cut-off: 127.  If anything matches above Latin1,
1879      *      it assumes that all of Unicode could match (uniformly), except for
1880      *      non-Unicode code points and things in the General Category "Other"
1881      *      (unassigned, private use, surrogates, controls and formats).  This
1882      *      is a much large number. */
1883
1884     U32 count = 0;      /* Running total of number of code points matched by
1885                            'ssc' */
1886     UV start, end;      /* Start and end points of current range in inversion
1887                            list */
1888     const U32 max_code_points = (LOC)
1889                                 ?  256
1890                                 : ((   ! UNI_SEMANTICS
1891                                      || invlist_highest(ssc->invlist) < 256)
1892                                   ? 128
1893                                   : NON_OTHER_COUNT);
1894     const U32 max_match = max_code_points / 2;
1895
1896     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1897
1898     invlist_iterinit(ssc->invlist);
1899     while (invlist_iternext(ssc->invlist, &start, &end)) {
1900         if (start >= max_code_points) {
1901             break;
1902         }
1903         end = MIN(end, max_code_points - 1);
1904         count += end - start + 1;
1905         if (count >= max_match) {
1906             invlist_iterfinish(ssc->invlist);
1907             return FALSE;
1908         }
1909     }
1910
1911     return TRUE;
1912 }
1913
1914
1915 STATIC void
1916 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1917 {
1918     /* The inversion list in the SSC is marked mortal; now we need a more
1919      * permanent copy, which is stored the same way that is done in a regular
1920      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1921      * map */
1922
1923     SV* invlist = invlist_clone(ssc->invlist);
1924
1925     PERL_ARGS_ASSERT_SSC_FINALIZE;
1926
1927     assert(is_ANYOF_SYNTHETIC(ssc));
1928
1929     /* The code in this file assumes that all but these flags aren't relevant
1930      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1931      * by the time we reach here */
1932     assert(! (ANYOF_FLAGS(ssc)
1933         & ~( ANYOF_COMMON_FLAGS
1934             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1935             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
1936
1937     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1938
1939     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1940                                 NULL, NULL, NULL, FALSE);
1941
1942     /* Make sure is clone-safe */
1943     ssc->invlist = NULL;
1944
1945     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1946         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1947     }
1948
1949     if (RExC_contains_locale) {
1950         OP(ssc) = ANYOFL;
1951     }
1952
1953     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1954 }
1955
1956 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1957 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1958 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1959 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1960                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1961                                : 0 )
1962
1963
1964 #ifdef DEBUGGING
1965 /*
1966    dump_trie(trie,widecharmap,revcharmap)
1967    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1968    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1969
1970    These routines dump out a trie in a somewhat readable format.
1971    The _interim_ variants are used for debugging the interim
1972    tables that are used to generate the final compressed
1973    representation which is what dump_trie expects.
1974
1975    Part of the reason for their existence is to provide a form
1976    of documentation as to how the different representations function.
1977
1978 */
1979
1980 /*
1981   Dumps the final compressed table form of the trie to Perl_debug_log.
1982   Used for debugging make_trie().
1983 */
1984
1985 STATIC void
1986 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1987             AV *revcharmap, U32 depth)
1988 {
1989     U32 state;
1990     SV *sv=sv_newmortal();
1991     int colwidth= widecharmap ? 6 : 4;
1992     U16 word;
1993     GET_RE_DEBUG_FLAGS_DECL;
1994
1995     PERL_ARGS_ASSERT_DUMP_TRIE;
1996
1997     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
1998         depth+1, "Match","Base","Ofs" );
1999
2000     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2001         SV ** const tmp = av_fetch( revcharmap, state, 0);
2002         if ( tmp ) {
2003             Perl_re_printf( aTHX_  "%*s",
2004                 colwidth,
2005                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2006                             PL_colors[0], PL_colors[1],
2007                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2008                             PERL_PV_ESCAPE_FIRSTCHAR
2009                 )
2010             );
2011         }
2012     }
2013     Perl_re_printf( aTHX_  "\n");
2014     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2015
2016     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2017         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2018     Perl_re_printf( aTHX_  "\n");
2019
2020     for( state = 1 ; state < trie->statecount ; state++ ) {
2021         const U32 base = trie->states[ state ].trans.base;
2022
2023         Perl_re_indentf( aTHX_  "#%4"UVXf"|", depth+1, (UV)state);
2024
2025         if ( trie->states[ state ].wordnum ) {
2026             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2027         } else {
2028             Perl_re_printf( aTHX_  "%6s", "" );
2029         }
2030
2031         Perl_re_printf( aTHX_  " @%4"UVXf" ", (UV)base );
2032
2033         if ( base ) {
2034             U32 ofs = 0;
2035
2036             while( ( base + ofs  < trie->uniquecharcount ) ||
2037                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2038                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2039                                                                     != state))
2040                     ofs++;
2041
2042             Perl_re_printf( aTHX_  "+%2"UVXf"[ ", (UV)ofs);
2043
2044             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2045                 if ( ( base + ofs >= trie->uniquecharcount )
2046                         && ( base + ofs - trie->uniquecharcount
2047                                                         < trie->lasttrans )
2048                         && trie->trans[ base + ofs
2049                                     - trie->uniquecharcount ].check == state )
2050                 {
2051                    Perl_re_printf( aTHX_  "%*"UVXf, colwidth,
2052                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2053                    );
2054                 } else {
2055                     Perl_re_printf( aTHX_  "%*s",colwidth,"   ." );
2056                 }
2057             }
2058
2059             Perl_re_printf( aTHX_  "]");
2060
2061         }
2062         Perl_re_printf( aTHX_  "\n" );
2063     }
2064     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2065                                 depth);
2066     for (word=1; word <= trie->wordcount; word++) {
2067         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2068             (int)word, (int)(trie->wordinfo[word].prev),
2069             (int)(trie->wordinfo[word].len));
2070     }
2071     Perl_re_printf( aTHX_  "\n" );
2072 }
2073 /*
2074   Dumps a fully constructed but uncompressed trie in list form.
2075   List tries normally only are used for construction when the number of
2076   possible chars (trie->uniquecharcount) is very high.
2077   Used for debugging make_trie().
2078 */
2079 STATIC void
2080 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2081                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
2082                          U32 depth)
2083 {
2084     U32 state;
2085     SV *sv=sv_newmortal();
2086     int colwidth= widecharmap ? 6 : 4;
2087     GET_RE_DEBUG_FLAGS_DECL;
2088
2089     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2090
2091     /* print out the table precompression.  */
2092     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2093             depth+1 );
2094     Perl_re_indentf( aTHX_  "%s",
2095             depth+1, "------:-----+-----------------\n" );
2096
2097     for( state=1 ; state < next_alloc ; state ++ ) {
2098         U16 charid;
2099
2100         Perl_re_indentf( aTHX_  " %4"UVXf" :",
2101             depth+1, (UV)state  );
2102         if ( ! trie->states[ state ].wordnum ) {
2103             Perl_re_printf( aTHX_  "%5s| ","");
2104         } else {
2105             Perl_re_printf( aTHX_  "W%4x| ",
2106                 trie->states[ state ].wordnum
2107             );
2108         }
2109         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2110             SV ** const tmp = av_fetch( revcharmap,
2111                                         TRIE_LIST_ITEM(state,charid).forid, 0);
2112             if ( tmp ) {
2113                 Perl_re_printf( aTHX_  "%*s:%3X=%4"UVXf" | ",
2114                     colwidth,
2115                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2116                               colwidth,
2117                               PL_colors[0], PL_colors[1],
2118                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2119                               | PERL_PV_ESCAPE_FIRSTCHAR
2120                     ) ,
2121                     TRIE_LIST_ITEM(state,charid).forid,
2122                     (UV)TRIE_LIST_ITEM(state,charid).newstate
2123                 );
2124                 if (!(charid % 10))
2125                     Perl_re_printf( aTHX_  "\n%*s| ",
2126                         (int)((depth * 2) + 14), "");
2127             }
2128         }
2129         Perl_re_printf( aTHX_  "\n");
2130     }
2131 }
2132
2133 /*
2134   Dumps a fully constructed but uncompressed trie in table form.
2135   This is the normal DFA style state transition table, with a few
2136   twists to facilitate compression later.
2137   Used for debugging make_trie().
2138 */
2139 STATIC void
2140 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2141                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
2142                           U32 depth)
2143 {
2144     U32 state;
2145     U16 charid;
2146     SV *sv=sv_newmortal();
2147     int colwidth= widecharmap ? 6 : 4;
2148     GET_RE_DEBUG_FLAGS_DECL;
2149
2150     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2151
2152     /*
2153        print out the table precompression so that we can do a visual check
2154        that they are identical.
2155      */
2156
2157     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2158
2159     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2160         SV ** const tmp = av_fetch( revcharmap, charid, 0);
2161         if ( tmp ) {
2162             Perl_re_printf( aTHX_  "%*s",
2163                 colwidth,
2164                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2165                             PL_colors[0], PL_colors[1],
2166                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2167                             PERL_PV_ESCAPE_FIRSTCHAR
2168                 )
2169             );
2170         }
2171     }
2172
2173     Perl_re_printf( aTHX_ "\n");
2174     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2175
2176     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2177         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2178     }
2179
2180     Perl_re_printf( aTHX_  "\n" );
2181
2182     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2183
2184         Perl_re_indentf( aTHX_  "%4"UVXf" : ",
2185             depth+1,
2186             (UV)TRIE_NODENUM( state ) );
2187
2188         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2189             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2190             if (v)
2191                 Perl_re_printf( aTHX_  "%*"UVXf, colwidth, v );
2192             else
2193                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2194         }
2195         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2196             Perl_re_printf( aTHX_  " (%4"UVXf")\n",
2197                                             (UV)trie->trans[ state ].check );
2198         } else {
2199             Perl_re_printf( aTHX_  " (%4"UVXf") W%4X\n",
2200                                             (UV)trie->trans[ state ].check,
2201             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2202         }
2203     }
2204 }
2205
2206 #endif
2207
2208
2209 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2210   startbranch: the first branch in the whole branch sequence
2211   first      : start branch of sequence of branch-exact nodes.
2212                May be the same as startbranch
2213   last       : Thing following the last branch.
2214                May be the same as tail.
2215   tail       : item following the branch sequence
2216   count      : words in the sequence
2217   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2218   depth      : indent depth
2219
2220 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2221
2222 A trie is an N'ary tree where the branches are determined by digital
2223 decomposition of the key. IE, at the root node you look up the 1st character and
2224 follow that branch repeat until you find the end of the branches. Nodes can be
2225 marked as "accepting" meaning they represent a complete word. Eg:
2226
2227   /he|she|his|hers/
2228
2229 would convert into the following structure. Numbers represent states, letters
2230 following numbers represent valid transitions on the letter from that state, if
2231 the number is in square brackets it represents an accepting state, otherwise it
2232 will be in parenthesis.
2233
2234       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2235       |    |
2236       |   (2)
2237       |    |
2238      (1)   +-i->(6)-+-s->[7]
2239       |
2240       +-s->(3)-+-h->(4)-+-e->[5]
2241
2242       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2243
2244 This shows that when matching against the string 'hers' we will begin at state 1
2245 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2246 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2247 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2248 single traverse. We store a mapping from accepting to state to which word was
2249 matched, and then when we have multiple possibilities we try to complete the
2250 rest of the regex in the order in which they occurred in the alternation.
2251
2252 The only prior NFA like behaviour that would be changed by the TRIE support is
2253 the silent ignoring of duplicate alternations which are of the form:
2254
2255  / (DUPE|DUPE) X? (?{ ... }) Y /x
2256
2257 Thus EVAL blocks following a trie may be called a different number of times with
2258 and without the optimisation. With the optimisations dupes will be silently
2259 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2260 the following demonstrates:
2261
2262  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2263
2264 which prints out 'word' three times, but
2265
2266  'words'=~/(word|word|word)(?{ print $1 })S/
2267
2268 which doesnt print it out at all. This is due to other optimisations kicking in.
2269
2270 Example of what happens on a structural level:
2271
2272 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2273
2274    1: CURLYM[1] {1,32767}(18)
2275    5:   BRANCH(8)
2276    6:     EXACT <ac>(16)
2277    8:   BRANCH(11)
2278    9:     EXACT <ad>(16)
2279   11:   BRANCH(14)
2280   12:     EXACT <ab>(16)
2281   16:   SUCCEED(0)
2282   17:   NOTHING(18)
2283   18: END(0)
2284
2285 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2286 and should turn into:
2287
2288    1: CURLYM[1] {1,32767}(18)
2289    5:   TRIE(16)
2290         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2291           <ac>
2292           <ad>
2293           <ab>
2294   16:   SUCCEED(0)
2295   17:   NOTHING(18)
2296   18: END(0)
2297
2298 Cases where tail != last would be like /(?foo|bar)baz/:
2299
2300    1: BRANCH(4)
2301    2:   EXACT <foo>(8)
2302    4: BRANCH(7)
2303    5:   EXACT <bar>(8)
2304    7: TAIL(8)
2305    8: EXACT <baz>(10)
2306   10: END(0)
2307
2308 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2309 and would end up looking like:
2310
2311     1: TRIE(8)
2312       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2313         <foo>
2314         <bar>
2315    7: TAIL(8)
2316    8: EXACT <baz>(10)
2317   10: END(0)
2318
2319     d = uvchr_to_utf8_flags(d, uv, 0);
2320
2321 is the recommended Unicode-aware way of saying
2322
2323     *(d++) = uv;
2324 */
2325
2326 #define TRIE_STORE_REVCHAR(val)                                            \
2327     STMT_START {                                                           \
2328         if (UTF) {                                                         \
2329             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2330             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2331             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2332             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2333             SvPOK_on(zlopp);                                               \
2334             SvUTF8_on(zlopp);                                              \
2335             av_push(revcharmap, zlopp);                                    \
2336         } else {                                                           \
2337             char ooooff = (char)val;                                           \
2338             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2339         }                                                                  \
2340         } STMT_END
2341
2342 /* This gets the next character from the input, folding it if not already
2343  * folded. */
2344 #define TRIE_READ_CHAR STMT_START {                                           \
2345     wordlen++;                                                                \
2346     if ( UTF ) {                                                              \
2347         /* if it is UTF then it is either already folded, or does not need    \
2348          * folding */                                                         \
2349         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2350     }                                                                         \
2351     else if (folder == PL_fold_latin1) {                                      \
2352         /* This folder implies Unicode rules, which in the range expressible  \
2353          *  by not UTF is the lower case, with the two exceptions, one of     \
2354          *  which should have been taken care of before calling this */       \
2355         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2356         uvc = toLOWER_L1(*uc);                                                \
2357         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2358         len = 1;                                                              \
2359     } else {                                                                  \
2360         /* raw data, will be folded later if needed */                        \
2361         uvc = (U32)*uc;                                                       \
2362         len = 1;                                                              \
2363     }                                                                         \
2364 } STMT_END
2365
2366
2367
2368 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2369     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2370         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
2371         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2372     }                                                           \
2373     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2374     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2375     TRIE_LIST_CUR( state )++;                                   \
2376 } STMT_END
2377
2378 #define TRIE_LIST_NEW(state) STMT_START {                       \
2379     Newxz( trie->states[ state ].trans.list,               \
2380         4, reg_trie_trans_le );                                 \
2381      TRIE_LIST_CUR( state ) = 1;                                \
2382      TRIE_LIST_LEN( state ) = 4;                                \
2383 } STMT_END
2384
2385 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2386     U16 dupe= trie->states[ state ].wordnum;                    \
2387     regnode * const noper_next = regnext( noper );              \
2388                                                                 \
2389     DEBUG_r({                                                   \
2390         /* store the word for dumping */                        \
2391         SV* tmp;                                                \
2392         if (OP(noper) != NOTHING)                               \
2393             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2394         else                                                    \
2395             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2396         av_push( trie_words, tmp );                             \
2397     });                                                         \
2398                                                                 \
2399     curword++;                                                  \
2400     trie->wordinfo[curword].prev   = 0;                         \
2401     trie->wordinfo[curword].len    = wordlen;                   \
2402     trie->wordinfo[curword].accept = state;                     \
2403                                                                 \
2404     if ( noper_next < tail ) {                                  \
2405         if (!trie->jump)                                        \
2406             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2407                                                  sizeof(U16) ); \
2408         trie->jump[curword] = (U16)(noper_next - convert);      \
2409         if (!jumper)                                            \
2410             jumper = noper_next;                                \
2411         if (!nextbranch)                                        \
2412             nextbranch= regnext(cur);                           \
2413     }                                                           \
2414                                                                 \
2415     if ( dupe ) {                                               \
2416         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2417         /* chain, so that when the bits of chain are later    */\
2418         /* linked together, the dups appear in the chain      */\
2419         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2420         trie->wordinfo[dupe].prev = curword;                    \
2421     } else {                                                    \
2422         /* we haven't inserted this word yet.                */ \
2423         trie->states[ state ].wordnum = curword;                \
2424     }                                                           \
2425 } STMT_END
2426
2427
2428 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2429      ( ( base + charid >=  ucharcount                                   \
2430          && base + charid < ubound                                      \
2431          && state == trie->trans[ base - ucharcount + charid ].check    \
2432          && trie->trans[ base - ucharcount + charid ].next )            \
2433            ? trie->trans[ base - ucharcount + charid ].next             \
2434            : ( state==1 ? special : 0 )                                 \
2435       )
2436
2437 #define MADE_TRIE       1
2438 #define MADE_JUMP_TRIE  2
2439 #define MADE_EXACT_TRIE 4
2440
2441 STATIC I32
2442 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2443                   regnode *first, regnode *last, regnode *tail,
2444                   U32 word_count, U32 flags, U32 depth)
2445 {
2446     /* first pass, loop through and scan words */
2447     reg_trie_data *trie;
2448     HV *widecharmap = NULL;
2449     AV *revcharmap = newAV();
2450     regnode *cur;
2451     STRLEN len = 0;
2452     UV uvc = 0;
2453     U16 curword = 0;
2454     U32 next_alloc = 0;
2455     regnode *jumper = NULL;
2456     regnode *nextbranch = NULL;
2457     regnode *convert = NULL;
2458     U32 *prev_states; /* temp array mapping each state to previous one */
2459     /* we just use folder as a flag in utf8 */
2460     const U8 * folder = NULL;
2461
2462 #ifdef DEBUGGING
2463     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2464     AV *trie_words = NULL;
2465     /* along with revcharmap, this only used during construction but both are
2466      * useful during debugging so we store them in the struct when debugging.
2467      */
2468 #else
2469     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2470     STRLEN trie_charcount=0;
2471 #endif
2472     SV *re_trie_maxbuff;
2473     GET_RE_DEBUG_FLAGS_DECL;
2474
2475     PERL_ARGS_ASSERT_MAKE_TRIE;
2476 #ifndef DEBUGGING
2477     PERL_UNUSED_ARG(depth);
2478 #endif
2479
2480     switch (flags) {
2481         case EXACT: case EXACTL: break;
2482         case EXACTFA:
2483         case EXACTFU_SS:
2484         case EXACTFU:
2485         case EXACTFLU8: folder = PL_fold_latin1; break;
2486         case EXACTF:  folder = PL_fold; break;
2487         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2488     }
2489
2490     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2491     trie->refcount = 1;
2492     trie->startstate = 1;
2493     trie->wordcount = word_count;
2494     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2495     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2496     if (flags == EXACT || flags == EXACTL)
2497         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2498     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2499                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2500
2501     DEBUG_r({
2502         trie_words = newAV();
2503     });
2504
2505     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2506     assert(re_trie_maxbuff);
2507     if (!SvIOK(re_trie_maxbuff)) {
2508         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2509     }
2510     DEBUG_TRIE_COMPILE_r({
2511         Perl_re_indentf( aTHX_
2512           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2513           depth+1,
2514           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2515           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2516     });
2517
2518    /* Find the node we are going to overwrite */
2519     if ( first == startbranch && OP( last ) != BRANCH ) {
2520         /* whole branch chain */
2521         convert = first;
2522     } else {
2523         /* branch sub-chain */
2524         convert = NEXTOPER( first );
2525     }
2526
2527     /*  -- First loop and Setup --
2528
2529        We first traverse the branches and scan each word to determine if it
2530        contains widechars, and how many unique chars there are, this is
2531        important as we have to build a table with at least as many columns as we
2532        have unique chars.
2533
2534        We use an array of integers to represent the character codes 0..255
2535        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2536        the native representation of the character value as the key and IV's for
2537        the coded index.
2538
2539        *TODO* If we keep track of how many times each character is used we can
2540        remap the columns so that the table compression later on is more
2541        efficient in terms of memory by ensuring the most common value is in the
2542        middle and the least common are on the outside.  IMO this would be better
2543        than a most to least common mapping as theres a decent chance the most
2544        common letter will share a node with the least common, meaning the node
2545        will not be compressible. With a middle is most common approach the worst
2546        case is when we have the least common nodes twice.
2547
2548      */
2549
2550     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2551         regnode *noper = NEXTOPER( cur );
2552         const U8 *uc;
2553         const U8 *e;
2554         int foldlen = 0;
2555         U32 wordlen      = 0;         /* required init */
2556         STRLEN minchars = 0;
2557         STRLEN maxchars = 0;
2558         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2559                                                bitmap?*/
2560
2561         if (OP(noper) == NOTHING) {
2562             regnode *noper_next= regnext(noper);
2563             if (noper_next < tail)
2564                 noper= noper_next;
2565         }
2566
2567         if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2568             uc= (U8*)STRING(noper);
2569             e= uc + STR_LEN(noper);
2570         } else {
2571             trie->minlen= 0;
2572             continue;
2573         }
2574
2575
2576         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2577             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2578                                           regardless of encoding */
2579             if (OP( noper ) == EXACTFU_SS) {
2580                 /* false positives are ok, so just set this */
2581                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2582             }
2583         }
2584         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2585                                            branch */
2586             TRIE_CHARCOUNT(trie)++;
2587             TRIE_READ_CHAR;
2588
2589             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2590              * is in effect.  Under /i, this character can match itself, or
2591              * anything that folds to it.  If not under /i, it can match just
2592              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2593              * all fold to k, and all are single characters.   But some folds
2594              * expand to more than one character, so for example LATIN SMALL
2595              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2596              * the string beginning at 'uc' is 'ffi', it could be matched by
2597              * three characters, or just by the one ligature character. (It
2598              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2599              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2600              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2601              * match.)  The trie needs to know the minimum and maximum number
2602              * of characters that could match so that it can use size alone to
2603              * quickly reject many match attempts.  The max is simple: it is
2604              * the number of folded characters in this branch (since a fold is
2605              * never shorter than what folds to it. */
2606
2607             maxchars++;
2608
2609             /* And the min is equal to the max if not under /i (indicated by
2610              * 'folder' being NULL), or there are no multi-character folds.  If
2611              * there is a multi-character fold, the min is incremented just
2612              * once, for the character that folds to the sequence.  Each
2613              * character in the sequence needs to be added to the list below of
2614              * characters in the trie, but we count only the first towards the
2615              * min number of characters needed.  This is done through the
2616              * variable 'foldlen', which is returned by the macros that look
2617              * for these sequences as the number of bytes the sequence
2618              * occupies.  Each time through the loop, we decrement 'foldlen' by
2619              * how many bytes the current char occupies.  Only when it reaches
2620              * 0 do we increment 'minchars' or look for another multi-character
2621              * sequence. */
2622             if (folder == NULL) {
2623                 minchars++;
2624             }
2625             else if (foldlen > 0) {
2626                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2627             }
2628             else {
2629                 minchars++;
2630
2631                 /* See if *uc is the beginning of a multi-character fold.  If
2632                  * so, we decrement the length remaining to look at, to account
2633                  * for the current character this iteration.  (We can use 'uc'
2634                  * instead of the fold returned by TRIE_READ_CHAR because for
2635                  * non-UTF, the latin1_safe macro is smart enough to account
2636                  * for all the unfolded characters, and because for UTF, the
2637                  * string will already have been folded earlier in the
2638                  * compilation process */
2639                 if (UTF) {
2640                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2641                         foldlen -= UTF8SKIP(uc);
2642                     }
2643                 }
2644                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2645                     foldlen--;
2646                 }
2647             }
2648
2649             /* The current character (and any potential folds) should be added
2650              * to the possible matching characters for this position in this
2651              * branch */
2652             if ( uvc < 256 ) {
2653                 if ( folder ) {
2654                     U8 folded= folder[ (U8) uvc ];
2655                     if ( !trie->charmap[ folded ] ) {
2656                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2657                         TRIE_STORE_REVCHAR( folded );
2658                     }
2659                 }
2660                 if ( !trie->charmap[ uvc ] ) {
2661                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2662                     TRIE_STORE_REVCHAR( uvc );
2663                 }
2664                 if ( set_bit ) {
2665                     /* store the codepoint in the bitmap, and its folded
2666                      * equivalent. */
2667                     TRIE_BITMAP_SET(trie, uvc);
2668
2669                     /* store the folded codepoint */
2670                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2671
2672                     if ( !UTF ) {
2673                         /* store first byte of utf8 representation of
2674                            variant codepoints */
2675                         if (! UVCHR_IS_INVARIANT(uvc)) {
2676                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2677                         }
2678                     }
2679                     set_bit = 0; /* We've done our bit :-) */
2680                 }
2681             } else {
2682
2683                 /* XXX We could come up with the list of code points that fold
2684                  * to this using PL_utf8_foldclosures, except not for
2685                  * multi-char folds, as there may be multiple combinations
2686                  * there that could work, which needs to wait until runtime to
2687                  * resolve (The comment about LIGATURE FFI above is such an
2688                  * example */
2689
2690                 SV** svpp;
2691                 if ( !widecharmap )
2692                     widecharmap = newHV();
2693
2694                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2695
2696                 if ( !svpp )
2697                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2698
2699                 if ( !SvTRUE( *svpp ) ) {
2700                     sv_setiv( *svpp, ++trie->uniquecharcount );
2701                     TRIE_STORE_REVCHAR(uvc);
2702                 }
2703             }
2704         } /* end loop through characters in this branch of the trie */
2705
2706         /* We take the min and max for this branch and combine to find the min
2707          * and max for all branches processed so far */
2708         if( cur == first ) {
2709             trie->minlen = minchars;
2710             trie->maxlen = maxchars;
2711         } else if (minchars < trie->minlen) {
2712             trie->minlen = minchars;
2713         } else if (maxchars > trie->maxlen) {
2714             trie->maxlen = maxchars;
2715         }
2716     } /* end first pass */
2717     DEBUG_TRIE_COMPILE_r(
2718         Perl_re_indentf( aTHX_
2719                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2720                 depth+1,
2721                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2722                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2723                 (int)trie->minlen, (int)trie->maxlen )
2724     );
2725
2726     /*
2727         We now know what we are dealing with in terms of unique chars and
2728         string sizes so we can calculate how much memory a naive
2729         representation using a flat table  will take. If it's over a reasonable
2730         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2731         conservative but potentially much slower representation using an array
2732         of lists.
2733
2734         At the end we convert both representations into the same compressed
2735         form that will be used in regexec.c for matching with. The latter
2736         is a form that cannot be used to construct with but has memory
2737         properties similar to the list form and access properties similar
2738         to the table form making it both suitable for fast searches and
2739         small enough that its feasable to store for the duration of a program.
2740
2741         See the comment in the code where the compressed table is produced
2742         inplace from the flat tabe representation for an explanation of how
2743         the compression works.
2744
2745     */
2746
2747
2748     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2749     prev_states[1] = 0;
2750
2751     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2752                                                     > SvIV(re_trie_maxbuff) )
2753     {
2754         /*
2755             Second Pass -- Array Of Lists Representation
2756
2757             Each state will be represented by a list of charid:state records
2758             (reg_trie_trans_le) the first such element holds the CUR and LEN
2759             points of the allocated array. (See defines above).
2760
2761             We build the initial structure using the lists, and then convert
2762             it into the compressed table form which allows faster lookups
2763             (but cant be modified once converted).
2764         */
2765
2766         STRLEN transcount = 1;
2767
2768         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
2769             depth+1));
2770
2771         trie->states = (reg_trie_state *)
2772             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2773                                   sizeof(reg_trie_state) );
2774         TRIE_LIST_NEW(1);
2775         next_alloc = 2;
2776
2777         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2778
2779             regnode *noper   = NEXTOPER( cur );
2780             U32 state        = 1;         /* required init */
2781             U16 charid       = 0;         /* sanity init */
2782             U32 wordlen      = 0;         /* required init */
2783
2784             if (OP(noper) == NOTHING) {
2785                 regnode *noper_next= regnext(noper);
2786                 if (noper_next < tail)
2787                     noper= noper_next;
2788             }
2789
2790             if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2791                 const U8 *uc= (U8*)STRING(noper);
2792                 const U8 *e= uc + STR_LEN(noper);
2793
2794                 for ( ; uc < e ; uc += len ) {
2795
2796                     TRIE_READ_CHAR;
2797
2798                     if ( uvc < 256 ) {
2799                         charid = trie->charmap[ uvc ];
2800                     } else {
2801                         SV** const svpp = hv_fetch( widecharmap,
2802                                                     (char*)&uvc,
2803                                                     sizeof( UV ),
2804                                                     0);
2805                         if ( !svpp ) {
2806                             charid = 0;
2807                         } else {
2808                             charid=(U16)SvIV( *svpp );
2809                         }
2810                     }
2811                     /* charid is now 0 if we dont know the char read, or
2812                      * nonzero if we do */
2813                     if ( charid ) {
2814
2815                         U16 check;
2816                         U32 newstate = 0;
2817
2818                         charid--;
2819                         if ( !trie->states[ state ].trans.list ) {
2820                             TRIE_LIST_NEW( state );
2821                         }
2822                         for ( check = 1;
2823                               check <= TRIE_LIST_USED( state );
2824                               check++ )
2825                         {
2826                             if ( TRIE_LIST_ITEM( state, check ).forid
2827                                                                     == charid )
2828                             {
2829                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2830                                 break;
2831                             }
2832                         }
2833                         if ( ! newstate ) {
2834                             newstate = next_alloc++;
2835                             prev_states[newstate] = state;
2836                             TRIE_LIST_PUSH( state, charid, newstate );
2837                             transcount++;
2838                         }
2839                         state = newstate;
2840                     } else {
2841                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2842                     }
2843                 }
2844             }
2845             TRIE_HANDLE_WORD(state);
2846
2847         } /* end second pass */
2848
2849         /* next alloc is the NEXT state to be allocated */
2850         trie->statecount = next_alloc;
2851         trie->states = (reg_trie_state *)
2852             PerlMemShared_realloc( trie->states,
2853                                    next_alloc
2854                                    * sizeof(reg_trie_state) );
2855
2856         /* and now dump it out before we compress it */
2857         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2858                                                          revcharmap, next_alloc,
2859                                                          depth+1)
2860         );
2861
2862         trie->trans = (reg_trie_trans *)
2863             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2864         {
2865             U32 state;
2866             U32 tp = 0;
2867             U32 zp = 0;
2868
2869
2870             for( state=1 ; state < next_alloc ; state ++ ) {
2871                 U32 base=0;
2872
2873                 /*
2874                 DEBUG_TRIE_COMPILE_MORE_r(
2875                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
2876                 );
2877                 */
2878
2879                 if (trie->states[state].trans.list) {
2880                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2881                     U16 maxid=minid;
2882                     U16 idx;
2883
2884                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2885                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2886                         if ( forid < minid ) {
2887                             minid=forid;
2888                         } else if ( forid > maxid ) {
2889                             maxid=forid;
2890                         }
2891                     }
2892                     if ( transcount < tp + maxid - minid + 1) {
2893                         transcount *= 2;
2894                         trie->trans = (reg_trie_trans *)
2895                             PerlMemShared_realloc( trie->trans,
2896                                                      transcount
2897                                                      * sizeof(reg_trie_trans) );
2898                         Zero( trie->trans + (transcount / 2),
2899                               transcount / 2,
2900                               reg_trie_trans );
2901                     }
2902                     base = trie->uniquecharcount + tp - minid;
2903                     if ( maxid == minid ) {
2904                         U32 set = 0;
2905                         for ( ; zp < tp ; zp++ ) {
2906                             if ( ! trie->trans[ zp ].next ) {
2907                                 base = trie->uniquecharcount + zp - minid;
2908                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2909                                                                    1).newstate;
2910                                 trie->trans[ zp ].check = state;
2911                                 set = 1;
2912                                 break;
2913                             }
2914                         }
2915                         if ( !set ) {
2916                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2917                                                                    1).newstate;
2918                             trie->trans[ tp ].check = state;
2919                             tp++;
2920                             zp = tp;
2921                         }
2922                     } else {
2923                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2924                             const U32 tid = base
2925                                            - trie->uniquecharcount
2926                                            + TRIE_LIST_ITEM( state, idx ).forid;
2927                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2928                                                                 idx ).newstate;
2929                             trie->trans[ tid ].check = state;
2930                         }
2931                         tp += ( maxid - minid + 1 );
2932                     }
2933                     Safefree(trie->states[ state ].trans.list);
2934                 }
2935                 /*
2936                 DEBUG_TRIE_COMPILE_MORE_r(
2937                     Perl_re_printf( aTHX_  " base: %d\n",base);
2938                 );
2939                 */
2940                 trie->states[ state ].trans.base=base;
2941             }
2942             trie->lasttrans = tp + 1;
2943         }
2944     } else {
2945         /*
2946            Second Pass -- Flat Table Representation.
2947
2948            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2949            each.  We know that we will need Charcount+1 trans at most to store
2950            the data (one row per char at worst case) So we preallocate both
2951            structures assuming worst case.
2952
2953            We then construct the trie using only the .next slots of the entry
2954            structs.
2955
2956            We use the .check field of the first entry of the node temporarily
2957            to make compression both faster and easier by keeping track of how
2958            many non zero fields are in the node.
2959
2960            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2961            transition.
2962
2963            There are two terms at use here: state as a TRIE_NODEIDX() which is
2964            a number representing the first entry of the node, and state as a
2965            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2966            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2967            if there are 2 entrys per node. eg:
2968
2969              A B       A B
2970           1. 2 4    1. 3 7
2971           2. 0 3    3. 0 5
2972           3. 0 0    5. 0 0
2973           4. 0 0    7. 0 0
2974
2975            The table is internally in the right hand, idx form. However as we
2976            also have to deal with the states array which is indexed by nodenum
2977            we have to use TRIE_NODENUM() to convert.
2978
2979         */
2980         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
2981             depth+1));
2982
2983         trie->trans = (reg_trie_trans *)
2984             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2985                                   * trie->uniquecharcount + 1,
2986                                   sizeof(reg_trie_trans) );
2987         trie->states = (reg_trie_state *)
2988             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2989                                   sizeof(reg_trie_state) );
2990         next_alloc = trie->uniquecharcount + 1;
2991
2992
2993         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2994
2995             regnode *noper   = NEXTOPER( cur );
2996
2997             U32 state        = 1;         /* required init */
2998
2999             U16 charid       = 0;         /* sanity init */
3000             U32 accept_state = 0;         /* sanity init */
3001
3002             U32 wordlen      = 0;         /* required init */
3003
3004             if (OP(noper) == NOTHING) {
3005                 regnode *noper_next= regnext(noper);
3006                 if (noper_next < tail)
3007                     noper= noper_next;
3008             }
3009
3010             if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3011                 const U8 *uc= (U8*)STRING(noper);
3012                 const U8 *e= uc + STR_LEN(noper);
3013
3014                 for ( ; uc < e ; uc += len ) {
3015
3016                     TRIE_READ_CHAR;
3017
3018                     if ( uvc < 256 ) {
3019                         charid = trie->charmap[ uvc ];
3020                     } else {
3021                         SV* const * const svpp = hv_fetch( widecharmap,
3022                                                            (char*)&uvc,
3023                                                            sizeof( UV ),
3024                                                            0);
3025                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3026                     }
3027                     if ( charid ) {
3028                         charid--;
3029                         if ( !trie->trans[ state + charid ].next ) {
3030                             trie->trans[ state + charid ].next = next_alloc;
3031                             trie->trans[ state ].check++;
3032                             prev_states[TRIE_NODENUM(next_alloc)]
3033                                     = TRIE_NODENUM(state);
3034                             next_alloc += trie->uniquecharcount;
3035                         }
3036                         state = trie->trans[ state + charid ].next;
3037                     } else {
3038                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
3039                     }
3040                     /* charid is now 0 if we dont know the char read, or
3041                      * nonzero if we do */
3042                 }
3043             }
3044             accept_state = TRIE_NODENUM( state );
3045             TRIE_HANDLE_WORD(accept_state);
3046
3047         } /* end second pass */
3048
3049         /* and now dump it out before we compress it */
3050         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3051                                                           revcharmap,
3052                                                           next_alloc, depth+1));
3053
3054         {
3055         /*
3056            * Inplace compress the table.*
3057
3058            For sparse data sets the table constructed by the trie algorithm will
3059            be mostly 0/FAIL transitions or to put it another way mostly empty.
3060            (Note that leaf nodes will not contain any transitions.)
3061
3062            This algorithm compresses the tables by eliminating most such
3063            transitions, at the cost of a modest bit of extra work during lookup:
3064
3065            - Each states[] entry contains a .base field which indicates the
3066            index in the state[] array wheres its transition data is stored.
3067
3068            - If .base is 0 there are no valid transitions from that node.
3069
3070            - If .base is nonzero then charid is added to it to find an entry in
3071            the trans array.
3072
3073            -If trans[states[state].base+charid].check!=state then the
3074            transition is taken to be a 0/Fail transition. Thus if there are fail
3075            transitions at the front of the node then the .base offset will point
3076            somewhere inside the previous nodes data (or maybe even into a node
3077            even earlier), but the .check field determines if the transition is
3078            valid.
3079
3080            XXX - wrong maybe?
3081            The following process inplace converts the table to the compressed
3082            table: We first do not compress the root node 1,and mark all its
3083            .check pointers as 1 and set its .base pointer as 1 as well. This
3084            allows us to do a DFA construction from the compressed table later,
3085            and ensures that any .base pointers we calculate later are greater
3086            than 0.
3087
3088            - We set 'pos' to indicate the first entry of the second node.
3089
3090            - We then iterate over the columns of the node, finding the first and
3091            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3092            and set the .check pointers accordingly, and advance pos
3093            appropriately and repreat for the next node. Note that when we copy
3094            the next pointers we have to convert them from the original
3095            NODEIDX form to NODENUM form as the former is not valid post
3096            compression.
3097
3098            - If a node has no transitions used we mark its base as 0 and do not
3099            advance the pos pointer.
3100
3101            - If a node only has one transition we use a second pointer into the
3102            structure to fill in allocated fail transitions from other states.
3103            This pointer is independent of the main pointer and scans forward
3104            looking for null transitions that are allocated to a state. When it
3105            finds one it writes the single transition into the "hole".  If the
3106            pointer doesnt find one the single transition is appended as normal.
3107
3108            - Once compressed we can Renew/realloc the structures to release the
3109            excess space.
3110
3111            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3112            specifically Fig 3.47 and the associated pseudocode.
3113
3114            demq
3115         */
3116         const U32 laststate = TRIE_NODENUM( next_alloc );
3117         U32 state, charid;
3118         U32 pos = 0, zp=0;
3119         trie->statecount = laststate;
3120
3121         for ( state = 1 ; state < laststate ; state++ ) {
3122             U8 flag = 0;
3123             const U32 stateidx = TRIE_NODEIDX( state );
3124             const U32 o_used = trie->trans[ stateidx ].check;
3125             U32 used = trie->trans[ stateidx ].check;
3126             trie->trans[ stateidx ].check = 0;
3127
3128             for ( charid = 0;
3129                   used && charid < trie->uniquecharcount;
3130                   charid++ )
3131             {
3132                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3133                     if ( trie->trans[ stateidx + charid ].next ) {
3134                         if (o_used == 1) {
3135                             for ( ; zp < pos ; zp++ ) {
3136                                 if ( ! trie->trans[ zp ].next ) {
3137                                     break;
3138                                 }
3139                             }
3140                             trie->states[ state ].trans.base
3141                                                     = zp
3142                                                       + trie->uniquecharcount
3143                                                       - charid ;
3144                             trie->trans[ zp ].next
3145                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3146                                                              + charid ].next );
3147                             trie->trans[ zp ].check = state;
3148                             if ( ++zp > pos ) pos = zp;
3149                             break;
3150                         }
3151                         used--;
3152                     }
3153                     if ( !flag ) {
3154                         flag = 1;
3155                         trie->states[ state ].trans.base
3156                                        = pos + trie->uniquecharcount - charid ;
3157                     }
3158                     trie->trans[ pos ].next
3159                         = SAFE_TRIE_NODENUM(
3160                                        trie->trans[ stateidx + charid ].next );
3161                     trie->trans[ pos ].check = state;
3162                     pos++;
3163                 }
3164             }
3165         }
3166         trie->lasttrans = pos + 1;
3167         trie->states = (reg_trie_state *)
3168             PerlMemShared_realloc( trie->states, laststate
3169                                    * sizeof(reg_trie_state) );
3170         DEBUG_TRIE_COMPILE_MORE_r(
3171             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
3172                 depth+1,
3173                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3174                        + 1 ),
3175                 (IV)next_alloc,
3176                 (IV)pos,
3177                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3178             );
3179
3180         } /* end table compress */
3181     }
3182     DEBUG_TRIE_COMPILE_MORE_r(
3183             Perl_re_indentf( aTHX_  "Statecount:%"UVxf" Lasttrans:%"UVxf"\n",
3184                 depth+1,
3185                 (UV)trie->statecount,
3186                 (UV)trie->lasttrans)
3187     );
3188     /* resize the trans array to remove unused space */
3189     trie->trans = (reg_trie_trans *)
3190         PerlMemShared_realloc( trie->trans, trie->lasttrans
3191                                * sizeof(reg_trie_trans) );
3192
3193     {   /* Modify the program and insert the new TRIE node */
3194         U8 nodetype =(U8)(flags & 0xFF);
3195         char *str=NULL;
3196
3197 #ifdef DEBUGGING
3198         regnode *optimize = NULL;
3199 #ifdef RE_TRACK_PATTERN_OFFSETS
3200
3201         U32 mjd_offset = 0;
3202         U32 mjd_nodelen = 0;
3203 #endif /* RE_TRACK_PATTERN_OFFSETS */
3204 #endif /* DEBUGGING */
3205         /*
3206            This means we convert either the first branch or the first Exact,
3207            depending on whether the thing following (in 'last') is a branch
3208            or not and whther first is the startbranch (ie is it a sub part of
3209            the alternation or is it the whole thing.)
3210            Assuming its a sub part we convert the EXACT otherwise we convert
3211            the whole branch sequence, including the first.
3212          */
3213         /* Find the node we are going to overwrite */
3214         if ( first != startbranch || OP( last ) == BRANCH ) {
3215             /* branch sub-chain */
3216             NEXT_OFF( first ) = (U16)(last - first);
3217 #ifdef RE_TRACK_PATTERN_OFFSETS
3218             DEBUG_r({
3219                 mjd_offset= Node_Offset((convert));
3220                 mjd_nodelen= Node_Length((convert));
3221             });
3222 #endif
3223             /* whole branch chain */
3224         }
3225 #ifdef RE_TRACK_PATTERN_OFFSETS
3226         else {
3227             DEBUG_r({
3228                 const  regnode *nop = NEXTOPER( convert );
3229                 mjd_offset= Node_Offset((nop));
3230                 mjd_nodelen= Node_Length((nop));
3231             });
3232         }
3233         DEBUG_OPTIMISE_r(
3234             Perl_re_indentf( aTHX_  "MJD offset:%"UVuf" MJD length:%"UVuf"\n",
3235                 depth+1,
3236                 (UV)mjd_offset, (UV)mjd_nodelen)
3237         );
3238 #endif
3239         /* But first we check to see if there is a common prefix we can
3240            split out as an EXACT and put in front of the TRIE node.  */
3241         trie->startstate= 1;
3242         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3243             U32 state;
3244             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3245                 U32 ofs = 0;
3246                 I32 idx = -1;
3247                 U32 count = 0;
3248                 const U32 base = trie->states[ state ].trans.base;
3249
3250                 if ( trie->states[state].wordnum )
3251                         count = 1;
3252
3253                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3254                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3255                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3256                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3257                     {
3258                         if ( ++count > 1 ) {
3259                             SV **tmp = av_fetch( revcharmap, ofs, 0);
3260                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
3261                             if ( state == 1 ) break;
3262                             if ( count == 2 ) {
3263                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3264                                 DEBUG_OPTIMISE_r(
3265                                     Perl_re_indentf( aTHX_  "New Start State=%"UVuf" Class: [",
3266                                         depth+1,
3267                                         (UV)state));
3268                                 if (idx >= 0) {
3269                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
3270                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3271
3272                                     TRIE_BITMAP_SET(trie,*ch);
3273                                     if ( folder )
3274                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
3275                                     DEBUG_OPTIMISE_r(
3276                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3277                                     );
3278                                 }
3279                             }
3280                             TRIE_BITMAP_SET(trie,*ch);
3281                             if ( folder )
3282                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
3283                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3284                         }
3285                         idx = ofs;
3286                     }
3287                 }
3288                 if ( count == 1 ) {
3289                     SV **tmp = av_fetch( revcharmap, idx, 0);
3290                     STRLEN len;
3291                     char *ch = SvPV( *tmp, len );
3292                     DEBUG_OPTIMISE_r({
3293                         SV *sv=sv_newmortal();
3294                         Perl_re_indentf( aTHX_  "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
3295                             depth+1,
3296                             (UV)state, (UV)idx,
3297                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3298                                 PL_colors[0], PL_colors[1],
3299                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3300                                 PERL_PV_ESCAPE_FIRSTCHAR
3301                             )
3302                         );
3303                     });
3304                     if ( state==1 ) {
3305                         OP( convert ) = nodetype;
3306                         str=STRING(convert);
3307                         STR_LEN(convert)=0;
3308                     }
3309                     STR_LEN(convert) += len;
3310                     while (len--)
3311                         *str++ = *ch++;
3312                 } else {
3313 #ifdef DEBUGGING
3314                     if (state>1)
3315                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3316 #endif
3317                     break;
3318                 }
3319             }
3320             trie->prefixlen = (state-1);
3321             if (str) {
3322                 regnode *n = convert+NODE_SZ_STR(convert);
3323                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3324                 trie->startstate = state;
3325                 trie->minlen -= (state - 1);
3326                 trie->maxlen -= (state - 1);
3327 #ifdef DEBUGGING
3328                /* At least the UNICOS C compiler choked on this
3329                 * being argument to DEBUG_r(), so let's just have
3330                 * it right here. */
3331                if (
3332 #ifdef PERL_EXT_RE_BUILD
3333                    1
3334 #else
3335                    DEBUG_r_TEST
3336 #endif
3337                    ) {
3338                    regnode *fix = convert;
3339                    U32 word = trie->wordcount;
3340                    mjd_nodelen++;
3341                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3342                    while( ++fix < n ) {
3343                        Set_Node_Offset_Length(fix, 0, 0);
3344                    }
3345                    while (word--) {
3346                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3347                        if (tmp) {
3348                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3349                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3350                            else
3351                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3352                        }
3353                    }
3354                }
3355 #endif
3356                 if (trie->maxlen) {
3357                     convert = n;
3358                 } else {
3359                     NEXT_OFF(convert) = (U16)(tail - convert);
3360                     DEBUG_r(optimize= n);
3361                 }
3362             }
3363         }
3364         if (!jumper)
3365             jumper = last;
3366         if ( trie->maxlen ) {
3367             NEXT_OFF( convert ) = (U16)(tail - convert);
3368             ARG_SET( convert, data_slot );
3369             /* Store the offset to the first unabsorbed branch in
3370                jump[0], which is otherwise unused by the jump logic.
3371                We use this when dumping a trie and during optimisation. */
3372             if (trie->jump)
3373                 trie->jump[0] = (U16)(nextbranch - convert);
3374
3375             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3376              *   and there is a bitmap
3377              *   and the first "jump target" node we found leaves enough room
3378              * then convert the TRIE node into a TRIEC node, with the bitmap
3379              * embedded inline in the opcode - this is hypothetically faster.
3380              */
3381             if ( !trie->states[trie->startstate].wordnum
3382                  && trie->bitmap
3383                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3384             {
3385                 OP( convert ) = TRIEC;
3386                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3387                 PerlMemShared_free(trie->bitmap);
3388                 trie->bitmap= NULL;
3389             } else
3390                 OP( convert ) = TRIE;
3391
3392             /* store the type in the flags */
3393             convert->flags = nodetype;
3394             DEBUG_r({
3395             optimize = convert
3396                       + NODE_STEP_REGNODE
3397                       + regarglen[ OP( convert ) ];
3398             });
3399             /* XXX We really should free up the resource in trie now,
3400                    as we won't use them - (which resources?) dmq */
3401         }
3402         /* needed for dumping*/
3403         DEBUG_r(if (optimize) {
3404             regnode *opt = convert;
3405
3406             while ( ++opt < optimize) {
3407                 Set_Node_Offset_Length(opt,0,0);
3408             }
3409             /*
3410                 Try to clean up some of the debris left after the
3411                 optimisation.
3412              */
3413             while( optimize < jumper ) {
3414                 mjd_nodelen += Node_Length((optimize));
3415                 OP( optimize ) = OPTIMIZED;
3416                 Set_Node_Offset_Length(optimize,0,0);
3417                 optimize++;
3418             }
3419             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3420         });
3421     } /* end node insert */
3422
3423     /*  Finish populating the prev field of the wordinfo array.  Walk back
3424      *  from each accept state until we find another accept state, and if
3425      *  so, point the first word's .prev field at the second word. If the
3426      *  second already has a .prev field set, stop now. This will be the
3427      *  case either if we've already processed that word's accept state,
3428      *  or that state had multiple words, and the overspill words were
3429      *  already linked up earlier.
3430      */
3431     {
3432         U16 word;
3433         U32 state;
3434         U16 prev;
3435
3436         for (word=1; word <= trie->wordcount; word++) {
3437             prev = 0;
3438             if (trie->wordinfo[word].prev)
3439                 continue;
3440             state = trie->wordinfo[word].accept;
3441             while (state) {
3442                 state = prev_states[state];
3443                 if (!state)
3444                     break;
3445                 prev = trie->states[state].wordnum;
3446                 if (prev)
3447                     break;
3448             }
3449             trie->wordinfo[word].prev = prev;
3450         }
3451         Safefree(prev_states);
3452     }
3453
3454
3455     /* and now dump out the compressed format */
3456     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3457
3458     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3459 #ifdef DEBUGGING
3460     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3461     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3462 #else
3463     SvREFCNT_dec_NN(revcharmap);
3464 #endif
3465     return trie->jump
3466            ? MADE_JUMP_TRIE
3467            : trie->startstate>1
3468              ? MADE_EXACT_TRIE
3469              : MADE_TRIE;
3470 }
3471
3472 STATIC regnode *
3473 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3474 {
3475 /* The Trie is constructed and compressed now so we can build a fail array if
3476  * it's needed
3477
3478    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3479    3.32 in the
3480    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3481    Ullman 1985/88
3482    ISBN 0-201-10088-6
3483
3484    We find the fail state for each state in the trie, this state is the longest
3485    proper suffix of the current state's 'word' that is also a proper prefix of
3486    another word in our trie. State 1 represents the word '' and is thus the
3487    default fail state. This allows the DFA not to have to restart after its
3488    tried and failed a word at a given point, it simply continues as though it
3489    had been matching the other word in the first place.
3490    Consider
3491       'abcdgu'=~/abcdefg|cdgu/
3492    When we get to 'd' we are still matching the first word, we would encounter
3493    'g' which would fail, which would bring us to the state representing 'd' in
3494    the second word where we would try 'g' and succeed, proceeding to match
3495    'cdgu'.
3496  */
3497  /* add a fail transition */
3498     const U32 trie_offset = ARG(source);
3499     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3500     U32 *q;
3501     const U32 ucharcount = trie->uniquecharcount;
3502     const U32 numstates = trie->statecount;
3503     const U32 ubound = trie->lasttrans + ucharcount;
3504     U32 q_read = 0;
3505     U32 q_write = 0;
3506     U32 charid;
3507     U32 base = trie->states[ 1 ].trans.base;
3508     U32 *fail;
3509     reg_ac_data *aho;
3510     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3511     regnode *stclass;
3512     GET_RE_DEBUG_FLAGS_DECL;
3513
3514     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3515     PERL_UNUSED_CONTEXT;
3516 #ifndef DEBUGGING
3517     PERL_UNUSED_ARG(depth);
3518 #endif
3519
3520     if ( OP(source) == TRIE ) {
3521         struct regnode_1 *op = (struct regnode_1 *)
3522             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3523         StructCopy(source,op,struct regnode_1);
3524         stclass = (regnode *)op;
3525     } else {
3526         struct regnode_charclass *op = (struct regnode_charclass *)
3527             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3528         StructCopy(source,op,struct regnode_charclass);
3529         stclass = (regnode *)op;
3530     }
3531     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3532
3533     ARG_SET( stclass, data_slot );
3534     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3535     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3536     aho->trie=trie_offset;
3537     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3538     Copy( trie->states, aho->states, numstates, reg_trie_state );
3539     Newxz( q, numstates, U32);
3540     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3541     aho->refcount = 1;
3542     fail = aho->fail;
3543     /* initialize fail[0..1] to be 1 so that we always have
3544        a valid final fail state */
3545     fail[ 0 ] = fail[ 1 ] = 1;
3546
3547     for ( charid = 0; charid < ucharcount ; charid++ ) {
3548         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3549         if ( newstate ) {
3550             q[ q_write ] = newstate;
3551             /* set to point at the root */
3552             fail[ q[ q_write++ ] ]=1;
3553         }
3554     }
3555     while ( q_read < q_write) {
3556         const U32 cur = q[ q_read++ % numstates ];
3557         base = trie->states[ cur ].trans.base;
3558
3559         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3560             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3561             if (ch_state) {
3562                 U32 fail_state = cur;
3563                 U32 fail_base;
3564                 do {
3565                     fail_state = fail[ fail_state ];
3566                     fail_base = aho->states[ fail_state ].trans.base;
3567                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3568
3569                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3570                 fail[ ch_state ] = fail_state;
3571                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3572                 {
3573                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3574                 }
3575                 q[ q_write++ % numstates] = ch_state;
3576             }
3577         }
3578     }
3579     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3580        when we fail in state 1, this allows us to use the
3581        charclass scan to find a valid start char. This is based on the principle
3582        that theres a good chance the string being searched contains lots of stuff
3583        that cant be a start char.
3584      */
3585     fail[ 0 ] = fail[ 1 ] = 0;
3586     DEBUG_TRIE_COMPILE_r({
3587         Perl_re_indentf( aTHX_  "Stclass Failtable (%"UVuf" states): 0",
3588                       depth, (UV)numstates
3589         );
3590         for( q_read=1; q_read<numstates; q_read++ ) {
3591             Perl_re_printf( aTHX_  ", %"UVuf, (UV)fail[q_read]);
3592         }
3593         Perl_re_printf( aTHX_  "\n");
3594     });
3595     Safefree(q);
3596     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3597     return stclass;
3598 }
3599
3600
3601 #define DEBUG_PEEP(str,scan,depth)         \
3602     DEBUG_OPTIMISE_r({if (scan){           \
3603        regnode *Next = regnext(scan);      \
3604        regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);\
3605        Perl_re_indentf( aTHX_  "" str ">%3d: %s (%d)", \
3606            depth, REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3607            Next ? (REG_NODE_NUM(Next)) : 0 );\
3608        DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3609        Perl_re_printf( aTHX_  "\n");                   \
3610    }});
3611
3612 /* The below joins as many adjacent EXACTish nodes as possible into a single
3613  * one.  The regop may be changed if the node(s) contain certain sequences that
3614  * require special handling.  The joining is only done if:
3615  * 1) there is room in the current conglomerated node to entirely contain the
3616  *    next one.
3617  * 2) they are the exact same node type
3618  *
3619  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3620  * these get optimized out
3621  *
3622  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3623  * as possible, even if that means splitting an existing node so that its first
3624  * part is moved to the preceeding node.  This would maximise the efficiency of
3625  * memEQ during matching.  Elsewhere in this file, khw proposes splitting
3626  * EXACTFish nodes into portions that don't change under folding vs those that
3627  * do.  Those portions that don't change may be the only things in the pattern that
3628  * could be used to find fixed and floating strings.
3629  *
3630  * If a node is to match under /i (folded), the number of characters it matches
3631  * can be different than its character length if it contains a multi-character
3632  * fold.  *min_subtract is set to the total delta number of characters of the
3633  * input nodes.
3634  *
3635  * And *unfolded_multi_char is set to indicate whether or not the node contains
3636  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3637  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3638  * SMALL LETTER SHARP S, as only if the target string being matched against
3639  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3640  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3641  * whose components are all above the Latin1 range are not run-time locale
3642  * dependent, and have already been folded by the time this function is
3643  * called.)
3644  *
3645  * This is as good a place as any to discuss the design of handling these
3646  * multi-character fold sequences.  It's been wrong in Perl for a very long
3647  * time.  There are three code points in Unicode whose multi-character folds
3648  * were long ago discovered to mess things up.  The previous designs for
3649  * dealing with these involved assigning a special node for them.  This
3650  * approach doesn't always work, as evidenced by this example:
3651  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3652  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3653  * would match just the \xDF, it won't be able to handle the case where a
3654  * successful match would have to cross the node's boundary.  The new approach
3655  * that hopefully generally solves the problem generates an EXACTFU_SS node
3656  * that is "sss" in this case.
3657  *
3658  * It turns out that there are problems with all multi-character folds, and not
3659  * just these three.  Now the code is general, for all such cases.  The
3660  * approach taken is:
3661  * 1)   This routine examines each EXACTFish node that could contain multi-
3662  *      character folded sequences.  Since a single character can fold into
3663  *      such a sequence, the minimum match length for this node is less than
3664  *      the number of characters in the node.  This routine returns in
3665  *      *min_subtract how many characters to subtract from the the actual
3666  *      length of the string to get a real minimum match length; it is 0 if
3667  *      there are no multi-char foldeds.  This delta is used by the caller to
3668  *      adjust the min length of the match, and the delta between min and max,
3669  *      so that the optimizer doesn't reject these possibilities based on size
3670  *      constraints.
3671  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3672  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3673  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3674  *      there is a possible fold length change.  That means that a regular
3675  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3676  *      with length changes, and so can be processed faster.  regexec.c takes
3677  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3678  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3679  *      known until runtime).  This saves effort in regex matching.  However,
3680  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3681  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3682  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3683  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3684  *      possibilities for the non-UTF8 patterns are quite simple, except for
3685  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3686  *      members of a fold-pair, and arrays are set up for all of them so that
3687  *      the other member of the pair can be found quickly.  Code elsewhere in
3688  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3689  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3690  *      described in the next item.
3691  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3692  *      validity of the fold won't be known until runtime, and so must remain
3693  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3694  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3695  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3696  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3697  *      The reason this is a problem is that the optimizer part of regexec.c
3698  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3699  *      that a character in the pattern corresponds to at most a single
3700  *      character in the target string.  (And I do mean character, and not byte
3701  *      here, unlike other parts of the documentation that have never been
3702  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3703  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3704  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3705  *      nodes, violate the assumption, and they are the only instances where it
3706  *      is violated.  I'm reluctant to try to change the assumption, as the
3707  *      code involved is impenetrable to me (khw), so instead the code here
3708  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3709  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3710  *      boolean indicating whether or not the node contains such a fold.  When
3711  *      it is true, the caller sets a flag that later causes the optimizer in
3712  *      this file to not set values for the floating and fixed string lengths,
3713  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3714  *      assumption.  Thus, there is no optimization based on string lengths for
3715  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3716  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3717  *      assumption is wrong only in these cases is that all other non-UTF-8
3718  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3719  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3720  *      EXACTF nodes because we don't know at compile time if it actually
3721  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3722  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3723  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3724  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3725  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3726  *      string would require the pattern to be forced into UTF-8, the overhead
3727  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3728  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3729  *      locale.)
3730  *
3731  *      Similarly, the code that generates tries doesn't currently handle
3732  *      not-already-folded multi-char folds, and it looks like a pain to change
3733  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3734  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3735  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3736  *      using /iaa matching will be doing so almost entirely with ASCII
3737  *      strings, so this should rarely be encountered in practice */
3738
3739 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3740     if (PL_regkind[OP(scan)] == EXACT) \
3741         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3742
3743 STATIC U32
3744 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3745                    UV *min_subtract, bool *unfolded_multi_char,
3746                    U32 flags,regnode *val, U32 depth)
3747 {
3748     /* Merge several consecutive EXACTish nodes into one. */
3749     regnode *n = regnext(scan);
3750     U32 stringok = 1;
3751     regnode *next = scan + NODE_SZ_STR(scan);
3752     U32 merged = 0;
3753     U32 stopnow = 0;
3754 #ifdef DEBUGGING
3755     regnode *stop = scan;
3756     GET_RE_DEBUG_FLAGS_DECL;
3757 #else
3758     PERL_UNUSED_ARG(depth);
3759 #endif
3760
3761     PERL_ARGS_ASSERT_JOIN_EXACT;
3762 #ifndef EXPERIMENTAL_INPLACESCAN
3763     PERL_UNUSED_ARG(flags);
3764     PERL_UNUSED_ARG(val);
3765 #endif
3766     DEBUG_PEEP("join",scan,depth);
3767
3768     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3769      * EXACT ones that are mergeable to the current one. */
3770     while (n
3771            && (PL_regkind[OP(n)] == NOTHING
3772                || (stringok && OP(n) == OP(scan)))
3773            && NEXT_OFF(n)
3774            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3775     {
3776
3777         if (OP(n) == TAIL || n > next)
3778             stringok = 0;
3779         if (PL_regkind[OP(n)] == NOTHING) {
3780             DEBUG_PEEP("skip:",n,depth);
3781             NEXT_OFF(scan) += NEXT_OFF(n);
3782             next = n + NODE_STEP_REGNODE;
3783 #ifdef DEBUGGING
3784             if (stringok)
3785                 stop = n;
3786 #endif
3787             n = regnext(n);
3788         }
3789         else if (stringok) {
3790             const unsigned int oldl = STR_LEN(scan);
3791             regnode * const nnext = regnext(n);
3792
3793             /* XXX I (khw) kind of doubt that this works on platforms (should
3794              * Perl ever run on one) where U8_MAX is above 255 because of lots
3795              * of other assumptions */
3796             /* Don't join if the sum can't fit into a single node */
3797             if (oldl + STR_LEN(n) > U8_MAX)
3798                 break;
3799
3800             DEBUG_PEEP("merg",n,depth);
3801             merged++;
3802
3803             NEXT_OFF(scan) += NEXT_OFF(n);
3804             STR_LEN(scan) += STR_LEN(n);
3805             next = n + NODE_SZ_STR(n);
3806             /* Now we can overwrite *n : */
3807             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3808 #ifdef DEBUGGING
3809             stop = next - 1;
3810 #endif
3811             n = nnext;
3812             if (stopnow) break;
3813         }
3814
3815 #ifdef EXPERIMENTAL_INPLACESCAN
3816         if (flags && !NEXT_OFF(n)) {
3817             DEBUG_PEEP("atch", val, depth);
3818             if (reg_off_by_arg[OP(n)]) {
3819                 ARG_SET(n, val - n);
3820             }
3821             else {
3822                 NEXT_OFF(n) = val - n;
3823             }
3824             stopnow = 1;
3825         }
3826 #endif
3827     }
3828
3829     *min_subtract = 0;
3830     *unfolded_multi_char = FALSE;
3831
3832     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3833      * can now analyze for sequences of problematic code points.  (Prior to
3834      * this final joining, sequences could have been split over boundaries, and
3835      * hence missed).  The sequences only happen in folding, hence for any
3836      * non-EXACT EXACTish node */
3837     if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3838         U8* s0 = (U8*) STRING(scan);
3839         U8* s = s0;
3840         U8* s_end = s0 + STR_LEN(scan);
3841
3842         int total_count_delta = 0;  /* Total delta number of characters that
3843                                        multi-char folds expand to */
3844
3845         /* One pass is made over the node's string looking for all the
3846          * possibilities.  To avoid some tests in the loop, there are two main
3847          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3848          * non-UTF-8 */
3849         if (UTF) {
3850             U8* folded = NULL;
3851
3852             if (OP(scan) == EXACTFL) {
3853                 U8 *d;
3854
3855                 /* An EXACTFL node would already have been changed to another
3856                  * node type unless there is at least one character in it that
3857                  * is problematic; likely a character whose fold definition
3858                  * won't be known until runtime, and so has yet to be folded.
3859                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3860                  * to handle the UTF-8 case, we need to create a temporary
3861                  * folded copy using UTF-8 locale rules in order to analyze it.
3862                  * This is because our macros that look to see if a sequence is
3863                  * a multi-char fold assume everything is folded (otherwise the
3864                  * tests in those macros would be too complicated and slow).
3865                  * Note that here, the non-problematic folds will have already
3866                  * been done, so we can just copy such characters.  We actually
3867                  * don't completely fold the EXACTFL string.  We skip the
3868                  * unfolded multi-char folds, as that would just create work
3869                  * below to figure out the size they already are */
3870
3871                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3872                 d = folded;
3873                 while (s < s_end) {
3874                     STRLEN s_len = UTF8SKIP(s);
3875                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3876                         Copy(s, d, s_len, U8);
3877                         d += s_len;
3878                     }
3879                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3880                         *unfolded_multi_char = TRUE;
3881                         Copy(s, d, s_len, U8);
3882                         d += s_len;
3883                     }
3884                     else if (isASCII(*s)) {
3885                         *(d++) = toFOLD(*s);
3886                     }
3887                     else {
3888                         STRLEN len;
3889                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3890                         d += len;
3891                     }
3892                     s += s_len;
3893                 }
3894
3895                 /* Point the remainder of the routine to look at our temporary
3896                  * folded copy */
3897                 s = folded;
3898                 s_end = d;
3899             } /* End of creating folded copy of EXACTFL string */
3900
3901             /* Examine the string for a multi-character fold sequence.  UTF-8
3902              * patterns have all characters pre-folded by the time this code is
3903              * executed */
3904             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3905                                      length sequence we are looking for is 2 */
3906             {
3907                 int count = 0;  /* How many characters in a multi-char fold */
3908                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3909                 if (! len) {    /* Not a multi-char fold: get next char */
3910                     s += UTF8SKIP(s);
3911                     continue;
3912                 }
3913
3914                 /* Nodes with 'ss' require special handling, except for
3915                  * EXACTFA-ish for which there is no multi-char fold to this */
3916                 if (len == 2 && *s == 's' && *(s+1) == 's'
3917                     && OP(scan) != EXACTFA
3918                     && OP(scan) != EXACTFA_NO_TRIE)
3919                 {
3920                     count = 2;
3921                     if (OP(scan) != EXACTFL) {
3922                         OP(scan) = EXACTFU_SS;
3923                     }
3924                     s += 2;
3925                 }
3926                 else { /* Here is a generic multi-char fold. */
3927                     U8* multi_end  = s + len;
3928
3929                     /* Count how many characters are in it.  In the case of
3930                      * /aa, no folds which contain ASCII code points are
3931                      * allowed, so check for those, and skip if found. */
3932                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3933                         count = utf8_length(s, multi_end);
3934                         s = multi_end;
3935                     }
3936                     else {
3937                         while (s < multi_end) {
3938                             if (isASCII(*s)) {
3939                                 s++;
3940                                 goto next_iteration;
3941                             }
3942                             else {
3943                                 s += UTF8SKIP(s);
3944                             }
3945                             count++;
3946                         }
3947                     }
3948                 }
3949
3950                 /* The delta is how long the sequence is minus 1 (1 is how long
3951                  * the character that folds to the sequence is) */
3952                 total_count_delta += count - 1;
3953               next_iteration: ;
3954             }
3955
3956             /* We created a temporary folded copy of the string in EXACTFL
3957              * nodes.  Therefore we need to be sure it doesn't go below zero,
3958              * as the real string could be shorter */
3959             if (OP(scan) == EXACTFL) {
3960                 int total_chars = utf8_length((U8*) STRING(scan),
3961                                            (U8*) STRING(scan) + STR_LEN(scan));
3962                 if (total_count_delta > total_chars) {
3963                     total_count_delta = total_chars;
3964                 }
3965             }
3966
3967             *min_subtract += total_count_delta;
3968             Safefree(folded);
3969         }
3970         else if (OP(scan) == EXACTFA) {
3971
3972             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3973              * fold to the ASCII range (and there are no existing ones in the
3974              * upper latin1 range).  But, as outlined in the comments preceding
3975              * this function, we need to flag any occurrences of the sharp s.
3976              * This character forbids trie formation (because of added
3977              * complexity) */
3978 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
3979    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
3980                                       || UNICODE_DOT_DOT_VERSION > 0)
3981             while (s < s_end) {
3982                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3983                     OP(scan) = EXACTFA_NO_TRIE;
3984                     *unfolded_multi_char = TRUE;
3985                     break;
3986                 }
3987                 s++;
3988             }
3989         }
3990         else {
3991
3992             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3993              * folds that are all Latin1.  As explained in the comments
3994              * preceding this function, we look also for the sharp s in EXACTF
3995              * and EXACTFL nodes; it can be in the final position.  Otherwise
3996              * we can stop looking 1 byte earlier because have to find at least
3997              * two characters for a multi-fold */
3998             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3999                               ? s_end
4000                               : s_end -1;
4001
4002             while (s < upper) {
4003                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4004                 if (! len) {    /* Not a multi-char fold. */
4005                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4006                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4007                     {
4008                         *unfolded_multi_char = TRUE;
4009                     }
4010                     s++;
4011                     continue;
4012                 }
4013
4014                 if (len == 2
4015                     && isALPHA_FOLD_EQ(*s, 's')
4016                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4017                 {
4018
4019                     /* EXACTF nodes need to know that the minimum length
4020                      * changed so that a sharp s in the string can match this
4021                      * ss in the pattern, but they remain EXACTF nodes, as they
4022                      * won't match this unless the target string is is UTF-8,
4023                      * which we don't know until runtime.  EXACTFL nodes can't
4024                      * transform into EXACTFU nodes */
4025                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4026                         OP(scan) = EXACTFU_SS;
4027                     }
4028                 }
4029
4030                 *min_subtract += len - 1;
4031                 s += len;
4032             }
4033 #endif
4034         }
4035     }
4036
4037 #ifdef DEBUGGING
4038     /* Allow dumping but overwriting the collection of skipped
4039      * ops and/or strings with fake optimized ops */
4040     n = scan + NODE_SZ_STR(scan);
4041     while (n <= stop) {
4042         OP(n) = OPTIMIZED;
4043         FLAGS(n) = 0;
4044         NEXT_OFF(n) = 0;
4045         n++;
4046     }
4047 #endif
4048     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
4049     return stopnow;
4050 }
4051
4052 /* REx optimizer.  Converts nodes into quicker variants "in place".
4053    Finds fixed substrings.  */
4054
4055 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4056    to the position after last scanned or to NULL. */
4057
4058 #define INIT_AND_WITHP \
4059     assert(!and_withp); \
4060     Newx(and_withp,1, regnode_ssc); \
4061     SAVEFREEPV(and_withp)
4062
4063
4064 static void
4065 S_unwind_scan_frames(pTHX_ const void *p)
4066 {
4067     scan_frame *f= (scan_frame *)p;
4068     do {
4069         scan_frame *n= f->next_frame;
4070         Safefree(f);
4071         f= n;
4072     } while (f);
4073 }
4074
4075
4076 STATIC SSize_t
4077 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4078                         SSize_t *minlenp, SSize_t *deltap,
4079                         regnode *last,
4080                         scan_data_t *data,
4081                         I32 stopparen,
4082                         U32 recursed_depth,
4083                         regnode_ssc *and_withp,
4084                         U32 flags, U32 depth)
4085                         /* scanp: Start here (read-write). */
4086                         /* deltap: Write maxlen-minlen here. */
4087                         /* last: Stop before this one. */
4088                         /* data: string data about the pattern */
4089                         /* stopparen: treat close N as END */
4090                         /* recursed: which subroutines have we recursed into */
4091                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4092 {
4093     /* There must be at least this number of characters to match */
4094     SSize_t min = 0;
4095     I32 pars = 0, code;
4096     regnode *scan = *scanp, *next;
4097     SSize_t delta = 0;
4098     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4099     int is_inf_internal = 0;            /* The studied chunk is infinite */
4100     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4101     scan_data_t data_fake;
4102     SV *re_trie_maxbuff = NULL;
4103     regnode *first_non_open = scan;
4104     SSize_t stopmin = SSize_t_MAX;
4105     scan_frame *frame = NULL;
4106     GET_RE_DEBUG_FLAGS_DECL;
4107
4108     PERL_ARGS_ASSERT_STUDY_CHUNK;
4109     RExC_study_started= 1;
4110
4111
4112     if ( depth == 0 ) {
4113         while (first_non_open && OP(first_non_open) == OPEN)
4114             first_non_open=regnext(first_non_open);
4115     }
4116
4117
4118   fake_study_recurse:
4119     DEBUG_r(
4120         RExC_study_chunk_recursed_count++;
4121     );
4122     DEBUG_OPTIMISE_MORE_r(
4123     {
4124         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4125             depth, (long)stopparen,
4126             (unsigned long)RExC_study_chunk_recursed_count,
4127             (unsigned long)depth, (unsigned long)recursed_depth,
4128             scan,
4129             last);
4130         if (recursed_depth) {
4131             U32 i;
4132             U32 j;
4133             for ( j = 0 ; j < recursed_depth ; j++ ) {
4134                 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
4135                     if (
4136                         PAREN_TEST(RExC_study_chunk_recursed +
4137                                    ( j * RExC_study_chunk_recursed_bytes), i )
4138                         && (
4139                             !j ||
4140                             !PAREN_TEST(RExC_study_chunk_recursed +
4141                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4142                         )
4143                     ) {
4144                         Perl_re_printf( aTHX_ " %d",(int)i);
4145                         break;
4146                     }
4147                 }
4148                 if ( j + 1 < recursed_depth ) {
4149                     Perl_re_printf( aTHX_  ",");
4150                 }
4151             }
4152         }
4153         Perl_re_printf( aTHX_ "\n");
4154     }
4155     );
4156     while ( scan && OP(scan) != END && scan < last ){
4157         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4158                                    node length to get a real minimum (because
4159                                    the folded version may be shorter) */
4160         bool unfolded_multi_char = FALSE;
4161         /* Peephole optimizer: */
4162         DEBUG_STUDYDATA("Peep:", data, depth);
4163         DEBUG_PEEP("Peep", scan, depth);
4164
4165
4166         /* The reason we do this here is that we need to deal with things like
4167          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4168          * parsing code, as each (?:..) is handled by a different invocation of
4169          * reg() -- Yves
4170          */
4171         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4172
4173         /* Follow the next-chain of the current node and optimize
4174            away all the NOTHINGs from it.  */
4175         if (OP(scan) != CURLYX) {
4176             const int max = (reg_off_by_arg[OP(scan)]
4177                        ? I32_MAX
4178                        /* I32 may be smaller than U16 on CRAYs! */
4179                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4180             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4181             int noff;
4182             regnode *n = scan;
4183
4184             /* Skip NOTHING and LONGJMP. */
4185             while ((n = regnext(n))
4186                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4187                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4188                    && off + noff < max)
4189                 off += noff;
4190             if (reg_off_by_arg[OP(scan)])
4191                 ARG(scan) = off;
4192             else
4193                 NEXT_OFF(scan) = off;
4194         }
4195
4196         /* The principal pseudo-switch.  Cannot be a switch, since we
4197            look into several different things.  */
4198         if ( OP(scan) == DEFINEP ) {
4199             SSize_t minlen = 0;
4200             SSize_t deltanext = 0;
4201             SSize_t fake_last_close = 0;
4202             I32 f = SCF_IN_DEFINE;
4203
4204             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4205             scan = regnext(scan);
4206             assert( OP(scan) == IFTHEN );
4207             DEBUG_PEEP("expect IFTHEN", scan, depth);
4208
4209             data_fake.last_closep= &fake_last_close;
4210             minlen = *minlenp;
4211             next = regnext(scan);
4212             scan = NEXTOPER(NEXTOPER(scan));
4213             DEBUG_PEEP("scan", scan, depth);
4214             DEBUG_PEEP("next", next, depth);
4215
4216             /* we suppose the run is continuous, last=next...
4217              * NOTE we dont use the return here! */
4218             (void)study_chunk(pRExC_state, &scan, &minlen,
4219                               &deltanext, next, &data_fake, stopparen,
4220                               recursed_depth, NULL, f, depth+1);
4221
4222             scan = next;
4223         } else
4224         if (
4225             OP(scan) == BRANCH  ||
4226             OP(scan) == BRANCHJ ||
4227             OP(scan) == IFTHEN
4228         ) {
4229             next = regnext(scan);
4230             code = OP(scan);
4231
4232             /* The op(next)==code check below is to see if we
4233              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4234              * IFTHEN is special as it might not appear in pairs.
4235              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4236              * we dont handle it cleanly. */
4237             if (OP(next) == code || code == IFTHEN) {
4238                 /* NOTE - There is similar code to this block below for
4239                  * handling TRIE nodes on a re-study.  If you change stuff here
4240                  * check there too. */
4241                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4242                 regnode_ssc accum;
4243                 regnode * const startbranch=scan;
4244
4245                 if (flags & SCF_DO_SUBSTR) {
4246                     /* Cannot merge strings after this. */
4247                     scan_commit(pRExC_state, data, minlenp, is_inf);
4248                 }
4249
4250                 if (flags & SCF_DO_STCLASS)
4251                     ssc_init_zero(pRExC_state, &accum);
4252
4253                 while (OP(scan) == code) {
4254                     SSize_t deltanext, minnext, fake;
4255                     I32 f = 0;
4256                     regnode_ssc this_class;
4257
4258                     DEBUG_PEEP("Branch", scan, depth);
4259
4260                     num++;
4261                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4262                     if (data) {
4263                         data_fake.whilem_c = data->whilem_c;
4264                         data_fake.last_closep = data->last_closep;
4265                     }
4266                     else
4267                         data_fake.last_closep = &fake;
4268
4269                     data_fake.pos_delta = delta;
4270                     next = regnext(scan);
4271
4272                     scan = NEXTOPER(scan); /* everything */
4273                     if (code != BRANCH)    /* everything but BRANCH */
4274                         scan = NEXTOPER(scan);
4275
4276                     if (flags & SCF_DO_STCLASS) {
4277                         ssc_init(pRExC_state, &this_class);
4278                         data_fake.start_class = &this_class;
4279                         f = SCF_DO_STCLASS_AND;
4280                     }
4281                     if (flags & SCF_WHILEM_VISITED_POS)
4282                         f |= SCF_WHILEM_VISITED_POS;
4283
4284                     /* we suppose the run is continuous, last=next...*/
4285                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4286                                       &deltanext, next, &data_fake, stopparen,
4287                                       recursed_depth, NULL, f,depth+1);
4288
4289                     if (min1 > minnext)
4290                         min1 = minnext;
4291                     if (deltanext == SSize_t_MAX) {
4292                         is_inf = is_inf_internal = 1;
4293                         max1 = SSize_t_MAX;
4294                     } else if (max1 < minnext + deltanext)
4295                         max1 = minnext + deltanext;
4296                     scan = next;
4297                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4298                         pars++;
4299                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4300                         if ( stopmin > minnext)
4301                             stopmin = min + min1;
4302                         flags &= ~SCF_DO_SUBSTR;
4303                         if (data)
4304                             data->flags |= SCF_SEEN_ACCEPT;
4305                     }
4306                     if (data) {
4307                         if (data_fake.flags & SF_HAS_EVAL)
4308                             data->flags |= SF_HAS_EVAL;
4309                         data->whilem_c = data_fake.whilem_c;
4310                     }
4311                     if (flags & SCF_DO_STCLASS)
4312                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4313                 }
4314                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4315                     min1 = 0;
4316                 if (flags & SCF_DO_SUBSTR) {
4317                     data->pos_min += min1;
4318                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4319                         data->pos_delta = SSize_t_MAX;
4320                     else
4321                         data->pos_delta += max1 - min1;
4322                     if (max1 != min1 || is_inf)
4323                         data->longest = &(data->longest_float);
4324                 }
4325                 min += min1;
4326                 if (delta == SSize_t_MAX
4327                  || SSize_t_MAX - delta - (max1 - min1) < 0)
4328                     delta = SSize_t_MAX;
4329                 else
4330                     delta += max1 - min1;
4331                 if (flags & SCF_DO_STCLASS_OR) {
4332                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4333                     if (min1) {
4334                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4335                         flags &= ~SCF_DO_STCLASS;
4336                     }
4337                 }
4338                 else if (flags & SCF_DO_STCLASS_AND) {
4339                     if (min1) {
4340                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4341                         flags &= ~SCF_DO_STCLASS;
4342                     }
4343                     else {
4344                         /* Switch to OR mode: cache the old value of
4345                          * data->start_class */
4346                         INIT_AND_WITHP;
4347                         StructCopy(data->start_class, and_withp, regnode_ssc);
4348                         flags &= ~SCF_DO_STCLASS_AND;
4349                         StructCopy(&accum, data->start_class, regnode_ssc);
4350                         flags |= SCF_DO_STCLASS_OR;
4351                     }
4352                 }
4353
4354                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4355                         OP( startbranch ) == BRANCH )
4356                 {
4357                 /* demq.
4358
4359                    Assuming this was/is a branch we are dealing with: 'scan'
4360                    now points at the item that follows the branch sequence,
4361                    whatever it is. We now start at the beginning of the
4362                    sequence and look for subsequences of
4363
4364                    BRANCH->EXACT=>x1
4365                    BRANCH->EXACT=>x2
4366                    tail
4367
4368                    which would be constructed from a pattern like
4369                    /A|LIST|OF|WORDS/
4370
4371                    If we can find such a subsequence we need to turn the first
4372                    element into a trie and then add the subsequent branch exact
4373                    strings to the trie.
4374
4375                    We have two cases
4376
4377                      1. patterns where the whole set of branches can be
4378                         converted.
4379
4380                      2. patterns where only a subset can be converted.
4381
4382                    In case 1 we can replace the whole set with a single regop
4383                    for the trie. In case 2 we need to keep the start and end
4384                    branches so
4385
4386                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4387                      becomes BRANCH TRIE; BRANCH X;
4388
4389                   There is an additional case, that being where there is a
4390                   common prefix, which gets split out into an EXACT like node
4391                   preceding the TRIE node.
4392
4393                   If x(1..n)==tail then we can do a simple trie, if not we make
4394                   a "jump" trie, such that when we match the appropriate word
4395                   we "jump" to the appropriate tail node. Essentially we turn
4396                   a nested if into a case structure of sorts.
4397
4398                 */
4399
4400                     int made=0;
4401                     if (!re_trie_maxbuff) {
4402                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4403                         if (!SvIOK(re_trie_maxbuff))
4404                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4405                     }
4406                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4407                         regnode *cur;
4408                         regnode *first = (regnode *)NULL;
4409                         regnode *last = (regnode *)NULL;
4410                         regnode *tail = scan;
4411                         U8 trietype = 0;
4412                         U32 count=0;
4413
4414                         /* var tail is used because there may be a TAIL
4415                            regop in the way. Ie, the exacts will point to the
4416                            thing following the TAIL, but the last branch will
4417                            point at the TAIL. So we advance tail. If we
4418                            have nested (?:) we may have to move through several
4419                            tails.
4420                          */
4421
4422                         while ( OP( tail ) == TAIL ) {
4423                             /* this is the TAIL generated by (?:) */
4424                             tail = regnext( tail );
4425                         }
4426
4427
4428                         DEBUG_TRIE_COMPILE_r({
4429                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4430                             Perl_re_indentf( aTHX_  "%s %"UVuf":%s\n",
4431                               depth+1,
4432                               "Looking for TRIE'able sequences. Tail node is ",
4433                               (UV)(tail - RExC_emit_start),
4434                               SvPV_nolen_const( RExC_mysv )
4435                             );
4436                         });
4437
4438                         /*
4439
4440                             Step through the branches
4441                                 cur represents each branch,
4442                                 noper is the first thing to be matched as part
4443                                       of that branch
4444                                 noper_next is the regnext() of that node.
4445
4446                             We normally handle a case like this
4447                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4448                             support building with NOJUMPTRIE, which restricts
4449                             the trie logic to structures like /FOO|BAR/.
4450
4451                             If noper is a trieable nodetype then the branch is
4452                             a possible optimization target. If we are building
4453                             under NOJUMPTRIE then we require that noper_next is
4454                             the same as scan (our current position in the regex
4455                             program).
4456
4457                             Once we have two or more consecutive such branches
4458                             we can create a trie of the EXACT's contents and
4459                             stitch it in place into the program.
4460
4461                             If the sequence represents all of the branches in
4462                             the alternation we replace the entire thing with a
4463                             single TRIE node.
4464
4465                             Otherwise when it is a subsequence we need to
4466                             stitch it in place and replace only the relevant
4467                             branches. This means the first branch has to remain
4468                             as it is used by the alternation logic, and its
4469                             next pointer, and needs to be repointed at the item
4470                             on the branch chain following the last branch we
4471                             have optimized away.
4472
4473                             This could be either a BRANCH, in which case the
4474                             subsequence is internal, or it could be the item
4475                             following the branch sequence in which case the
4476                             subsequence is at the end (which does not
4477                             necessarily mean the first node is the start of the
4478                             alternation).
4479
4480                             TRIE_TYPE(X) is a define which maps the optype to a
4481                             trietype.
4482
4483                                 optype          |  trietype
4484                                 ----------------+-----------
4485                                 NOTHING         | NOTHING
4486                                 EXACT           | EXACT
4487                                 EXACTFU         | EXACTFU
4488                                 EXACTFU_SS      | EXACTFU
4489                                 EXACTFA         | EXACTFA
4490                                 EXACTL          | EXACTL
4491                                 EXACTFLU8       | EXACTFLU8
4492
4493
4494                         */
4495 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4496                        ? NOTHING                                            \
4497                        : ( EXACT == (X) )                                   \
4498                          ? EXACT                                            \
4499                          : ( EXACTFU == (X) || EXACTFU_SS == (X) )          \
4500                            ? EXACTFU                                        \
4501                            : ( EXACTFA == (X) )                             \
4502                              ? EXACTFA                                      \
4503                              : ( EXACTL == (X) )                            \
4504                                ? EXACTL                                     \
4505                                : ( EXACTFLU8 == (X) )                        \
4506                                  ? EXACTFLU8                                 \
4507                                  : 0 )
4508
4509                         /* dont use tail as the end marker for this traverse */
4510                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4511                             regnode * const noper = NEXTOPER( cur );
4512                             U8 noper_type = OP( noper );
4513                             U8 noper_trietype = TRIE_TYPE( noper_type );
4514 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4515                             regnode * const noper_next = regnext( noper );
4516                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4517                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4518 #endif
4519
4520                             DEBUG_TRIE_COMPILE_r({
4521                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4522                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
4523                                    depth+1,
4524                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4525
4526                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4527                                 Perl_re_printf( aTHX_  " -> %d:%s",
4528                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4529
4530                                 if ( noper_next ) {
4531                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4532                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4533                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4534                                 }
4535                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4536                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4537                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4538                                 );
4539                             });
4540
4541                             /* Is noper a trieable nodetype that can be merged
4542                              * with the current trie (if there is one)? */
4543                             if ( noper_trietype
4544                                   &&
4545                                   (
4546                                         ( noper_trietype == NOTHING )
4547                                         || ( trietype == NOTHING )
4548                                         || ( trietype == noper_trietype )
4549                                   )
4550 #ifdef NOJUMPTRIE
4551                                   && noper_next >= tail
4552 #endif
4553                                   && count < U16_MAX)
4554                             {
4555                                 /* Handle mergable triable node Either we are
4556                                  * the first node in a new trieable sequence,
4557                                  * in which case we do some bookkeeping,
4558                                  * otherwise we update the end pointer. */
4559                                 if ( !first ) {
4560                                     first = cur;
4561                                     if ( noper_trietype == NOTHING ) {
4562 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4563                                         regnode * const noper_next = regnext( noper );
4564                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4565                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4566 #endif
4567
4568                                         if ( noper_next_trietype ) {
4569                                             trietype = noper_next_trietype;
4570                                         } else if (noper_next_type)  {
4571                                             /* a NOTHING regop is 1 regop wide.
4572                                              * We need at least two for a trie
4573                                              * so we can't merge this in */
4574                                             first = NULL;
4575                                         }
4576                                     } else {
4577                                         trietype = noper_trietype;
4578                                     }
4579                                 } else {
4580                                     if ( trietype == NOTHING )
4581                                         trietype = noper_trietype;
4582                                     last = cur;
4583                                 }
4584                                 if (first)
4585                                     count++;
4586                             } /* end handle mergable triable node */
4587                             else {
4588                                 /* handle unmergable node -
4589                                  * noper may either be a triable node which can
4590                                  * not be tried together with the current trie,
4591                                  * or a non triable node */
4592                                 if ( last ) {
4593                                     /* If last is set and trietype is not
4594                                      * NOTHING then we have found at least two
4595                                      * triable branch sequences in a row of a
4596                                      * similar trietype so we can turn them
4597                                      * into a trie. If/when we allow NOTHING to
4598                                      * start a trie sequence this condition
4599                                      * will be required, and it isn't expensive
4600                                      * so we leave it in for now. */
4601                                     if ( trietype && trietype != NOTHING )
4602                                         make_trie( pRExC_state,
4603                                                 startbranch, first, cur, tail,
4604                                                 count, trietype, depth+1 );
4605                                     last = NULL; /* note: we clear/update
4606                                                     first, trietype etc below,
4607                                                     so we dont do it here */
4608                                 }
4609                                 if ( noper_trietype
4610 #ifdef NOJUMPTRIE
4611                                      && noper_next >= tail
4612 #endif
4613                                 ){
4614                                     /* noper is triable, so we can start a new
4615                                      * trie sequence */
4616                                     count = 1;
4617                                     first = cur;
4618                                     trietype = noper_trietype;
4619                                 } else if (first) {
4620                                     /* if we already saw a first but the
4621                                      * current node is not triable then we have
4622                                      * to reset the first information. */
4623                                     count = 0;
4624                                     first = NULL;
4625                                     trietype = 0;
4626                                 }
4627                             } /* end handle unmergable node */
4628                         } /* loop over branches */
4629                         DEBUG_TRIE_COMPILE_r({
4630                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4631                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
4632                               depth+1, SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4633                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
4634                                REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4635                                PL_reg_name[trietype]
4636                             );
4637
4638                         });
4639                         if ( last && trietype ) {
4640                             if ( trietype != NOTHING ) {
4641                                 /* the last branch of the sequence was part of
4642                                  * a trie, so we have to construct it here
4643                                  * outside of the loop */
4644                                 made= make_trie( pRExC_state, startbranch,
4645                                                  first, scan, tail, count,
4646                                                  trietype, depth+1 );
4647 #ifdef TRIE_STUDY_OPT
4648                                 if ( ((made == MADE_EXACT_TRIE &&
4649                                      startbranch == first)
4650                                      || ( first_non_open == first )) &&
4651                                      depth==0 ) {
4652                                     flags |= SCF_TRIE_RESTUDY;
4653                                     if ( startbranch == first
4654                                          && scan >= tail )
4655                                     {
4656                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4657                                     }
4658                                 }
4659 #endif
4660                             } else {
4661                                 /* at this point we know whatever we have is a
4662                                  * NOTHING sequence/branch AND if 'startbranch'
4663                                  * is 'first' then we can turn the whole thing
4664                                  * into a NOTHING
4665                                  */
4666                                 if ( startbranch == first ) {
4667                                     regnode *opt;
4668                                     /* the entire thing is a NOTHING sequence,
4669                                      * something like this: (?:|) So we can
4670                                      * turn it into a plain NOTHING op. */
4671                                     DEBUG_TRIE_COMPILE_r({
4672                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4673                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
4674                                           depth+1,
4675                                           SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4676
4677                                     });
4678                                     OP(startbranch)= NOTHING;
4679                                     NEXT_OFF(startbranch)= tail - startbranch;
4680                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4681                                         OP(opt)= OPTIMIZED;
4682                                 }
4683                             }
4684                         } /* end if ( last) */
4685                     } /* TRIE_MAXBUF is non zero */
4686
4687                 } /* do trie */
4688
4689             }
4690             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4691                 scan = NEXTOPER(NEXTOPER(scan));
4692             } else                      /* single branch is optimized. */
4693                 scan = NEXTOPER(scan);
4694             continue;
4695         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
4696             I32 paren = 0;
4697             regnode *start = NULL;
4698             regnode *end = NULL;
4699             U32 my_recursed_depth= recursed_depth;
4700
4701             if (OP(scan) != SUSPEND) { /* GOSUB */
4702                 /* Do setup, note this code has side effects beyond
4703                  * the rest of this block. Specifically setting
4704                  * RExC_recurse[] must happen at least once during
4705                  * study_chunk(). */
4706                 paren = ARG(scan);
4707                 RExC_recurse[ARG2L(scan)] = scan;
4708                 start = RExC_open_parens[paren];
4709                 end   = RExC_close_parens[paren];
4710
4711                 /* NOTE we MUST always execute the above code, even
4712                  * if we do nothing with a GOSUB */
4713                 if (
4714                     ( flags & SCF_IN_DEFINE )
4715                     ||
4716                     (
4717                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4718                         &&
4719                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4720                     )
4721                 ) {
4722                     /* no need to do anything here if we are in a define. */
4723                     /* or we are after some kind of infinite construct
4724                      * so we can skip recursing into this item.
4725                      * Since it is infinite we will not change the maxlen
4726                      * or delta, and if we miss something that might raise
4727                      * the minlen it will merely pessimise a little.
4728                      *
4729                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4730                      * might result in a minlen of 1 and not of 4,
4731                      * but this doesn't make us mismatch, just try a bit
4732                      * harder than we should.
4733                      * */
4734                     scan= regnext(scan);
4735                     continue;
4736                 }
4737
4738                 if (
4739                     !recursed_depth
4740                     ||
4741                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4742                 ) {
4743                     /* it is quite possible that there are more efficient ways
4744                      * to do this. We maintain a bitmap per level of recursion
4745                      * of which patterns we have entered so we can detect if a
4746                      * pattern creates a possible infinite loop. When we
4747                      * recurse down a level we copy the previous levels bitmap
4748                      * down. When we are at recursion level 0 we zero the top
4749                      * level bitmap. It would be nice to implement a different
4750                      * more efficient way of doing this. In particular the top
4751                      * level bitmap may be unnecessary.
4752                      */
4753                     if (!recursed_depth) {
4754                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4755                     } else {
4756                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4757                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4758                              RExC_study_chunk_recursed_bytes, U8);
4759                     }
4760                     /* we havent recursed into this paren yet, so recurse into it */
4761                     DEBUG_STUDYDATA("gosub-set:", data,depth);
4762                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4763                     my_recursed_depth= recursed_depth + 1;
4764                 } else {
4765                     DEBUG_STUDYDATA("gosub-inf:", data,depth);
4766                     /* some form of infinite recursion, assume infinite length
4767                      * */
4768                     if (flags & SCF_DO_SUBSTR) {
4769                         scan_commit(pRExC_state, data, minlenp, is_inf);
4770                         data->longest = &(data->longest_float);
4771                     }
4772                     is_inf = is_inf_internal = 1;
4773                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4774                         ssc_anything(data->start_class);
4775                     flags &= ~SCF_DO_STCLASS;
4776
4777                     start= NULL; /* reset start so we dont recurse later on. */
4778                 }
4779             } else {
4780                 paren = stopparen;
4781                 start = scan + 2;
4782                 end = regnext(scan);
4783             }
4784             if (start) {
4785                 scan_frame *newframe;
4786                 assert(end);
4787                 if (!RExC_frame_last) {
4788                     Newxz(newframe, 1, scan_frame);
4789                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4790                     RExC_frame_head= newframe;
4791                     RExC_frame_count++;
4792                 } else if (!RExC_frame_last->next_frame) {
4793                     Newxz(newframe,1,scan_frame);
4794                     RExC_frame_last->next_frame= newframe;
4795                     newframe->prev_frame= RExC_frame_last;
4796                     RExC_frame_count++;
4797                 } else {
4798                     newframe= RExC_frame_last->next_frame;
4799                 }
4800                 RExC_frame_last= newframe;
4801
4802                 newframe->next_regnode = regnext(scan);
4803                 newframe->last_regnode = last;
4804                 newframe->stopparen = stopparen;
4805                 newframe->prev_recursed_depth = recursed_depth;
4806                 newframe->this_prev_frame= frame;
4807
4808                 DEBUG_STUDYDATA("frame-new:",data,depth);
4809                 DEBUG_PEEP("fnew", scan, depth);
4810
4811                 frame = newframe;
4812                 scan =  start;
4813                 stopparen = paren;
4814                 last = end;
4815                 depth = depth + 1;
4816                 recursed_depth= my_recursed_depth;
4817
4818                 continue;
4819             }
4820         }
4821         else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4822             SSize_t l = STR_LEN(scan);
4823             UV uc;
4824             if (UTF) {
4825                 const U8 * const s = (U8*)STRING(scan);
4826                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4827                 l = utf8_length(s, s + l);
4828             } else {
4829                 uc = *((U8*)STRING(scan));
4830             }
4831             min += l;
4832             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4833                 /* The code below prefers earlier match for fixed
4834                    offset, later match for variable offset.  */
4835                 if (data->last_end == -1) { /* Update the start info. */
4836                     data->last_start_min = data->pos_min;
4837                     data->last_start_max = is_inf
4838                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4839                 }
4840                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4841                 if (UTF)
4842                     SvUTF8_on(data->last_found);
4843                 {
4844                     SV * const sv = data->last_found;
4845                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4846                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4847                     if (mg && mg->mg_len >= 0)
4848                         mg->mg_len += utf8_length((U8*)STRING(scan),
4849                                               (U8*)STRING(scan)+STR_LEN(scan));
4850                 }
4851                 data->last_end = data->pos_min + l;
4852                 data->pos_min += l; /* As in the first entry. */
4853                 data->flags &= ~SF_BEFORE_EOL;
4854             }
4855
4856             /* ANDing the code point leaves at most it, and not in locale, and
4857              * can't match null string */
4858             if (flags & SCF_DO_STCLASS_AND) {
4859                 ssc_cp_and(data->start_class, uc);
4860                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4861                 ssc_clear_locale(data->start_class);
4862             }
4863             else if (flags & SCF_DO_STCLASS_OR) {
4864                 ssc_add_cp(data->start_class, uc);
4865                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4866
4867                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4868                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4869             }
4870             flags &= ~SCF_DO_STCLASS;
4871         }
4872         else if (PL_regkind[OP(scan)] == EXACT) {
4873             /* But OP != EXACT!, so is EXACTFish */
4874             SSize_t l = STR_LEN(scan);
4875             const U8 * s = (U8*)STRING(scan);
4876
4877             /* Search for fixed substrings supports EXACT only. */
4878             if (flags & SCF_DO_SUBSTR) {
4879                 assert(data);
4880                 scan_commit(pRExC_state, data, minlenp, is_inf);
4881             }
4882             if (UTF) {
4883                 l = utf8_length(s, s + l);
4884             }
4885             if (unfolded_multi_char) {
4886                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4887             }
4888             min += l - min_subtract;
4889             assert (min >= 0);
4890             delta += min_subtract;
4891             if (flags & SCF_DO_SUBSTR) {
4892                 data->pos_min += l - min_subtract;
4893                 if (data->pos_min < 0) {
4894                     data->pos_min = 0;
4895                 }
4896                 data->pos_delta += min_subtract;
4897                 if (min_subtract) {
4898                     data->longest = &(data->longest_float);
4899                 }
4900             }
4901
4902             if (flags & SCF_DO_STCLASS) {
4903                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4904
4905                 assert(EXACTF_invlist);
4906                 if (flags & SCF_DO_STCLASS_AND) {
4907                     if (OP(scan) != EXACTFL)
4908                         ssc_clear_locale(data->start_class);
4909                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4910                     ANYOF_POSIXL_ZERO(data->start_class);
4911                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4912                 }
4913                 else {  /* SCF_DO_STCLASS_OR */
4914                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4915                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4916
4917                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4918                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4919                 }
4920                 flags &= ~SCF_DO_STCLASS;
4921                 SvREFCNT_dec(EXACTF_invlist);
4922             }
4923         }
4924         else if (REGNODE_VARIES(OP(scan))) {
4925             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4926             I32 fl = 0, f = flags;
4927             regnode * const oscan = scan;
4928             regnode_ssc this_class;
4929             regnode_ssc *oclass = NULL;
4930             I32 next_is_eval = 0;
4931
4932             switch (PL_regkind[OP(scan)]) {
4933             case WHILEM:                /* End of (?:...)* . */
4934                 scan = NEXTOPER(scan);
4935                 goto finish;
4936             case PLUS:
4937                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4938                     next = NEXTOPER(scan);
4939                     if (OP(next) == EXACT
4940                         || OP(next) == EXACTL
4941                         || (flags & SCF_DO_STCLASS))
4942                     {
4943                         mincount = 1;
4944                         maxcount = REG_INFTY;
4945                         next = regnext(scan);
4946                         scan = NEXTOPER(scan);
4947                         goto do_curly;
4948                     }
4949                 }
4950                 if (flags & SCF_DO_SUBSTR)
4951                     data->pos_min++;
4952                 min++;
4953                 /* FALLTHROUGH */
4954             case STAR:
4955                 if (flags & SCF_DO_STCLASS) {
4956                     mincount = 0;
4957                     maxcount = REG_INFTY;
4958                     next = regnext(scan);
4959                     scan = NEXTOPER(scan);
4960                     goto do_curly;
4961                 }
4962                 if (flags & SCF_DO_SUBSTR) {
4963                     scan_commit(pRExC_state, data, minlenp, is_inf);
4964                     /* Cannot extend fixed substrings */
4965                     data->longest = &(data->longest_float);
4966                 }
4967                 is_inf = is_inf_internal = 1;
4968                 scan = regnext(scan);
4969                 goto optimize_curly_tail;
4970             case CURLY:
4971                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4972                     && (scan->flags == stopparen))
4973                 {
4974                     mincount = 1;
4975                     maxcount = 1;
4976                 } else {
4977                     mincount = ARG1(scan);
4978                     maxcount = ARG2(scan);
4979                 }
4980                 next = regnext(scan);
4981                 if (OP(scan) == CURLYX) {
4982                     I32 lp = (data ? *(data->last_closep) : 0);
4983                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4984                 }
4985                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4986                 next_is_eval = (OP(scan) == EVAL);
4987               do_curly:
4988                 if (flags & SCF_DO_SUBSTR) {
4989                     if (mincount == 0)
4990                         scan_commit(pRExC_state, data, minlenp, is_inf);
4991                     /* Cannot extend fixed substrings */
4992                     pos_before = data->pos_min;
4993                 }
4994                 if (data) {
4995                     fl = data->flags;
4996                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4997                     if (is_inf)
4998                         data->flags |= SF_IS_INF;
4999                 }
5000                 if (flags & SCF_DO_STCLASS) {
5001                     ssc_init(pRExC_state, &this_class);
5002                     oclass = data->start_class;
5003                     data->start_class = &this_class;
5004                     f |= SCF_DO_STCLASS_AND;
5005                     f &= ~SCF_DO_STCLASS_OR;
5006                 }
5007                 /* Exclude from super-linear cache processing any {n,m}
5008                    regops for which the combination of input pos and regex
5009                    pos is not enough information to determine if a match
5010                    will be possible.
5011
5012                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
5013                    regex pos at the \s*, the prospects for a match depend not
5014                    only on the input position but also on how many (bar\s*)
5015                    repeats into the {4,8} we are. */
5016                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5017                     f &= ~SCF_WHILEM_VISITED_POS;
5018
5019                 /* This will finish on WHILEM, setting scan, or on NULL: */
5020                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5021                                   last, data, stopparen, recursed_depth, NULL,
5022                                   (mincount == 0
5023                                    ? (f & ~SCF_DO_SUBSTR)
5024                                    : f)
5025                                   ,depth+1);
5026
5027                 if (flags & SCF_DO_STCLASS)
5028                     data->start_class = oclass;
5029                 if (mincount == 0 || minnext == 0) {
5030                     if (flags & SCF_DO_STCLASS_OR) {
5031                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5032                     }
5033                     else if (flags & SCF_DO_STCLASS_AND) {
5034                         /* Switch to OR mode: cache the old value of
5035                          * data->start_class */
5036                         INIT_AND_WITHP;
5037                         StructCopy(data->start_class, and_withp, regnode_ssc);
5038                         flags &= ~SCF_DO_STCLASS_AND;
5039                         StructCopy(&this_class, data->start_class, regnode_ssc);
5040                         flags |= SCF_DO_STCLASS_OR;
5041                         ANYOF_FLAGS(data->start_class)
5042                                                 |= SSC_MATCHES_EMPTY_STRING;
5043                     }
5044                 } else {                /* Non-zero len */
5045                     if (flags & SCF_DO_STCLASS_OR) {
5046                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5047                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5048                     }
5049                     else if (flags & SCF_DO_STCLASS_AND)
5050                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5051                     flags &= ~SCF_DO_STCLASS;
5052                 }
5053                 if (!scan)              /* It was not CURLYX, but CURLY. */
5054                     scan = next;
5055                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
5056                     /* ? quantifier ok, except for (?{ ... }) */
5057                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
5058                     && (minnext == 0) && (deltanext == 0)
5059                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5060                     && maxcount <= REG_INFTY/3) /* Complement check for big
5061                                                    count */
5062                 {
5063                     /* Fatal warnings may leak the regexp without this: */
5064                     SAVEFREESV(RExC_rx_sv);
5065                     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5066                         "Quantifier unexpected on zero-length expression "
5067                         "in regex m/%"UTF8f"/",
5068                          UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5069                                   RExC_precomp));
5070                     (void)ReREFCNT_inc(RExC_rx_sv);
5071                 }
5072
5073                 min += minnext * mincount;
5074                 is_inf_internal |= deltanext == SSize_t_MAX
5075                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5076                 is_inf |= is_inf_internal;
5077                 if (is_inf) {
5078                     delta = SSize_t_MAX;
5079                 } else {
5080                     delta += (minnext + deltanext) * maxcount
5081                              - minnext * mincount;
5082                 }
5083                 /* Try powerful optimization CURLYX => CURLYN. */
5084                 if (  OP(oscan) == CURLYX && data
5085                       && data->flags & SF_IN_PAR
5086                       && !(data->flags & SF_HAS_EVAL)
5087                       && !deltanext && minnext == 1 ) {
5088                     /* Try to optimize to CURLYN.  */
5089                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5090                     regnode * const nxt1 = nxt;
5091 #ifdef DEBUGGING
5092                     regnode *nxt2;
5093 #endif
5094
5095                     /* Skip open. */
5096                     nxt = regnext(nxt);
5097                     if (!REGNODE_SIMPLE(OP(nxt))
5098                         && !(PL_regkind[OP(nxt)] == EXACT
5099                              && STR_LEN(nxt) == 1))
5100                         goto nogo;
5101 #ifdef DEBUGGING
5102                     nxt2 = nxt;
5103 #endif
5104                     nxt = regnext(nxt);
5105                     if (OP(nxt) != CLOSE)
5106                         goto nogo;
5107                     if (RExC_open_parens) {
5108                         RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5109                         RExC_close_parens[ARG(nxt1)]=nxt+2; /*close->while*/
5110                     }
5111                     /* Now we know that nxt2 is the only contents: */
5112                     oscan->flags = (U8)ARG(nxt);
5113                     OP(oscan) = CURLYN;
5114                     OP(nxt1) = NOTHING; /* was OPEN. */
5115
5116 #ifdef DEBUGGING
5117                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5118                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5119                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5120                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
5121                     OP(nxt + 1) = OPTIMIZED; /* was count. */
5122                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5123 #endif
5124                 }
5125               nogo:
5126
5127                 /* Try optimization CURLYX => CURLYM. */
5128                 if (  OP(oscan) == CURLYX && data
5129                       && !(data->flags & SF_HAS_PAR)
5130                       && !(data->flags & SF_HAS_EVAL)
5131                       && !deltanext     /* atom is fixed width */
5132                       && minnext != 0   /* CURLYM can't handle zero width */
5133
5134                          /* Nor characters whose fold at run-time may be
5135                           * multi-character */
5136                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5137                 ) {
5138                     /* XXXX How to optimize if data == 0? */
5139                     /* Optimize to a simpler form.  */
5140                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5141                     regnode *nxt2;
5142
5143                     OP(oscan) = CURLYM;
5144                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5145                             && (OP(nxt2) != WHILEM))
5146                         nxt = nxt2;
5147                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5148                     /* Need to optimize away parenths. */
5149                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5150                         /* Set the parenth number.  */
5151                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5152
5153                         oscan->flags = (U8)ARG(nxt);
5154                         if (RExC_open_parens) {
5155                             RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5156                             RExC_close_parens[ARG(nxt1)]=nxt2+1; /*close->NOTHING*/
5157                         }
5158                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
5159                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
5160
5161 #ifdef DEBUGGING
5162                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5163                         OP(nxt + 1) = OPTIMIZED; /* was count. */
5164                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5165                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5166 #endif
5167 #if 0
5168                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
5169                             regnode *nnxt = regnext(nxt1);
5170                             if (nnxt == nxt) {
5171                                 if (reg_off_by_arg[OP(nxt1)])
5172                                     ARG_SET(nxt1, nxt2 - nxt1);
5173                                 else if (nxt2 - nxt1 < U16_MAX)
5174                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
5175                                 else
5176                                     OP(nxt) = NOTHING;  /* Cannot beautify */
5177                             }
5178                             nxt1 = nnxt;
5179                         }
5180 #endif
5181                         /* Optimize again: */
5182                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5183                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
5184                     }
5185                     else
5186                         oscan->flags = 0;
5187                 }
5188                 else if ((OP(oscan) == CURLYX)
5189                          && (flags & SCF_WHILEM_VISITED_POS)
5190                          /* See the comment on a similar expression above.
5191                             However, this time it's not a subexpression
5192                             we care about, but the expression itself. */
5193                          && (maxcount == REG_INFTY)
5194                          && data && ++data->whilem_c < 16) {
5195                     /* This stays as CURLYX, we can put the count/of pair. */
5196                     /* Find WHILEM (as in regexec.c) */
5197                     regnode *nxt = oscan + NEXT_OFF(oscan);
5198
5199                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5200                         nxt += ARG(nxt);
5201                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
5202                         | (RExC_whilem_seen << 4)); /* On WHILEM */
5203                 }
5204                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5205                     pars++;
5206                 if (flags & SCF_DO_SUBSTR) {
5207                     SV *last_str = NULL;
5208                     STRLEN last_chrs = 0;
5209                     int counted = mincount != 0;
5210
5211                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5212                                                                   string. */
5213                         SSize_t b = pos_before >= data->last_start_min
5214                             ? pos_before : data->last_start_min;
5215                         STRLEN l;
5216                         const char * const s = SvPV_const(data->last_found, l);
5217                         SSize_t old = b - data->last_start_min;
5218
5219                         if (UTF)
5220                             old = utf8_hop((U8*)s, old) - (U8*)s;
5221                         l -= old;
5222                         /* Get the added string: */
5223                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5224                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5225                                             (U8*)(s + old + l)) : l;
5226                         if (deltanext == 0 && pos_before == b) {
5227                             /* What was added is a constant string */
5228                             if (mincount > 1) {
5229
5230                                 SvGROW(last_str, (mincount * l) + 1);
5231                                 repeatcpy(SvPVX(last_str) + l,
5232                                           SvPVX_const(last_str), l,
5233                                           mincount - 1);
5234                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5235                                 /* Add additional parts. */
5236                                 SvCUR_set(data->last_found,
5237                                           SvCUR(data->last_found) - l);
5238                                 sv_catsv(data->last_found, last_str);
5239                                 {
5240                                     SV * sv = data->last_found;
5241                                     MAGIC *mg =
5242                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5243                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5244                                     if (mg && mg->mg_len >= 0)
5245                                         mg->mg_len += last_chrs * (mincount-1);
5246                                 }
5247                                 last_chrs *= mincount;
5248                                 data->last_end += l * (mincount - 1);
5249                             }
5250                         } else {
5251                             /* start offset must point into the last copy */
5252                             data->last_start_min += minnext * (mincount - 1);
5253                             data->last_start_max =
5254                               is_inf
5255                                ? SSize_t_MAX
5256                                : data->last_start_max +
5257                                  (maxcount - 1) * (minnext + data->pos_delta);
5258                         }
5259                     }
5260                     /* It is counted once already... */
5261                     data->pos_min += minnext * (mincount - counted);
5262 #if 0
5263 Perl_re_printf( aTHX_  "counted=%"UVuf" deltanext=%"UVuf
5264                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5265                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
5266     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5267     (UV)mincount);
5268 if (deltanext != SSize_t_MAX)
5269 Perl_re_printf( aTHX_  "LHS=%"UVuf" RHS=%"UVuf"\n",
5270     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5271           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5272 #endif
5273                     if (deltanext == SSize_t_MAX
5274                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5275                         data->pos_delta = SSize_t_MAX;
5276                     else
5277                         data->pos_delta += - counted * deltanext +
5278                         (minnext + deltanext) * maxcount - minnext * mincount;
5279                     if (mincount != maxcount) {
5280                          /* Cannot extend fixed substrings found inside
5281                             the group.  */
5282                         scan_commit(pRExC_state, data, minlenp, is_inf);
5283                         if (mincount && last_str) {
5284                             SV * const sv = data->last_found;
5285                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5286                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5287
5288                             if (mg)
5289                                 mg->mg_len = -1;
5290                             sv_setsv(sv, last_str);
5291                             data->last_end = data->pos_min;
5292                             data->last_start_min = data->pos_min - last_chrs;
5293                             data->last_start_max = is_inf
5294                                 ? SSize_t_MAX
5295                                 : data->pos_min + data->pos_delta - last_chrs;
5296                         }
5297                         data->longest = &(data->longest_float);
5298                     }
5299                     SvREFCNT_dec(last_str);
5300                 }
5301                 if (data && (fl & SF_HAS_EVAL))
5302                     data->flags |= SF_HAS_EVAL;
5303               optimize_curly_tail:
5304                 if (OP(oscan) != CURLYX) {
5305                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5306                            && NEXT_OFF(next))
5307                         NEXT_OFF(oscan) += NEXT_OFF(next);
5308                 }
5309                 continue;
5310
5311             default:
5312 #ifdef DEBUGGING
5313                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5314                                                                     OP(scan));
5315 #endif
5316             case REF:
5317             case CLUMP:
5318                 if (flags & SCF_DO_SUBSTR) {
5319                     /* Cannot expect anything... */
5320                     scan_commit(pRExC_state, data, minlenp, is_inf);
5321                     data->longest = &(data->longest_float);
5322                 }
5323                 is_inf = is_inf_internal = 1;
5324                 if (flags & SCF_DO_STCLASS_OR) {
5325                     if (OP(scan) == CLUMP) {
5326                         /* Actually is any start char, but very few code points
5327                          * aren't start characters */
5328                         ssc_match_all_cp(data->start_class);
5329                     }
5330                     else {
5331                         ssc_anything(data->start_class);
5332                     }
5333                 }
5334                 flags &= ~SCF_DO_STCLASS;
5335                 break;
5336             }
5337         }
5338         else if (OP(scan) == LNBREAK) {
5339             if (flags & SCF_DO_STCLASS) {
5340                 if (flags & SCF_DO_STCLASS_AND) {
5341                     ssc_intersection(data->start_class,
5342                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5343                     ssc_clear_locale(data->start_class);
5344                     ANYOF_FLAGS(data->start_class)
5345                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5346                 }
5347                 else if (flags & SCF_DO_STCLASS_OR) {
5348                     ssc_union(data->start_class,
5349                               PL_XPosix_ptrs[_CC_VERTSPACE],
5350                               FALSE);
5351                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5352
5353                     /* See commit msg for
5354                      * 749e076fceedeb708a624933726e7989f2302f6a */
5355                     ANYOF_FLAGS(data->start_class)
5356                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5357                 }
5358                 flags &= ~SCF_DO_STCLASS;
5359             }
5360             min++;
5361             if (delta != SSize_t_MAX)
5362                 delta++;    /* Because of the 2 char string cr-lf */
5363             if (flags & SCF_DO_SUBSTR) {
5364                 /* Cannot expect anything... */
5365                 scan_commit(pRExC_state, data, minlenp, is_inf);
5366                 data->pos_min += 1;
5367                 data->pos_delta += 1;
5368                 data->longest = &(data->longest_float);
5369             }
5370         }
5371         else if (REGNODE_SIMPLE(OP(scan))) {
5372
5373             if (flags & SCF_DO_SUBSTR) {
5374                 scan_commit(pRExC_state, data, minlenp, is_inf);
5375                 data->pos_min++;
5376             }
5377             min++;
5378             if (flags & SCF_DO_STCLASS) {
5379                 bool invert = 0;
5380                 SV* my_invlist = NULL;
5381                 U8 namedclass;
5382
5383                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5384                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5385
5386                 /* Some of the logic below assumes that switching
5387                    locale on will only add false positives. */
5388                 switch (OP(scan)) {
5389
5390                 default:
5391 #ifdef DEBUGGING
5392                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5393                                                                      OP(scan));
5394 #endif
5395                 case SANY:
5396                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5397                         ssc_match_all_cp(data->start_class);
5398                     break;
5399
5400                 case REG_ANY:
5401                     {
5402                         SV* REG_ANY_invlist = _new_invlist(2);
5403                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5404                                                             '\n');
5405                         if (flags & SCF_DO_STCLASS_OR) {
5406                             ssc_union(data->start_class,
5407                                       REG_ANY_invlist,
5408                                       TRUE /* TRUE => invert, hence all but \n
5409                                             */
5410                                       );
5411                         }
5412                         else if (flags & SCF_DO_STCLASS_AND) {
5413                             ssc_intersection(data->start_class,
5414                                              REG_ANY_invlist,
5415                                              TRUE  /* TRUE => invert */
5416                                              );
5417                             ssc_clear_locale(data->start_class);
5418                         }
5419                         SvREFCNT_dec_NN(REG_ANY_invlist);
5420                     }
5421                     break;
5422
5423                 case ANYOFD:
5424                 case ANYOFL:
5425                 case ANYOF:
5426                     if (flags & SCF_DO_STCLASS_AND)
5427                         ssc_and(pRExC_state, data->start_class,
5428                                 (regnode_charclass *) scan);
5429                     else
5430                         ssc_or(pRExC_state, data->start_class,
5431                                                           (regnode_charclass *) scan);
5432                     break;
5433
5434                 case NPOSIXL:
5435                     invert = 1;
5436                     /* FALLTHROUGH */
5437
5438                 case POSIXL:
5439                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5440                     if (flags & SCF_DO_STCLASS_AND) {
5441                         bool was_there = cBOOL(
5442                                           ANYOF_POSIXL_TEST(data->start_class,
5443                                                                  namedclass));
5444                         ANYOF_POSIXL_ZERO(data->start_class);
5445                         if (was_there) {    /* Do an AND */
5446                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5447                         }
5448                         /* No individual code points can now match */
5449                         data->start_class->invlist
5450                                                 = sv_2mortal(_new_invlist(0));
5451                     }
5452                     else {
5453                         int complement = namedclass + ((invert) ? -1 : 1);
5454
5455                         assert(flags & SCF_DO_STCLASS_OR);
5456
5457                         /* If the complement of this class was already there,
5458                          * the result is that they match all code points,
5459                          * (\d + \D == everything).  Remove the classes from
5460                          * future consideration.  Locale is not relevant in
5461                          * this case */
5462                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5463                             ssc_match_all_cp(data->start_class);
5464                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5465                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5466                         }
5467                         else {  /* The usual case; just add this class to the
5468                                    existing set */
5469                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5470                         }
5471                     }
5472                     break;
5473
5474                 case NPOSIXA:   /* For these, we always know the exact set of
5475                                    what's matched */
5476                     invert = 1;
5477                     /* FALLTHROUGH */
5478                 case POSIXA:
5479                     if (FLAGS(scan) == _CC_ASCII) {
5480                         my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5481                     }
5482                     else {
5483                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5484                                               PL_XPosix_ptrs[_CC_ASCII],
5485                                               &my_invlist);
5486                     }
5487                     goto join_posix;
5488
5489                 case NPOSIXD:
5490                 case NPOSIXU:
5491                     invert = 1;
5492                     /* FALLTHROUGH */
5493                 case POSIXD:
5494                 case POSIXU:
5495                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5496
5497                     /* NPOSIXD matches all upper Latin1 code points unless the
5498                      * target string being matched is UTF-8, which is
5499                      * unknowable until match time.  Since we are going to
5500                      * invert, we want to get rid of all of them so that the
5501                      * inversion will match all */
5502                     if (OP(scan) == NPOSIXD) {
5503                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5504                                           &my_invlist);
5505                     }
5506
5507                   join_posix:
5508
5509                     if (flags & SCF_DO_STCLASS_AND) {
5510                         ssc_intersection(data->start_class, my_invlist, invert);
5511                         ssc_clear_locale(data->start_class);
5512                     }
5513                     else {
5514                         assert(flags & SCF_DO_STCLASS_OR);
5515                         ssc_union(data->start_class, my_invlist, invert);
5516                     }
5517                     SvREFCNT_dec(my_invlist);
5518                 }
5519                 if (flags & SCF_DO_STCLASS_OR)
5520                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5521                 flags &= ~SCF_DO_STCLASS;
5522             }
5523         }
5524         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5525             data->flags |= (OP(scan) == MEOL
5526                             ? SF_BEFORE_MEOL
5527                             : SF_BEFORE_SEOL);
5528             scan_commit(pRExC_state, data, minlenp, is_inf);
5529
5530         }
5531         else if (  PL_regkind[OP(scan)] == BRANCHJ
5532                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5533                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5534                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5535         {
5536             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5537                 || OP(scan) == UNLESSM )
5538             {
5539                 /* Negative Lookahead/lookbehind
5540                    In this case we can't do fixed string optimisation.
5541                 */
5542
5543                 SSize_t deltanext, minnext, fake = 0;
5544                 regnode *nscan;
5545                 regnode_ssc intrnl;
5546                 int f = 0;
5547
5548                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5549                 if (data) {
5550                     data_fake.whilem_c = data->whilem_c;
5551                     data_fake.last_closep = data->last_closep;
5552                 }
5553                 else
5554                     data_fake.last_closep = &fake;
5555                 data_fake.pos_delta = delta;
5556                 if ( flags & SCF_DO_STCLASS && !scan->flags
5557                      && OP(scan) == IFMATCH ) { /* Lookahead */
5558                     ssc_init(pRExC_state, &intrnl);
5559                     data_fake.start_class = &intrnl;
5560                     f |= SCF_DO_STCLASS_AND;
5561                 }
5562                 if (flags & SCF_WHILEM_VISITED_POS)
5563                     f |= SCF_WHILEM_VISITED_POS;
5564                 next = regnext(scan);
5565                 nscan = NEXTOPER(NEXTOPER(scan));
5566                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5567                                       last, &data_fake, stopparen,
5568                                       recursed_depth, NULL, f, depth+1);
5569                 if (scan->flags) {
5570                     if (deltanext) {
5571                         FAIL("Variable length lookbehind not implemented");
5572                     }
5573                     else if (minnext > (I32)U8_MAX) {
5574                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5575                               (UV)U8_MAX);
5576                     }
5577                     scan->flags = (U8)minnext;
5578                 }
5579                 if (data) {
5580                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5581                         pars++;
5582                     if (data_fake.flags & SF_HAS_EVAL)
5583                         data->flags |= SF_HAS_EVAL;
5584                     data->whilem_c = data_fake.whilem_c;
5585                 }
5586                 if (f & SCF_DO_STCLASS_AND) {
5587                     if (flags & SCF_DO_STCLASS_OR) {
5588                         /* OR before, AND after: ideally we would recurse with
5589                          * data_fake to get the AND applied by study of the
5590                          * remainder of the pattern, and then derecurse;
5591                          * *** HACK *** for now just treat as "no information".
5592                          * See [perl #56690].
5593                          */
5594                         ssc_init(pRExC_state, data->start_class);
5595                     }  else {
5596                         /* AND before and after: combine and continue.  These
5597                          * assertions are zero-length, so can match an EMPTY
5598                          * string */
5599                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5600                         ANYOF_FLAGS(data->start_class)
5601                                                    |= SSC_MATCHES_EMPTY_STRING;
5602                     }
5603                 }
5604             }
5605 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5606             else {
5607                 /* Positive Lookahead/lookbehind
5608                    In this case we can do fixed string optimisation,
5609                    but we must be careful about it. Note in the case of
5610                    lookbehind the positions will be offset by the minimum
5611                    length of the pattern, something we won't know about
5612                    until after the recurse.
5613                 */
5614                 SSize_t deltanext, fake = 0;
5615                 regnode *nscan;
5616                 regnode_ssc intrnl;
5617                 int f = 0;
5618                 /* We use SAVEFREEPV so that when the full compile
5619                     is finished perl will clean up the allocated
5620                     minlens when it's all done. This way we don't
5621                     have to worry about freeing them when we know
5622                     they wont be used, which would be a pain.
5623                  */
5624                 SSize_t *minnextp;
5625                 Newx( minnextp, 1, SSize_t );
5626                 SAVEFREEPV(minnextp);
5627
5628                 if (data) {
5629                     StructCopy(data, &data_fake, scan_data_t);
5630                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5631                         f |= SCF_DO_SUBSTR;
5632                         if (scan->flags)
5633                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5634                         data_fake.last_found=newSVsv(data->last_found);
5635                     }
5636                 }
5637                 else
5638                     data_fake.last_closep = &fake;
5639                 data_fake.flags = 0;
5640                 data_fake.pos_delta = delta;
5641                 if (is_inf)
5642                     data_fake.flags |= SF_IS_INF;
5643                 if ( flags & SCF_DO_STCLASS && !scan->flags
5644                      && OP(scan) == IFMATCH ) { /* Lookahead */
5645                     ssc_init(pRExC_state, &intrnl);
5646                     data_fake.start_class = &intrnl;
5647                     f |= SCF_DO_STCLASS_AND;
5648                 }
5649                 if (flags & SCF_WHILEM_VISITED_POS)
5650                     f |= SCF_WHILEM_VISITED_POS;
5651                 next = regnext(scan);
5652                 nscan = NEXTOPER(NEXTOPER(scan));
5653
5654                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5655                                         &deltanext, last, &data_fake,
5656                                         stopparen, recursed_depth, NULL,
5657                                         f,depth+1);
5658                 if (scan->flags) {
5659                     if (deltanext) {
5660                         FAIL("Variable length lookbehind not implemented");
5661                     }
5662                     else if (*minnextp > (I32)U8_MAX) {
5663                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5664                               (UV)U8_MAX);
5665                     }
5666                     scan->flags = (U8)*minnextp;
5667                 }
5668
5669                 *minnextp += min;
5670
5671                 if (f & SCF_DO_STCLASS_AND) {
5672                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5673                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5674                 }
5675                 if (data) {
5676                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5677                         pars++;
5678                     if (data_fake.flags & SF_HAS_EVAL)
5679                         data->flags |= SF_HAS_EVAL;
5680                     data->whilem_c = data_fake.whilem_c;
5681                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5682                         if (RExC_rx->minlen<*minnextp)
5683                             RExC_rx->minlen=*minnextp;
5684                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5685                         SvREFCNT_dec_NN(data_fake.last_found);
5686
5687                         if ( data_fake.minlen_fixed != minlenp )
5688                         {
5689                             data->offset_fixed= data_fake.offset_fixed;
5690                             data->minlen_fixed= data_fake.minlen_fixed;
5691                             data->lookbehind_fixed+= scan->flags;
5692                         }
5693                         if ( data_fake.minlen_float != minlenp )
5694                         {
5695                             data->minlen_float= data_fake.minlen_float;
5696                             data->offset_float_min=data_fake.offset_float_min;
5697                             data->offset_float_max=data_fake.offset_float_max;
5698                             data->lookbehind_float+= scan->flags;
5699                         }
5700                     }
5701                 }
5702             }
5703 #endif
5704         }
5705         else if (OP(scan) == OPEN) {
5706             if (stopparen != (I32)ARG(scan))
5707                 pars++;
5708         }
5709         else if (OP(scan) == CLOSE) {
5710             if (stopparen == (I32)ARG(scan)) {
5711                 break;
5712             }
5713             if ((I32)ARG(scan) == is_par) {
5714                 next = regnext(scan);
5715
5716                 if ( next && (OP(next) != WHILEM) && next < last)
5717                     is_par = 0;         /* Disable optimization */
5718             }
5719             if (data)
5720                 *(data->last_closep) = ARG(scan);
5721         }
5722         else if (OP(scan) == EVAL) {
5723                 if (data)
5724                     data->flags |= SF_HAS_EVAL;
5725         }
5726         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5727             if (flags & SCF_DO_SUBSTR) {
5728                 scan_commit(pRExC_state, data, minlenp, is_inf);
5729                 flags &= ~SCF_DO_SUBSTR;
5730             }
5731             if (data && OP(scan)==ACCEPT) {
5732                 data->flags |= SCF_SEEN_ACCEPT;
5733                 if (stopmin > min)
5734                     stopmin = min;
5735             }
5736         }
5737         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5738         {
5739                 if (flags & SCF_DO_SUBSTR) {
5740                     scan_commit(pRExC_state, data, minlenp, is_inf);
5741                     data->longest = &(data->longest_float);
5742                 }
5743                 is_inf = is_inf_internal = 1;
5744                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5745                     ssc_anything(data->start_class);
5746                 flags &= ~SCF_DO_STCLASS;
5747         }
5748         else if (OP(scan) == GPOS) {
5749             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5750                 !(delta || is_inf || (data && data->pos_delta)))
5751             {
5752                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5753                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5754                 if (RExC_rx->gofs < (STRLEN)min)
5755                     RExC_rx->gofs = min;
5756             } else {
5757                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5758                 RExC_rx->gofs = 0;
5759             }
5760         }
5761 #ifdef TRIE_STUDY_OPT
5762 #ifdef FULL_TRIE_STUDY
5763         else if (PL_regkind[OP(scan)] == TRIE) {
5764             /* NOTE - There is similar code to this block above for handling
5765                BRANCH nodes on the initial study.  If you change stuff here
5766                check there too. */
5767             regnode *trie_node= scan;
5768             regnode *tail= regnext(scan);
5769             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5770             SSize_t max1 = 0, min1 = SSize_t_MAX;
5771             regnode_ssc accum;
5772
5773             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5774                 /* Cannot merge strings after this. */
5775                 scan_commit(pRExC_state, data, minlenp, is_inf);
5776             }
5777             if (flags & SCF_DO_STCLASS)
5778                 ssc_init_zero(pRExC_state, &accum);
5779
5780             if (!trie->jump) {
5781                 min1= trie->minlen;
5782                 max1= trie->maxlen;
5783             } else {
5784                 const regnode *nextbranch= NULL;
5785                 U32 word;
5786
5787                 for ( word=1 ; word <= trie->wordcount ; word++)
5788                 {
5789                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5790                     regnode_ssc this_class;
5791
5792                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5793                     if (data) {
5794                         data_fake.whilem_c = data->whilem_c;
5795                         data_fake.last_closep = data->last_closep;
5796                     }
5797                     else
5798                         data_fake.last_closep = &fake;
5799                     data_fake.pos_delta = delta;
5800                     if (flags & SCF_DO_STCLASS) {
5801                         ssc_init(pRExC_state, &this_class);
5802                         data_fake.start_class = &this_class;
5803                         f = SCF_DO_STCLASS_AND;
5804                     }
5805                     if (flags & SCF_WHILEM_VISITED_POS)
5806                         f |= SCF_WHILEM_VISITED_POS;
5807
5808                     if (trie->jump[word]) {
5809                         if (!nextbranch)
5810                             nextbranch = trie_node + trie->jump[0];
5811                         scan= trie_node + trie->jump[word];
5812                         /* We go from the jump point to the branch that follows
5813                            it. Note this means we need the vestigal unused
5814                            branches even though they arent otherwise used. */
5815                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5816                             &deltanext, (regnode *)nextbranch, &data_fake,
5817                             stopparen, recursed_depth, NULL, f,depth+1);
5818                     }
5819                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5820                         nextbranch= regnext((regnode*)nextbranch);
5821
5822                     if (min1 > (SSize_t)(minnext + trie->minlen))
5823                         min1 = minnext + trie->minlen;
5824                     if (deltanext == SSize_t_MAX) {
5825                         is_inf = is_inf_internal = 1;
5826                         max1 = SSize_t_MAX;
5827                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5828                         max1 = minnext + deltanext + trie->maxlen;
5829
5830                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5831                         pars++;
5832                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5833                         if ( stopmin > min + min1)
5834                             stopmin = min + min1;
5835                         flags &= ~SCF_DO_SUBSTR;
5836                         if (data)
5837                             data->flags |= SCF_SEEN_ACCEPT;
5838                     }
5839                     if (data) {
5840                         if (data_fake.flags & SF_HAS_EVAL)
5841                             data->flags |= SF_HAS_EVAL;
5842                         data->whilem_c = data_fake.whilem_c;
5843                     }
5844                     if (flags & SCF_DO_STCLASS)
5845                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5846                 }
5847             }
5848             if (flags & SCF_DO_SUBSTR) {
5849                 data->pos_min += min1;
5850                 data->pos_delta += max1 - min1;
5851                 if (max1 != min1 || is_inf)
5852                     data->longest = &(data->longest_float);
5853             }
5854             min += min1;
5855             if (delta != SSize_t_MAX)
5856                 delta += max1 - min1;
5857             if (flags & SCF_DO_STCLASS_OR) {
5858                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5859                 if (min1) {
5860                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5861                     flags &= ~SCF_DO_STCLASS;
5862                 }
5863             }
5864             else if (flags & SCF_DO_STCLASS_AND) {
5865                 if (min1) {
5866                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5867                     flags &= ~SCF_DO_STCLASS;
5868                 }
5869                 else {
5870                     /* Switch to OR mode: cache the old value of
5871                      * data->start_class */
5872                     INIT_AND_WITHP;
5873                     StructCopy(data->start_class, and_withp, regnode_ssc);
5874                     flags &= ~SCF_DO_STCLASS_AND;
5875                     StructCopy(&accum, data->start_class, regnode_ssc);
5876                     flags |= SCF_DO_STCLASS_OR;
5877                 }
5878             }
5879             scan= tail;
5880             continue;
5881         }
5882 #else
5883         else if (PL_regkind[OP(scan)] == TRIE) {
5884             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5885             U8*bang=NULL;
5886
5887             min += trie->minlen;
5888             delta += (trie->maxlen - trie->minlen);
5889             flags &= ~SCF_DO_STCLASS; /* xxx */
5890             if (flags & SCF_DO_SUBSTR) {
5891                 /* Cannot expect anything... */
5892                 scan_commit(pRExC_state, data, minlenp, is_inf);
5893                 data->pos_min += trie->minlen;
5894                 data->pos_delta += (trie->maxlen - trie->minlen);
5895                 if (trie->maxlen != trie->minlen)
5896                     data->longest = &(data->longest_float);
5897             }
5898             if (trie->jump) /* no more substrings -- for now /grr*/
5899                flags &= ~SCF_DO_SUBSTR;
5900         }
5901 #endif /* old or new */
5902 #endif /* TRIE_STUDY_OPT */
5903
5904         /* Else: zero-length, ignore. */
5905         scan = regnext(scan);
5906     }
5907
5908   finish:
5909     if (frame) {
5910         /* we need to unwind recursion. */
5911         depth = depth - 1;
5912
5913         DEBUG_STUDYDATA("frame-end:",data,depth);
5914         DEBUG_PEEP("fend", scan, depth);
5915
5916         /* restore previous context */
5917         last = frame->last_regnode;
5918         scan = frame->next_regnode;
5919         stopparen = frame->stopparen;
5920         recursed_depth = frame->prev_recursed_depth;
5921
5922         RExC_frame_last = frame->prev_frame;
5923         frame = frame->this_prev_frame;
5924         goto fake_study_recurse;
5925     }
5926
5927     assert(!frame);
5928     DEBUG_STUDYDATA("pre-fin:",data,depth);
5929
5930     *scanp = scan;
5931     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5932
5933     if (flags & SCF_DO_SUBSTR && is_inf)
5934         data->pos_delta = SSize_t_MAX - data->pos_min;
5935     if (is_par > (I32)U8_MAX)
5936         is_par = 0;
5937     if (is_par && pars==1 && data) {
5938         data->flags |= SF_IN_PAR;
5939         data->flags &= ~SF_HAS_PAR;
5940     }
5941     else if (pars && data) {
5942         data->flags |= SF_HAS_PAR;
5943         data->flags &= ~SF_IN_PAR;
5944     }
5945     if (flags & SCF_DO_STCLASS_OR)
5946         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5947     if (flags & SCF_TRIE_RESTUDY)
5948         data->flags |=  SCF_TRIE_RESTUDY;
5949
5950     DEBUG_STUDYDATA("post-fin:",data,depth);
5951
5952     {
5953         SSize_t final_minlen= min < stopmin ? min : stopmin;
5954
5955         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5956             if (final_minlen > SSize_t_MAX - delta)
5957                 RExC_maxlen = SSize_t_MAX;
5958             else if (RExC_maxlen < final_minlen + delta)
5959                 RExC_maxlen = final_minlen + delta;
5960         }
5961         return final_minlen;
5962     }
5963     NOT_REACHED; /* NOTREACHED */
5964 }
5965
5966 STATIC U32
5967 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5968 {
5969     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5970
5971     PERL_ARGS_ASSERT_ADD_DATA;
5972
5973     Renewc(RExC_rxi->data,
5974            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5975            char, struct reg_data);
5976     if(count)
5977         Renew(RExC_rxi->data->what, count + n, U8);
5978     else
5979         Newx(RExC_rxi->data->what, n, U8);
5980     RExC_rxi->data->count = count + n;
5981     Copy(s, RExC_rxi->data->what + count, n, U8);
5982     return count;
5983 }
5984
5985 /*XXX: todo make this not included in a non debugging perl, but appears to be
5986  * used anyway there, in 'use re' */
5987 #ifndef PERL_IN_XSUB_RE
5988 void
5989 Perl_reginitcolors(pTHX)
5990 {
5991     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5992     if (s) {
5993         char *t = savepv(s);
5994         int i = 0;
5995         PL_colors[0] = t;
5996         while (++i < 6) {
5997             t = strchr(t, '\t');
5998             if (t) {
5999                 *t = '\0';
6000                 PL_colors[i] = ++t;
6001             }
6002             else
6003                 PL_colors[i] = t = (char *)"";
6004         }
6005     } else {
6006         int i = 0;
6007         while (i < 6)
6008             PL_colors[i++] = (char *)"";
6009     }
6010     PL_colorset = 1;
6011 }
6012 #endif
6013
6014
6015 #ifdef TRIE_STUDY_OPT
6016 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6017     STMT_START {                                            \
6018         if (                                                \
6019               (data.flags & SCF_TRIE_RESTUDY)               \
6020               && ! restudied++                              \
6021         ) {                                                 \
6022             dOsomething;                                    \
6023             goto reStudy;                                   \
6024         }                                                   \
6025     } STMT_END
6026 #else
6027 #define CHECK_RESTUDY_GOTO_butfirst
6028 #endif
6029
6030 /*
6031  * pregcomp - compile a regular expression into internal code
6032  *
6033  * Decides which engine's compiler to call based on the hint currently in
6034  * scope
6035  */
6036
6037 #ifndef PERL_IN_XSUB_RE
6038
6039 /* return the currently in-scope regex engine (or the default if none)  */
6040
6041 regexp_engine const *
6042 Perl_current_re_engine(pTHX)
6043 {
6044     if (IN_PERL_COMPILETIME) {
6045         HV * const table = GvHV(PL_hintgv);
6046         SV **ptr;
6047
6048         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6049             return &PL_core_reg_engine;
6050         ptr = hv_fetchs(table, "regcomp", FALSE);
6051         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6052             return &PL_core_reg_engine;
6053         return INT2PTR(regexp_engine*,SvIV(*ptr));
6054     }
6055     else {
6056         SV *ptr;
6057         if (!PL_curcop->cop_hints_hash)
6058             return &PL_core_reg_engine;
6059         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6060         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6061             return &PL_core_reg_engine;
6062         return INT2PTR(regexp_engine*,SvIV(ptr));
6063     }
6064 }
6065
6066
6067 REGEXP *
6068 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6069 {
6070     regexp_engine const *eng = current_re_engine();
6071     GET_RE_DEBUG_FLAGS_DECL;
6072
6073     PERL_ARGS_ASSERT_PREGCOMP;
6074
6075     /* Dispatch a request to compile a regexp to correct regexp engine. */
6076     DEBUG_COMPILE_r({
6077         Perl_re_printf( aTHX_  "Using engine %"UVxf"\n",
6078                         PTR2UV(eng));
6079     });
6080     return CALLREGCOMP_ENG(eng, pattern, flags);
6081 }
6082 #endif
6083
6084 /* public(ish) entry point for the perl core's own regex compiling code.
6085  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6086  * pattern rather than a list of OPs, and uses the internal engine rather
6087  * than the current one */
6088
6089 REGEXP *
6090 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6091 {
6092     SV *pat = pattern; /* defeat constness! */
6093     PERL_ARGS_ASSERT_RE_COMPILE;
6094     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6095 #ifdef PERL_IN_XSUB_RE
6096                                 &my_reg_engine,
6097 #else
6098                                 &PL_core_reg_engine,
6099 #endif
6100                                 NULL, NULL, rx_flags, 0);
6101 }
6102
6103
6104 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6105  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6106  * point to the realloced string and length.
6107  *
6108  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6109  * stuff added */
6110
6111 static void
6112 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6113                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
6114 {
6115     U8 *const src = (U8*)*pat_p;
6116     U8 *dst, *d;
6117     int n=0;
6118     STRLEN s = 0;
6119     bool do_end = 0;
6120     GET_RE_DEBUG_FLAGS_DECL;
6121
6122     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6123         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6124
6125     Newx(dst, *plen_p * 2 + 1, U8);
6126     d = dst;
6127
6128     while (s < *plen_p) {
6129         append_utf8_from_native_byte(src[s], &d);
6130         if (n < num_code_blocks) {
6131             if (!do_end && pRExC_state->code_blocks[n].start == s) {
6132                 pRExC_state->code_blocks[n].start = d - dst - 1;
6133                 assert(*(d - 1) == '(');
6134                 do_end = 1;
6135             }
6136             else if (do_end && pRExC_state->code_blocks[n].end == s) {
6137                 pRExC_state->code_blocks[n].end = d - dst - 1;
6138                 assert(*(d - 1) == ')');
6139                 do_end = 0;
6140                 n++;
6141             }
6142         }
6143         s++;
6144     }
6145     *d = '\0';
6146     *plen_p = d - dst;
6147     *pat_p = (char*) dst;
6148     SAVEFREEPV(*pat_p);
6149     RExC_orig_utf8 = RExC_utf8 = 1;
6150 }
6151
6152
6153
6154 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6155  * while recording any code block indices, and handling overloading,
6156  * nested qr// objects etc.  If pat is null, it will allocate a new
6157  * string, or just return the first arg, if there's only one.
6158  *
6159  * Returns the malloced/updated pat.
6160  * patternp and pat_count is the array of SVs to be concatted;
6161  * oplist is the optional list of ops that generated the SVs;
6162  * recompile_p is a pointer to a boolean that will be set if
6163  *   the regex will need to be recompiled.
6164  * delim, if non-null is an SV that will be inserted between each element
6165  */
6166
6167 static SV*
6168 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6169                 SV *pat, SV ** const patternp, int pat_count,
6170                 OP *oplist, bool *recompile_p, SV *delim)
6171 {
6172     SV **svp;
6173     int n = 0;
6174     bool use_delim = FALSE;
6175     bool alloced = FALSE;
6176
6177     /* if we know we have at least two args, create an empty string,
6178      * then concatenate args to that. For no args, return an empty string */
6179     if (!pat && pat_count != 1) {
6180         pat = newSVpvs("");
6181         SAVEFREESV(pat);
6182         alloced = TRUE;
6183     }
6184
6185     for (svp = patternp; svp < patternp + pat_count; svp++) {
6186         SV *sv;
6187         SV *rx  = NULL;
6188         STRLEN orig_patlen = 0;
6189         bool code = 0;
6190         SV *msv = use_delim ? delim : *svp;
6191         if (!msv) msv = &PL_sv_undef;
6192
6193         /* if we've got a delimiter, we go round the loop twice for each
6194          * svp slot (except the last), using the delimiter the second
6195          * time round */
6196         if (use_delim) {
6197             svp--;
6198             use_delim = FALSE;
6199         }
6200         else if (delim)
6201             use_delim = TRUE;
6202
6203         if (SvTYPE(msv) == SVt_PVAV) {
6204             /* we've encountered an interpolated array within
6205              * the pattern, e.g. /...@a..../. Expand the list of elements,
6206              * then recursively append elements.
6207              * The code in this block is based on S_pushav() */
6208
6209             AV *const av = (AV*)msv;
6210             const SSize_t maxarg = AvFILL(av) + 1;
6211             SV **array;
6212
6213             if (oplist) {
6214                 assert(oplist->op_type == OP_PADAV
6215                     || oplist->op_type == OP_RV2AV);
6216                 oplist = OpSIBLING(oplist);
6217             }
6218
6219             if (SvRMAGICAL(av)) {
6220                 SSize_t i;
6221
6222                 Newx(array, maxarg, SV*);
6223                 SAVEFREEPV(array);
6224                 for (i=0; i < maxarg; i++) {
6225                     SV ** const svp = av_fetch(av, i, FALSE);
6226                     array[i] = svp ? *svp : &PL_sv_undef;
6227                 }
6228             }
6229             else
6230                 array = AvARRAY(av);
6231
6232             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6233                                 array, maxarg, NULL, recompile_p,
6234                                 /* $" */
6235                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6236
6237             continue;
6238         }
6239
6240
6241         /* we make the assumption here that each op in the list of
6242          * op_siblings maps to one SV pushed onto the stack,
6243          * except for code blocks, with have both an OP_NULL and
6244          * and OP_CONST.
6245          * This allows us to match up the list of SVs against the
6246          * list of OPs to find the next code block.
6247          *
6248          * Note that       PUSHMARK PADSV PADSV ..
6249          * is optimised to
6250          *                 PADRANGE PADSV  PADSV  ..
6251          * so the alignment still works. */
6252
6253         if (oplist) {
6254             if (oplist->op_type == OP_NULL
6255                 && (oplist->op_flags & OPf_SPECIAL))
6256             {
6257                 assert(n < pRExC_state->num_code_blocks);
6258                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6259                 pRExC_state->code_blocks[n].block = oplist;
6260                 pRExC_state->code_blocks[n].src_regex = NULL;
6261                 n++;
6262                 code = 1;
6263                 oplist = OpSIBLING(oplist); /* skip CONST */
6264                 assert(oplist);
6265             }
6266             oplist = OpSIBLING(oplist);;
6267         }
6268
6269         /* apply magic and QR overloading to arg */
6270
6271         SvGETMAGIC(msv);
6272         if (SvROK(msv) && SvAMAGIC(msv)) {
6273             SV *sv = AMG_CALLunary(msv, regexp_amg);
6274             if (sv) {
6275                 if (SvROK(sv))
6276                     sv = SvRV(sv);
6277                 if (SvTYPE(sv) != SVt_REGEXP)
6278                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6279                 msv = sv;
6280             }
6281         }
6282
6283         /* try concatenation overload ... */
6284         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6285                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6286         {
6287             sv_setsv(pat, sv);
6288             /* overloading involved: all bets are off over literal
6289              * code. Pretend we haven't seen it */
6290             pRExC_state->num_code_blocks -= n;
6291             n = 0;
6292         }
6293         else  {
6294             /* ... or failing that, try "" overload */
6295             while (SvAMAGIC(msv)
6296                     && (sv = AMG_CALLunary(msv, string_amg))
6297                     && sv != msv
6298                     &&  !(   SvROK(msv)
6299                           && SvROK(sv)
6300                           && SvRV(msv) == SvRV(sv))
6301             ) {
6302                 msv = sv;
6303                 SvGETMAGIC(msv);
6304             }
6305             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6306                 msv = SvRV(msv);
6307
6308             if (pat) {
6309                 /* this is a partially unrolled
6310                  *     sv_catsv_nomg(pat, msv);
6311                  * that allows us to adjust code block indices if
6312                  * needed */
6313                 STRLEN dlen;
6314                 char *dst = SvPV_force_nomg(pat, dlen);
6315                 orig_patlen = dlen;
6316                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6317                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6318                     sv_setpvn(pat, dst, dlen);
6319                     SvUTF8_on(pat);
6320                 }
6321                 sv_catsv_nomg(pat, msv);
6322                 rx = msv;
6323             }
6324             else
6325                 pat = msv;
6326
6327             if (code)
6328                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6329         }
6330
6331         /* extract any code blocks within any embedded qr//'s */
6332         if (rx && SvTYPE(rx) == SVt_REGEXP
6333             && RX_ENGINE((REGEXP*)rx)->op_comp)
6334         {
6335
6336             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6337             if (ri->num_code_blocks) {
6338                 int i;
6339                 /* the presence of an embedded qr// with code means
6340                  * we should always recompile: the text of the
6341                  * qr// may not have changed, but it may be a
6342                  * different closure than last time */
6343                 *recompile_p = 1;
6344                 Renew(pRExC_state->code_blocks,
6345                     pRExC_state->num_code_blocks + ri->num_code_blocks,
6346                     struct reg_code_block);
6347                 pRExC_state->num_code_blocks += ri->num_code_blocks;
6348
6349                 for (i=0; i < ri->num_code_blocks; i++) {
6350                     struct reg_code_block *src, *dst;
6351                     STRLEN offset =  orig_patlen
6352                         + ReANY((REGEXP *)rx)->pre_prefix;
6353                     assert(n < pRExC_state->num_code_blocks);
6354                     src = &ri->code_blocks[i];
6355                     dst = &pRExC_state->code_blocks[n];
6356                     dst->start      = src->start + offset;
6357                     dst->end        = src->end   + offset;
6358                     dst->block      = src->block;
6359                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6360                                             src->src_regex
6361                                                 ? src->src_regex
6362                                                 : (REGEXP*)rx);
6363                     n++;
6364                 }
6365             }
6366         }
6367     }
6368     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6369     if (alloced)
6370         SvSETMAGIC(pat);
6371
6372     return pat;
6373 }
6374
6375
6376
6377 /* see if there are any run-time code blocks in the pattern.
6378  * False positives are allowed */
6379
6380 static bool
6381 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6382                     char *pat, STRLEN plen)
6383 {
6384     int n = 0;
6385     STRLEN s;
6386
6387     PERL_UNUSED_CONTEXT;
6388
6389     for (s = 0; s < plen; s++) {
6390         if (n < pRExC_state->num_code_blocks
6391             && s == pRExC_state->code_blocks[n].start)
6392         {
6393             s = pRExC_state->code_blocks[n].end;
6394             n++;
6395             continue;
6396         }
6397         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6398          * positives here */
6399         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6400             (pat[s+2] == '{'
6401                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6402         )
6403             return 1;
6404     }
6405     return 0;
6406 }
6407
6408 /* Handle run-time code blocks. We will already have compiled any direct
6409  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6410  * copy of it, but with any literal code blocks blanked out and
6411  * appropriate chars escaped; then feed it into
6412  *
6413  *    eval "qr'modified_pattern'"
6414  *
6415  * For example,
6416  *
6417  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6418  *
6419  * becomes
6420  *
6421  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6422  *
6423  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6424  * and merge them with any code blocks of the original regexp.
6425  *
6426  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6427  * instead, just save the qr and return FALSE; this tells our caller that
6428  * the original pattern needs upgrading to utf8.
6429  */
6430
6431 static bool
6432 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6433     char *pat, STRLEN plen)
6434 {
6435     SV *qr;
6436
6437     GET_RE_DEBUG_FLAGS_DECL;
6438
6439     if (pRExC_state->runtime_code_qr) {
6440         /* this is the second time we've been called; this should
6441          * only happen if the main pattern got upgraded to utf8
6442          * during compilation; re-use the qr we compiled first time
6443          * round (which should be utf8 too)
6444          */
6445         qr = pRExC_state->runtime_code_qr;
6446         pRExC_state->runtime_code_qr = NULL;
6447         assert(RExC_utf8 && SvUTF8(qr));
6448     }
6449     else {
6450         int n = 0;
6451         STRLEN s;
6452         char *p, *newpat;
6453         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6454         SV *sv, *qr_ref;
6455         dSP;
6456
6457         /* determine how many extra chars we need for ' and \ escaping */
6458         for (s = 0; s < plen; s++) {
6459             if (pat[s] == '\'' || pat[s] == '\\')
6460                 newlen++;
6461         }
6462
6463         Newx(newpat, newlen, char);
6464         p = newpat;
6465         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6466
6467         for (s = 0; s < plen; s++) {
6468             if (n < pRExC_state->num_code_blocks
6469                 && s == pRExC_state->code_blocks[n].start)
6470             {
6471                 /* blank out literal code block */
6472                 assert(pat[s] == '(');
6473                 while (s <= pRExC_state->code_blocks[n].end) {
6474                     *p++ = '_';
6475                     s++;
6476                 }
6477                 s--;
6478                 n++;
6479                 continue;
6480             }
6481             if (pat[s] == '\'' || pat[s] == '\\')
6482                 *p++ = '\\';
6483             *p++ = pat[s];
6484         }
6485         *p++ = '\'';
6486         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6487             *p++ = 'x';
6488         *p++ = '\0';
6489         DEBUG_COMPILE_r({
6490             Perl_re_printf( aTHX_
6491                 "%sre-parsing pattern for runtime code:%s %s\n",
6492                 PL_colors[4],PL_colors[5],newpat);
6493         });
6494
6495         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6496         Safefree(newpat);
6497
6498         ENTER;
6499         SAVETMPS;
6500         save_re_context();
6501         PUSHSTACKi(PERLSI_REQUIRE);
6502         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6503          * parsing qr''; normally only q'' does this. It also alters
6504          * hints handling */
6505         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6506         SvREFCNT_dec_NN(sv);
6507         SPAGAIN;
6508         qr_ref = POPs;
6509         PUTBACK;
6510         {
6511             SV * const errsv = ERRSV;
6512             if (SvTRUE_NN(errsv))
6513             {
6514                 Safefree(pRExC_state->code_blocks);
6515                 /* use croak_sv ? */
6516                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6517             }
6518         }
6519         assert(SvROK(qr_ref));
6520         qr = SvRV(qr_ref);
6521         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6522         /* the leaving below frees the tmp qr_ref.
6523          * Give qr a life of its own */
6524         SvREFCNT_inc(qr);
6525         POPSTACK;
6526         FREETMPS;
6527         LEAVE;
6528
6529     }
6530
6531     if (!RExC_utf8 && SvUTF8(qr)) {
6532         /* first time through; the pattern got upgraded; save the
6533          * qr for the next time through */
6534         assert(!pRExC_state->runtime_code_qr);
6535         pRExC_state->runtime_code_qr = qr;
6536         return 0;
6537     }
6538
6539
6540     /* extract any code blocks within the returned qr//  */
6541
6542
6543     /* merge the main (r1) and run-time (r2) code blocks into one */
6544     {
6545         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6546         struct reg_code_block *new_block, *dst;
6547         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6548         int i1 = 0, i2 = 0;
6549
6550         if (!r2->num_code_blocks) /* we guessed wrong */
6551         {
6552             SvREFCNT_dec_NN(qr);
6553             return 1;
6554         }
6555
6556         Newx(new_block,
6557             r1->num_code_blocks + r2->num_code_blocks,
6558             struct reg_code_block);
6559         dst = new_block;
6560
6561         while (    i1 < r1->num_code_blocks
6562                 || i2 < r2->num_code_blocks)
6563         {
6564             struct reg_code_block *src;
6565             bool is_qr = 0;
6566
6567             if (i1 == r1->num_code_blocks) {
6568                 src = &r2->code_blocks[i2++];
6569                 is_qr = 1;
6570             }
6571             else if (i2 == r2->num_code_blocks)
6572                 src = &r1->code_blocks[i1++];
6573             else if (  r1->code_blocks[i1].start
6574                      < r2->code_blocks[i2].start)
6575             {
6576                 src = &r1->code_blocks[i1++];
6577                 assert(src->end < r2->code_blocks[i2].start);
6578             }
6579             else {
6580                 assert(  r1->code_blocks[i1].start
6581                        > r2->code_blocks[i2].start);
6582                 src = &r2->code_blocks[i2++];
6583                 is_qr = 1;
6584                 assert(src->end < r1->code_blocks[i1].start);
6585             }
6586
6587             assert(pat[src->start] == '(');
6588             assert(pat[src->end]   == ')');
6589             dst->start      = src->start;
6590             dst->end        = src->end;
6591             dst->block      = src->block;
6592             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6593                                     : src->src_regex;
6594             dst++;
6595         }
6596         r1->num_code_blocks += r2->num_code_blocks;
6597         Safefree(r1->code_blocks);
6598         r1->code_blocks = new_block;
6599     }
6600
6601     SvREFCNT_dec_NN(qr);
6602     return 1;
6603 }
6604
6605
6606 STATIC bool
6607 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6608                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6609                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6610                       STRLEN longest_length, bool eol, bool meol)
6611 {
6612     /* This is the common code for setting up the floating and fixed length
6613      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6614      * as to whether succeeded or not */
6615
6616     I32 t;
6617     SSize_t ml;
6618
6619     if (! (longest_length
6620            || (eol /* Can't have SEOL and MULTI */
6621                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6622           )
6623             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6624         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6625     {
6626         return FALSE;
6627     }
6628
6629     /* copy the information about the longest from the reg_scan_data
6630         over to the program. */
6631     if (SvUTF8(sv_longest)) {
6632         *rx_utf8 = sv_longest;
6633         *rx_substr = NULL;
6634     } else {
6635         *rx_substr = sv_longest;
6636         *rx_utf8 = NULL;
6637     }
6638     /* end_shift is how many chars that must be matched that
6639         follow this item. We calculate it ahead of time as once the
6640         lookbehind offset is added in we lose the ability to correctly
6641         calculate it.*/
6642     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6643     *rx_end_shift = ml - offset
6644         - longest_length + (SvTAIL(sv_longest) != 0)
6645         + lookbehind;
6646
6647     t = (eol/* Can't have SEOL and MULTI */
6648          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6649     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6650
6651     return TRUE;
6652 }
6653
6654 /*
6655  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6656  * regular expression into internal code.
6657  * The pattern may be passed either as:
6658  *    a list of SVs (patternp plus pat_count)
6659  *    a list of OPs (expr)
6660  * If both are passed, the SV list is used, but the OP list indicates
6661  * which SVs are actually pre-compiled code blocks
6662  *
6663  * The SVs in the list have magic and qr overloading applied to them (and
6664  * the list may be modified in-place with replacement SVs in the latter
6665  * case).
6666  *
6667  * If the pattern hasn't changed from old_re, then old_re will be
6668  * returned.
6669  *
6670  * eng is the current engine. If that engine has an op_comp method, then
6671  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6672  * do the initial concatenation of arguments and pass on to the external
6673  * engine.
6674  *
6675  * If is_bare_re is not null, set it to a boolean indicating whether the
6676  * arg list reduced (after overloading) to a single bare regex which has
6677  * been returned (i.e. /$qr/).
6678  *
6679  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6680  *
6681  * pm_flags contains the PMf_* flags, typically based on those from the
6682  * pm_flags field of the related PMOP. Currently we're only interested in
6683  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6684  *
6685  * We can't allocate space until we know how big the compiled form will be,
6686  * but we can't compile it (and thus know how big it is) until we've got a
6687  * place to put the code.  So we cheat:  we compile it twice, once with code
6688  * generation turned off and size counting turned on, and once "for real".
6689  * This also means that we don't allocate space until we are sure that the
6690  * thing really will compile successfully, and we never have to move the
6691  * code and thus invalidate pointers into it.  (Note that it has to be in
6692  * one piece because free() must be able to free it all.) [NB: not true in perl]
6693  *
6694  * Beware that the optimization-preparation code in here knows about some
6695  * of the structure of the compiled regexp.  [I'll say.]
6696  */
6697
6698 REGEXP *
6699 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6700                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6701                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6702 {
6703     REGEXP *rx;
6704     struct regexp *r;
6705     regexp_internal *ri;
6706     STRLEN plen;
6707     char *exp;
6708     regnode *scan;
6709     I32 flags;
6710     SSize_t minlen = 0;
6711     U32 rx_flags;
6712     SV *pat;
6713     SV *code_blocksv = NULL;
6714     SV** new_patternp = patternp;
6715
6716     /* these are all flags - maybe they should be turned
6717      * into a single int with different bit masks */
6718     I32 sawlookahead = 0;
6719     I32 sawplus = 0;
6720     I32 sawopen = 0;
6721     I32 sawminmod = 0;
6722
6723     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6724     bool recompile = 0;
6725     bool runtime_code = 0;
6726     scan_data_t data;
6727     RExC_state_t RExC_state;
6728     RExC_state_t * const pRExC_state = &RExC_state;
6729 #ifdef TRIE_STUDY_OPT
6730     int restudied = 0;
6731     RExC_state_t copyRExC_state;
6732 #endif
6733     GET_RE_DEBUG_FLAGS_DECL;
6734
6735     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6736
6737     DEBUG_r(if (!PL_colorset) reginitcolors());
6738
6739     /* Initialize these here instead of as-needed, as is quick and avoids
6740      * having to test them each time otherwise */
6741     if (! PL_AboveLatin1) {
6742 #ifdef DEBUGGING
6743         char * dump_len_string;
6744 #endif
6745
6746         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6747         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6748         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6749         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6750         PL_HasMultiCharFold =
6751                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6752
6753         /* This is calculated here, because the Perl program that generates the
6754          * static global ones doesn't currently have access to
6755          * NUM_ANYOF_CODE_POINTS */
6756         PL_InBitmap = _new_invlist(2);
6757         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6758                                                     NUM_ANYOF_CODE_POINTS - 1);
6759 #ifdef DEBUGGING
6760         dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
6761         if (   ! dump_len_string
6762             || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
6763         {
6764             PL_dump_re_max_len = 0;
6765         }
6766 #endif
6767     }
6768
6769     pRExC_state->warn_text = NULL;
6770     pRExC_state->code_blocks = NULL;
6771     pRExC_state->num_code_blocks = 0;
6772
6773     if (is_bare_re)
6774         *is_bare_re = FALSE;
6775
6776     if (expr && (expr->op_type == OP_LIST ||
6777                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6778         /* allocate code_blocks if needed */
6779         OP *o;
6780         int ncode = 0;
6781
6782         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6783             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6784                 ncode++; /* count of DO blocks */
6785         if (ncode) {
6786             pRExC_state->num_code_blocks = ncode;
6787             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6788         }
6789     }
6790
6791     if (!pat_count) {
6792         /* compile-time pattern with just OP_CONSTs and DO blocks */
6793
6794         int n;
6795         OP *o;
6796
6797         /* find how many CONSTs there are */
6798         assert(expr);
6799         n = 0;
6800         if (expr->op_type == OP_CONST)
6801             n = 1;
6802         else
6803             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6804                 if (o->op_type == OP_CONST)
6805                     n++;
6806             }
6807
6808         /* fake up an SV array */
6809
6810         assert(!new_patternp);
6811         Newx(new_patternp, n, SV*);
6812         SAVEFREEPV(new_patternp);
6813         pat_count = n;
6814
6815         n = 0;
6816         if (expr->op_type == OP_CONST)
6817             new_patternp[n] = cSVOPx_sv(expr);
6818         else
6819             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6820                 if (o->op_type == OP_CONST)
6821                     new_patternp[n++] = cSVOPo_sv;
6822             }
6823
6824     }
6825
6826     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6827         "Assembling pattern from %d elements%s\n", pat_count,
6828             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6829
6830     /* set expr to the first arg op */
6831
6832     if (pRExC_state->num_code_blocks
6833          && expr->op_type != OP_CONST)
6834     {
6835             expr = cLISTOPx(expr)->op_first;
6836             assert(   expr->op_type == OP_PUSHMARK
6837                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6838                    || expr->op_type == OP_PADRANGE);
6839             expr = OpSIBLING(expr);
6840     }
6841
6842     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6843                         expr, &recompile, NULL);
6844
6845     /* handle bare (possibly after overloading) regex: foo =~ $re */
6846     {
6847         SV *re = pat;
6848         if (SvROK(re))
6849             re = SvRV(re);
6850         if (SvTYPE(re) == SVt_REGEXP) {
6851             if (is_bare_re)
6852                 *is_bare_re = TRUE;
6853             SvREFCNT_inc(re);
6854             Safefree(pRExC_state->code_blocks);
6855             DEBUG_PARSE_r(Perl_re_printf( aTHX_
6856                 "Precompiled pattern%s\n",
6857                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6858
6859             return (REGEXP*)re;
6860         }
6861     }
6862
6863     exp = SvPV_nomg(pat, plen);
6864
6865     if (!eng->op_comp) {
6866         if ((SvUTF8(pat) && IN_BYTES)
6867                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6868         {
6869             /* make a temporary copy; either to convert to bytes,
6870              * or to avoid repeating get-magic / overloaded stringify */
6871             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6872                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6873         }
6874         Safefree(pRExC_state->code_blocks);
6875         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6876     }
6877
6878     /* ignore the utf8ness if the pattern is 0 length */
6879     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6880
6881     RExC_uni_semantics = 0;
6882     RExC_seen_unfolded_sharp_s = 0;
6883     RExC_contains_locale = 0;
6884     RExC_contains_i = 0;
6885     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6886     RExC_study_started = 0;
6887     pRExC_state->runtime_code_qr = NULL;
6888     RExC_frame_head= NULL;
6889     RExC_frame_last= NULL;
6890     RExC_frame_count= 0;
6891
6892     DEBUG_r({
6893         RExC_mysv1= sv_newmortal();
6894         RExC_mysv2= sv_newmortal();
6895     });
6896     DEBUG_COMPILE_r({
6897             SV *dsv= sv_newmortal();
6898             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6899             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
6900                           PL_colors[4],PL_colors[5],s);
6901         });
6902
6903   redo_first_pass:
6904     /* we jump here if we have to recompile, e.g., from upgrading the pattern
6905      * to utf8 */
6906
6907     if ((pm_flags & PMf_USE_RE_EVAL)
6908                 /* this second condition covers the non-regex literal case,
6909                  * i.e.  $foo =~ '(?{})'. */
6910                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6911     )
6912         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6913
6914     /* return old regex if pattern hasn't changed */
6915     /* XXX: note in the below we have to check the flags as well as the
6916      * pattern.
6917      *
6918      * Things get a touch tricky as we have to compare the utf8 flag
6919      * independently from the compile flags.  */
6920
6921     if (   old_re
6922         && !recompile
6923         && !!RX_UTF8(old_re) == !!RExC_utf8
6924         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6925         && RX_PRECOMP(old_re)
6926         && RX_PRELEN(old_re) == plen
6927         && memEQ(RX_PRECOMP(old_re), exp, plen)
6928         && !runtime_code /* with runtime code, always recompile */ )
6929     {
6930         Safefree(pRExC_state->code_blocks);
6931         return old_re;
6932     }
6933
6934     rx_flags = orig_rx_flags;
6935
6936     if (rx_flags & PMf_FOLD) {
6937         RExC_contains_i = 1;
6938     }
6939     if (   initial_charset == REGEX_DEPENDS_CHARSET
6940         && (RExC_utf8 ||RExC_uni_semantics))
6941     {
6942
6943         /* Set to use unicode semantics if the pattern is in utf8 and has the
6944          * 'depends' charset specified, as it means unicode when utf8  */
6945         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6946     }
6947
6948     RExC_precomp = exp;
6949     RExC_precomp_adj = 0;
6950     RExC_flags = rx_flags;
6951     RExC_pm_flags = pm_flags;
6952
6953     if (runtime_code) {
6954         assert(TAINTING_get || !TAINT_get);
6955         if (TAINT_get)
6956             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6957
6958         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6959             /* whoops, we have a non-utf8 pattern, whilst run-time code
6960              * got compiled as utf8. Try again with a utf8 pattern */
6961             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6962                                     pRExC_state->num_code_blocks);
6963             goto redo_first_pass;
6964         }
6965     }
6966     assert(!pRExC_state->runtime_code_qr);
6967
6968     RExC_sawback = 0;
6969
6970     RExC_seen = 0;
6971     RExC_maxlen = 0;
6972     RExC_in_lookbehind = 0;
6973     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6974     RExC_extralen = 0;
6975     RExC_override_recoding = 0;
6976 #ifdef EBCDIC
6977     RExC_recode_x_to_native = 0;
6978 #endif
6979     RExC_in_multi_char_class = 0;
6980
6981     /* First pass: determine size, legality. */
6982     RExC_parse = exp;
6983     RExC_start = RExC_adjusted_start = exp;
6984     RExC_end = exp + plen;
6985     RExC_precomp_end = RExC_end;
6986     RExC_naughty = 0;
6987     RExC_npar = 1;
6988     RExC_nestroot = 0;
6989     RExC_size = 0L;
6990     RExC_emit = (regnode *) &RExC_emit_dummy;
6991     RExC_whilem_seen = 0;
6992     RExC_open_parens = NULL;
6993     RExC_close_parens = NULL;
6994     RExC_end_op = NULL;
6995     RExC_paren_names = NULL;
6996 #ifdef DEBUGGING
6997     RExC_paren_name_list = NULL;
6998 #endif
6999     RExC_recurse = NULL;
7000     RExC_study_chunk_recursed = NULL;
7001     RExC_study_chunk_recursed_bytes= 0;
7002     RExC_recurse_count = 0;
7003     pRExC_state->code_index = 0;
7004
7005     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7006      * code makes sure the final byte is an uncounted NUL.  But should this
7007      * ever not be the case, lots of things could read beyond the end of the
7008      * buffer: loops like
7009      *      while(isFOO(*RExC_parse)) RExC_parse++;
7010      *      strchr(RExC_parse, "foo");
7011      * etc.  So it is worth noting. */
7012     assert(*RExC_end == '\0');
7013
7014     DEBUG_PARSE_r(
7015         Perl_re_printf( aTHX_  "Starting first pass (sizing)\n");
7016         RExC_lastnum=0;
7017         RExC_lastparse=NULL;
7018     );
7019     /* reg may croak on us, not giving us a chance to free
7020        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
7021        need it to survive as long as the regexp (qr/(?{})/).
7022        We must check that code_blocksv is not already set, because we may
7023        have jumped back to restart the sizing pass. */
7024     if (pRExC_state->code_blocks && !code_blocksv) {
7025         code_blocksv = newSV_type(SVt_PV);
7026         SAVEFREESV(code_blocksv);
7027         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
7028         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
7029     }
7030     if (reg(pRExC_state, 0, &flags,1) == NULL) {
7031         /* It's possible to write a regexp in ascii that represents Unicode
7032         codepoints outside of the byte range, such as via \x{100}. If we
7033         detect such a sequence we have to convert the entire pattern to utf8
7034         and then recompile, as our sizing calculation will have been based
7035         on 1 byte == 1 character, but we will need to use utf8 to encode
7036         at least some part of the pattern, and therefore must convert the whole
7037         thing.
7038         -- dmq */
7039         if (flags & RESTART_PASS1) {
7040             if (flags & NEED_UTF8) {
7041                 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7042                                     pRExC_state->num_code_blocks);
7043             }
7044             else {
7045                 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7046                 "Need to redo pass 1\n"));
7047             }
7048
7049             goto redo_first_pass;
7050         }
7051         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
7052     }
7053     if (code_blocksv)
7054         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
7055
7056     DEBUG_PARSE_r({
7057         Perl_re_printf( aTHX_
7058             "Required size %"IVdf" nodes\n"
7059             "Starting second pass (creation)\n",
7060             (IV)RExC_size);
7061         RExC_lastnum=0;
7062         RExC_lastparse=NULL;
7063     });
7064
7065     /* The first pass could have found things that force Unicode semantics */
7066     if ((RExC_utf8 || RExC_uni_semantics)
7067          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
7068     {
7069         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7070     }
7071
7072     /* Small enough for pointer-storage convention?
7073        If extralen==0, this means that we will not need long jumps. */
7074     if (RExC_size >= 0x10000L && RExC_extralen)
7075         RExC_size += RExC_extralen;
7076     else
7077         RExC_extralen = 0;
7078     if (RExC_whilem_seen > 15)
7079         RExC_whilem_seen = 15;
7080
7081     /* Allocate space and zero-initialize. Note, the two step process
7082        of zeroing when in debug mode, thus anything assigned has to
7083        happen after that */
7084     rx = (REGEXP*) newSV_type(SVt_REGEXP);
7085     r = ReANY(rx);
7086     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7087          char, regexp_internal);
7088     if ( r == NULL || ri == NULL )
7089         FAIL("Regexp out of space");
7090 #ifdef DEBUGGING
7091     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
7092     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7093          char);
7094 #else
7095     /* bulk initialize base fields with 0. */
7096     Zero(ri, sizeof(regexp_internal), char);
7097 #endif
7098
7099     /* non-zero initialization begins here */
7100     RXi_SET( r, ri );
7101     r->engine= eng;
7102     r->extflags = rx_flags;
7103     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7104
7105     if (pm_flags & PMf_IS_QR) {
7106         ri->code_blocks = pRExC_state->code_blocks;
7107         ri->num_code_blocks = pRExC_state->num_code_blocks;
7108     }
7109     else
7110     {
7111         int n;
7112         for (n = 0; n < pRExC_state->num_code_blocks; n++)
7113             if (pRExC_state->code_blocks[n].src_regex)
7114                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
7115         if(pRExC_state->code_blocks)
7116             SAVEFREEPV(pRExC_state->code_blocks); /* often null */
7117     }
7118
7119     {
7120         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7121         bool has_charset = (get_regex_charset(r->extflags)
7122                                                     != REGEX_DEPENDS_CHARSET);
7123
7124         /* The caret is output if there are any defaults: if not all the STD
7125          * flags are set, or if no character set specifier is needed */
7126         bool has_default =
7127                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7128                     || ! has_charset);
7129         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7130                                                    == REG_RUN_ON_COMMENT_SEEN);
7131         U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
7132                             >> RXf_PMf_STD_PMMOD_SHIFT);
7133         const char *fptr = STD_PAT_MODS;        /*"msixn"*/
7134         char *p;
7135
7136         /* We output all the necessary flags; we never output a minus, as all
7137          * those are defaults, so are
7138          * covered by the caret */
7139         const STRLEN wraplen = plen + has_p + has_runon
7140             + has_default       /* If needs a caret */
7141             + PL_bitcount[reganch] /* 1 char for each set standard flag */
7142
7143                 /* If needs a character set specifier */
7144             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7145             + (sizeof("(?:)") - 1);
7146
7147         /* make sure PL_bitcount bounds not exceeded */
7148         assert(sizeof(STD_PAT_MODS) <= 8);
7149
7150         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
7151         r->xpv_len_u.xpvlenu_pv = p;
7152         if (RExC_utf8)
7153             SvFLAGS(rx) |= SVf_UTF8;
7154         *p++='('; *p++='?';
7155
7156         /* If a default, cover it using the caret */
7157         if (has_default) {
7158             *p++= DEFAULT_PAT_MOD;
7159         }
7160         if (has_charset) {
7161             STRLEN len;
7162             const char* const name = get_regex_charset_name(r->extflags, &len);
7163             Copy(name, p, len, char);
7164             p += len;
7165         }
7166         if (has_p)
7167             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7168         {
7169             char ch;
7170             while((ch = *fptr++)) {
7171                 if(reganch & 1)
7172                     *p++ = ch;
7173                 reganch >>= 1;
7174             }
7175         }
7176
7177         *p++ = ':';
7178         Copy(RExC_precomp, p, plen, char);
7179         assert ((RX_WRAPPED(rx) - p) < 16);
7180         r->pre_prefix = p - RX_WRAPPED(rx);
7181         p += plen;
7182         if (has_runon)
7183             *p++ = '\n';
7184         *p++ = ')';
7185         *p = 0;
7186         SvCUR_set(rx, p - RX_WRAPPED(rx));
7187     }
7188
7189     r->intflags = 0;
7190     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
7191
7192     /* Useful during FAIL. */
7193 #ifdef RE_TRACK_PATTERN_OFFSETS
7194     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
7195     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7196                           "%s %"UVuf" bytes for offset annotations.\n",
7197                           ri->u.offsets ? "Got" : "Couldn't get",
7198                           (UV)((2*RExC_size+1) * sizeof(U32))));
7199 #endif
7200     SetProgLen(ri,RExC_size);
7201     RExC_rx_sv = rx;
7202     RExC_rx = r;
7203     RExC_rxi = ri;
7204
7205     /* Second pass: emit code. */
7206     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7207     RExC_pm_flags = pm_flags;
7208     RExC_parse = exp;
7209     RExC_end = exp + plen;
7210     RExC_naughty = 0;
7211     RExC_emit_start = ri->program;
7212     RExC_emit = ri->program;
7213     RExC_emit_bound = ri->program + RExC_size + 1;
7214     pRExC_state->code_index = 0;
7215
7216     *((char*) RExC_emit++) = (char) REG_MAGIC;
7217     /* setup various meta data about recursion, this all requires
7218      * RExC_npar to be correctly set, and a bit later on we clear it */
7219     if (RExC_seen & REG_RECURSE_SEEN) {
7220         DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
7221             "%*s%*s Setting up open/close parens\n",
7222                   22, "|    |", (int)(0 * 2 + 1), ""));
7223
7224         /* setup RExC_open_parens, which holds the address of each
7225          * OPEN tag, and to make things simpler for the 0 index
7226          * the start of the program - this is used later for offsets */
7227         Newxz(RExC_open_parens, RExC_npar,regnode *);
7228         SAVEFREEPV(RExC_open_parens);
7229         RExC_open_parens[0] = RExC_emit;
7230
7231         /* setup RExC_close_parens, which holds the address of each
7232          * CLOSE tag, and to make things simpler for the 0 index
7233          * the end of the program - this is used later for offsets */
7234         Newxz(RExC_close_parens, RExC_npar,regnode *);
7235         SAVEFREEPV(RExC_close_parens);
7236         /* we dont know where end op starts yet, so we dont
7237          * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */
7238
7239         /* Note, RExC_npar is 1 + the number of parens in a pattern.
7240          * So its 1 if there are no parens. */
7241         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
7242                                          ((RExC_npar & 0x07) != 0);
7243         Newx(RExC_study_chunk_recursed,
7244              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7245         SAVEFREEPV(RExC_study_chunk_recursed);
7246     }
7247     RExC_npar = 1;
7248     if (reg(pRExC_state, 0, &flags,1) == NULL) {
7249         ReREFCNT_dec(rx);
7250         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
7251     }
7252     DEBUG_OPTIMISE_r(
7253         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7254     );
7255
7256     /* XXXX To minimize changes to RE engine we always allocate
7257        3-units-long substrs field. */
7258     Newx(r->substrs, 1, struct reg_substr_data);
7259     if (RExC_recurse_count) {
7260         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
7261         SAVEFREEPV(RExC_recurse);
7262     }
7263
7264   reStudy:
7265     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7266     DEBUG_r(
7267         RExC_study_chunk_recursed_count= 0;
7268     );
7269     Zero(r->substrs, 1, struct reg_substr_data);
7270     if (RExC_study_chunk_recursed) {
7271         Zero(RExC_study_chunk_recursed,
7272              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7273     }
7274
7275
7276 #ifdef TRIE_STUDY_OPT
7277     if (!restudied) {
7278         StructCopy(&zero_scan_data, &data, scan_data_t);
7279         copyRExC_state = RExC_state;
7280     } else {
7281         U32 seen=RExC_seen;
7282         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7283
7284         RExC_state = copyRExC_state;
7285         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7286             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7287         else
7288             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7289         StructCopy(&zero_scan_data, &data, scan_data_t);
7290     }
7291 #else
7292     StructCopy(&zero_scan_data, &data, scan_data_t);
7293 #endif
7294
7295     /* Dig out information for optimizations. */
7296     r->extflags = RExC_flags; /* was pm_op */
7297     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7298
7299     if (UTF)
7300         SvUTF8_on(rx);  /* Unicode in it? */
7301     ri->regstclass = NULL;
7302     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
7303         r->intflags |= PREGf_NAUGHTY;
7304     scan = ri->program + 1;             /* First BRANCH. */
7305
7306     /* testing for BRANCH here tells us whether there is "must appear"
7307        data in the pattern. If there is then we can use it for optimisations */
7308     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7309                                                   */
7310         SSize_t fake;
7311         STRLEN longest_float_length, longest_fixed_length;
7312         regnode_ssc ch_class; /* pointed to by data */
7313         int stclass_flag;
7314         SSize_t last_close = 0; /* pointed to by data */
7315         regnode *first= scan;
7316         regnode *first_next= regnext(first);
7317         /*
7318          * Skip introductions and multiplicators >= 1
7319          * so that we can extract the 'meat' of the pattern that must
7320          * match in the large if() sequence following.
7321          * NOTE that EXACT is NOT covered here, as it is normally
7322          * picked up by the optimiser separately.
7323          *
7324          * This is unfortunate as the optimiser isnt handling lookahead
7325          * properly currently.
7326          *
7327          */
7328         while ((OP(first) == OPEN && (sawopen = 1)) ||
7329                /* An OR of *one* alternative - should not happen now. */
7330             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7331             /* for now we can't handle lookbehind IFMATCH*/
7332             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7333             (OP(first) == PLUS) ||
7334             (OP(first) == MINMOD) ||
7335                /* An {n,m} with n>0 */
7336             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7337             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7338         {
7339                 /*
7340                  * the only op that could be a regnode is PLUS, all the rest
7341                  * will be regnode_1 or regnode_2.
7342                  *
7343                  * (yves doesn't think this is true)
7344                  */
7345                 if (OP(first) == PLUS)
7346                     sawplus = 1;
7347                 else {
7348                     if (OP(first) == MINMOD)
7349                         sawminmod = 1;
7350                     first += regarglen[OP(first)];
7351                 }
7352                 first = NEXTOPER(first);
7353                 first_next= regnext(first);
7354         }
7355
7356         /* Starting-point info. */
7357       again:
7358         DEBUG_PEEP("first:",first,0);
7359         /* Ignore EXACT as we deal with it later. */
7360         if (PL_regkind[OP(first)] == EXACT) {
7361             if (OP(first) == EXACT || OP(first) == EXACTL)
7362                 NOOP;   /* Empty, get anchored substr later. */
7363             else
7364                 ri->regstclass = first;
7365         }
7366 #ifdef TRIE_STCLASS
7367         else if (PL_regkind[OP(first)] == TRIE &&
7368                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7369         {
7370             /* this can happen only on restudy */
7371             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7372         }
7373 #endif
7374         else if (REGNODE_SIMPLE(OP(first)))
7375             ri->regstclass = first;
7376         else if (PL_regkind[OP(first)] == BOUND ||
7377                  PL_regkind[OP(first)] == NBOUND)
7378             ri->regstclass = first;
7379         else if (PL_regkind[OP(first)] == BOL) {
7380             r->intflags |= (OP(first) == MBOL
7381                            ? PREGf_ANCH_MBOL
7382                            : PREGf_ANCH_SBOL);
7383             first = NEXTOPER(first);
7384             goto again;
7385         }
7386         else if (OP(first) == GPOS) {
7387             r->intflags |= PREGf_ANCH_GPOS;
7388             first = NEXTOPER(first);
7389             goto again;
7390         }
7391         else if ((!sawopen || !RExC_sawback) &&
7392             !sawlookahead &&
7393             (OP(first) == STAR &&
7394             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7395             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7396         {
7397             /* turn .* into ^.* with an implied $*=1 */
7398             const int type =
7399                 (OP(NEXTOPER(first)) == REG_ANY)
7400                     ? PREGf_ANCH_MBOL
7401                     : PREGf_ANCH_SBOL;
7402             r->intflags |= (type | PREGf_IMPLICIT);
7403             first = NEXTOPER(first);
7404             goto again;
7405         }
7406         if (sawplus && !sawminmod && !sawlookahead
7407             && (!sawopen || !RExC_sawback)
7408             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7409             /* x+ must match at the 1st pos of run of x's */
7410             r->intflags |= PREGf_SKIP;
7411
7412         /* Scan is after the zeroth branch, first is atomic matcher. */
7413 #ifdef TRIE_STUDY_OPT
7414         DEBUG_PARSE_r(
7415             if (!restudied)
7416                 Perl_re_printf( aTHX_  "first at %"IVdf"\n",
7417                               (IV)(first - scan + 1))
7418         );
7419 #else
7420         DEBUG_PARSE_r(
7421             Perl_re_printf( aTHX_  "first at %"IVdf"\n",
7422                 (IV)(first - scan + 1))
7423         );
7424 #endif
7425
7426
7427         /*
7428         * If there's something expensive in the r.e., find the
7429         * longest literal string that must appear and make it the
7430         * regmust.  Resolve ties in favor of later strings, since
7431         * the regstart check works with the beginning of the r.e.
7432         * and avoiding duplication strengthens checking.  Not a
7433         * strong reason, but sufficient in the absence of others.
7434         * [Now we resolve ties in favor of the earlier string if
7435         * it happens that c_offset_min has been invalidated, since the
7436         * earlier string may buy us something the later one won't.]
7437         */
7438
7439         data.longest_fixed = newSVpvs("");
7440         data.longest_float = newSVpvs("");
7441         data.last_found = newSVpvs("");
7442         data.longest = &(data.longest_fixed);
7443         ENTER_with_name("study_chunk");
7444         SAVEFREESV(data.longest_fixed);
7445         SAVEFREESV(data.longest_float);
7446         SAVEFREESV(data.last_found);
7447         first = scan;
7448         if (!ri->regstclass) {
7449             ssc_init(pRExC_state, &ch_class);
7450             data.start_class = &ch_class;
7451             stclass_flag = SCF_DO_STCLASS_AND;
7452         } else                          /* XXXX Check for BOUND? */
7453             stclass_flag = 0;
7454         data.last_closep = &last_close;
7455
7456         DEBUG_RExC_seen();
7457         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7458                              scan + RExC_size, /* Up to end */
7459             &data, -1, 0, NULL,
7460             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7461                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7462             0);
7463
7464
7465         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7466
7467
7468         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7469              && data.last_start_min == 0 && data.last_end > 0
7470              && !RExC_seen_zerolen
7471              && !(RExC_seen & REG_VERBARG_SEEN)
7472              && !(RExC_seen & REG_GPOS_SEEN)
7473         ){
7474             r->extflags |= RXf_CHECK_ALL;
7475         }
7476         scan_commit(pRExC_state, &data,&minlen,0);
7477
7478         longest_float_length = CHR_SVLEN(data.longest_float);
7479
7480         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7481                    && data.offset_fixed == data.offset_float_min
7482                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7483             && S_setup_longest (aTHX_ pRExC_state,
7484                                     data.longest_float,
7485                                     &(r->float_utf8),
7486                                     &(r->float_substr),
7487                                     &(r->float_end_shift),
7488                                     data.lookbehind_float,
7489                                     data.offset_float_min,
7490                                     data.minlen_float,
7491                                     longest_float_length,
7492                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
7493                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7494         {
7495             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7496             r->float_max_offset = data.offset_float_max;
7497             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7498                 r->float_max_offset -= data.lookbehind_float;
7499             SvREFCNT_inc_simple_void_NN(data.longest_float);
7500         }
7501         else {
7502             r->float_substr = r->float_utf8 = NULL;
7503             longest_float_length = 0;
7504         }
7505
7506         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7507
7508         if (S_setup_longest (aTHX_ pRExC_state,
7509                                 data.longest_fixed,
7510                                 &(r->anchored_utf8),
7511                                 &(r->anchored_substr),
7512                                 &(r->anchored_end_shift),
7513                                 data.lookbehind_fixed,
7514                                 data.offset_fixed,
7515                                 data.minlen_fixed,
7516                                 longest_fixed_length,
7517                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7518                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7519         {
7520             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7521             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7522         }
7523         else {
7524             r->anchored_substr = r->anchored_utf8 = NULL;
7525             longest_fixed_length = 0;
7526         }
7527         LEAVE_with_name("study_chunk");
7528
7529         if (ri->regstclass
7530             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7531             ri->regstclass = NULL;
7532
7533         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7534             && stclass_flag
7535             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7536             && is_ssc_worth_it(pRExC_state, data.start_class))
7537         {
7538             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7539
7540             ssc_finalize(pRExC_state, data.start_class);
7541
7542             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7543             StructCopy(data.start_class,
7544                        (regnode_ssc*)RExC_rxi->data->data[n],
7545                        regnode_ssc);
7546             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7547             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7548             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7549                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7550                       Perl_re_printf( aTHX_
7551                                     "synthetic stclass \"%s\".\n",
7552                                     SvPVX_const(sv));});
7553             data.start_class = NULL;
7554         }
7555
7556         /* A temporary algorithm prefers floated substr to fixed one to dig
7557          * more info. */
7558         if (longest_fixed_length > longest_float_length) {
7559             r->substrs->check_ix = 0;
7560             r->check_end_shift = r->anchored_end_shift;
7561             r->check_substr = r->anchored_substr;
7562             r->check_utf8 = r->anchored_utf8;
7563             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7564             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7565                 r->intflags |= PREGf_NOSCAN;
7566         }
7567         else {
7568             r->substrs->check_ix = 1;
7569             r->check_end_shift = r->float_end_shift;
7570             r->check_substr = r->float_substr;
7571             r->check_utf8 = r->float_utf8;
7572             r->check_offset_min = r->float_min_offset;
7573             r->check_offset_max = r->float_max_offset;
7574         }
7575         if ((r->check_substr || r->check_utf8) ) {
7576             r->extflags |= RXf_USE_INTUIT;
7577             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7578                 r->extflags |= RXf_INTUIT_TAIL;
7579         }
7580         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7581
7582         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7583         if ( (STRLEN)minlen < longest_float_length )
7584             minlen= longest_float_length;
7585         if ( (STRLEN)minlen < longest_fixed_length )
7586             minlen= longest_fixed_length;
7587         */
7588     }
7589     else {
7590         /* Several toplevels. Best we can is to set minlen. */
7591         SSize_t fake;
7592         regnode_ssc ch_class;
7593         SSize_t last_close = 0;
7594
7595         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
7596
7597         scan = ri->program + 1;
7598         ssc_init(pRExC_state, &ch_class);
7599         data.start_class = &ch_class;
7600         data.last_closep = &last_close;
7601
7602         DEBUG_RExC_seen();
7603         minlen = study_chunk(pRExC_state,
7604             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7605             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7606                                                       ? SCF_TRIE_DOING_RESTUDY
7607                                                       : 0),
7608             0);
7609
7610         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7611
7612         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7613                 = r->float_substr = r->float_utf8 = NULL;
7614
7615         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7616             && is_ssc_worth_it(pRExC_state, data.start_class))
7617         {
7618             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7619
7620             ssc_finalize(pRExC_state, data.start_class);
7621
7622             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7623             StructCopy(data.start_class,
7624                        (regnode_ssc*)RExC_rxi->data->data[n],
7625                        regnode_ssc);
7626             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7627             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7628             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7629                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7630                       Perl_re_printf( aTHX_
7631                                     "synthetic stclass \"%s\".\n",
7632                                     SvPVX_const(sv));});
7633             data.start_class = NULL;
7634         }
7635     }
7636
7637     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7638         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7639         r->maxlen = REG_INFTY;
7640     }
7641     else {
7642         r->maxlen = RExC_maxlen;
7643     }
7644
7645     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7646        the "real" pattern. */
7647     DEBUG_OPTIMISE_r({
7648         Perl_re_printf( aTHX_ "minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7649                       (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7650     });
7651     r->minlenret = minlen;
7652     if (r->minlen < minlen)
7653         r->minlen = minlen;
7654
7655     if (RExC_seen & REG_RECURSE_SEEN ) {
7656         r->intflags |= PREGf_RECURSE_SEEN;
7657         Newxz(r->recurse_locinput, r->nparens + 1, char *);
7658     }
7659     if (RExC_seen & REG_GPOS_SEEN)
7660         r->intflags |= PREGf_GPOS_SEEN;
7661     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7662         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7663                                                 lookbehind */
7664     if (pRExC_state->num_code_blocks)
7665         r->extflags |= RXf_EVAL_SEEN;
7666     if (RExC_seen & REG_VERBARG_SEEN)
7667     {
7668         r->intflags |= PREGf_VERBARG_SEEN;
7669         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7670     }
7671     if (RExC_seen & REG_CUTGROUP_SEEN)
7672         r->intflags |= PREGf_CUTGROUP_SEEN;
7673     if (pm_flags & PMf_USE_RE_EVAL)
7674         r->intflags |= PREGf_USE_RE_EVAL;
7675     if (RExC_paren_names)
7676         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7677     else
7678         RXp_PAREN_NAMES(r) = NULL;
7679
7680     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7681      * so it can be used in pp.c */
7682     if (r->intflags & PREGf_ANCH)
7683         r->extflags |= RXf_IS_ANCHORED;
7684
7685
7686     {
7687         /* this is used to identify "special" patterns that might result
7688          * in Perl NOT calling the regex engine and instead doing the match "itself",
7689          * particularly special cases in split//. By having the regex compiler
7690          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7691          * we avoid weird issues with equivalent patterns resulting in different behavior,
7692          * AND we allow non Perl engines to get the same optimizations by the setting the
7693          * flags appropriately - Yves */
7694         regnode *first = ri->program + 1;
7695         U8 fop = OP(first);
7696         regnode *next = regnext(first);
7697         U8 nop = OP(next);
7698
7699         if (PL_regkind[fop] == NOTHING && nop == END)
7700             r->extflags |= RXf_NULL;
7701         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7702             /* when fop is SBOL first->flags will be true only when it was
7703              * produced by parsing /\A/, and not when parsing /^/. This is
7704              * very important for the split code as there we want to
7705              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7706              * See rt #122761 for more details. -- Yves */
7707             r->extflags |= RXf_START_ONLY;
7708         else if (fop == PLUS
7709                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7710                  && nop == END)
7711             r->extflags |= RXf_WHITE;
7712         else if ( r->extflags & RXf_SPLIT
7713                   && (fop == EXACT || fop == EXACTL)
7714                   && STR_LEN(first) == 1
7715                   && *(STRING(first)) == ' '
7716                   && nop == END )
7717             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7718
7719     }
7720
7721     if (RExC_contains_locale) {
7722         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7723     }
7724
7725 #ifdef DEBUGGING
7726     if (RExC_paren_names) {
7727         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7728         ri->data->data[ri->name_list_idx]
7729                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7730     } else
7731 #endif
7732     ri->name_list_idx = 0;
7733
7734     while ( RExC_recurse_count > 0 ) {
7735         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
7736         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan );
7737     }
7738
7739     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7740     /* assume we don't need to swap parens around before we match */
7741     DEBUG_TEST_r({
7742         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
7743             (unsigned long)RExC_study_chunk_recursed_count);
7744     });
7745     DEBUG_DUMP_r({
7746         DEBUG_RExC_seen();
7747         Perl_re_printf( aTHX_ "Final program:\n");
7748         regdump(r);
7749     });
7750 #ifdef RE_TRACK_PATTERN_OFFSETS
7751     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7752         const STRLEN len = ri->u.offsets[0];
7753         STRLEN i;
7754         GET_RE_DEBUG_FLAGS_DECL;
7755         Perl_re_printf( aTHX_
7756                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7757         for (i = 1; i <= len; i++) {
7758             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7759                 Perl_re_printf( aTHX_  "%"UVuf":%"UVuf"[%"UVuf"] ",
7760                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7761             }
7762         Perl_re_printf( aTHX_  "\n");
7763     });
7764 #endif
7765
7766 #ifdef USE_ITHREADS
7767     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7768      * by setting the regexp SV to readonly-only instead. If the
7769      * pattern's been recompiled, the USEDness should remain. */
7770     if (old_re && SvREADONLY(old_re))
7771         SvREADONLY_on(rx);
7772 #endif
7773     return rx;
7774 }
7775
7776
7777 SV*
7778 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7779                     const U32 flags)
7780 {
7781     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7782
7783     PERL_UNUSED_ARG(value);
7784
7785     if (flags & RXapif_FETCH) {
7786         return reg_named_buff_fetch(rx, key, flags);
7787     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7788         Perl_croak_no_modify();
7789         return NULL;
7790     } else if (flags & RXapif_EXISTS) {
7791         return reg_named_buff_exists(rx, key, flags)
7792             ? &PL_sv_yes
7793             : &PL_sv_no;
7794     } else if (flags & RXapif_REGNAMES) {
7795         return reg_named_buff_all(rx, flags);
7796     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7797         return reg_named_buff_scalar(rx, flags);
7798     } else {
7799         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7800         return NULL;
7801     }
7802 }
7803
7804 SV*
7805 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7806                          const U32 flags)
7807 {
7808     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7809     PERL_UNUSED_ARG(lastkey);
7810
7811     if (flags & RXapif_FIRSTKEY)
7812         return reg_named_buff_firstkey(rx, flags);
7813     else if (flags & RXapif_NEXTKEY)
7814         return reg_named_buff_nextkey(rx, flags);
7815     else {
7816         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7817                                             (int)flags);
7818         return NULL;
7819     }
7820 }
7821
7822 SV*
7823 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7824                           const U32 flags)
7825 {
7826     AV *retarray = NULL;
7827     SV *ret;
7828     struct regexp *const rx = ReANY(r);
7829
7830     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7831
7832     if (flags & RXapif_ALL)
7833         retarray=newAV();
7834
7835     if (rx && RXp_PAREN_NAMES(rx)) {
7836         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7837         if (he_str) {
7838             IV i;
7839             SV* sv_dat=HeVAL(he_str);
7840             I32 *nums=(I32*)SvPVX(sv_dat);
7841             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7842                 if ((I32)(rx->nparens) >= nums[i]
7843                     && rx->offs[nums[i]].start != -1
7844                     && rx->offs[nums[i]].end != -1)
7845                 {
7846                     ret = newSVpvs("");
7847                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7848                     if (!retarray)
7849                         return ret;
7850                 } else {
7851                     if (retarray)
7852                         ret = newSVsv(&PL_sv_undef);
7853                 }
7854                 if (retarray)
7855                     av_push(retarray, ret);
7856             }
7857             if (retarray)
7858                 return newRV_noinc(MUTABLE_SV(retarray));
7859         }
7860     }
7861     return NULL;
7862 }
7863
7864 bool
7865 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7866                            const U32 flags)
7867 {
7868     struct regexp *const rx = ReANY(r);
7869
7870     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7871
7872     if (rx && RXp_PAREN_NAMES(rx)) {
7873         if (flags & RXapif_ALL) {
7874             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7875         } else {
7876             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7877             if (sv) {
7878                 SvREFCNT_dec_NN(sv);
7879                 return TRUE;
7880             } else {
7881                 return FALSE;
7882             }
7883         }
7884     } else {
7885         return FALSE;
7886     }
7887 }
7888
7889 SV*
7890 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7891 {
7892     struct regexp *const rx = ReANY(r);
7893
7894     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7895
7896     if ( rx && RXp_PAREN_NAMES(rx) ) {
7897         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7898
7899         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7900     } else {
7901         return FALSE;
7902     }
7903 }
7904
7905 SV*
7906 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7907 {
7908     struct regexp *const rx = ReANY(r);
7909     GET_RE_DEBUG_FLAGS_DECL;
7910
7911     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7912
7913     if (rx && RXp_PAREN_NAMES(rx)) {
7914         HV *hv = RXp_PAREN_NAMES(rx);
7915         HE *temphe;
7916         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7917             IV i;
7918             IV parno = 0;
7919             SV* sv_dat = HeVAL(temphe);
7920             I32 *nums = (I32*)SvPVX(sv_dat);
7921             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7922                 if ((I32)(rx->lastparen) >= nums[i] &&
7923                     rx->offs[nums[i]].start != -1 &&
7924                     rx->offs[nums[i]].end != -1)
7925                 {
7926                     parno = nums[i];
7927                     break;
7928                 }
7929             }
7930             if (parno || flags & RXapif_ALL) {
7931                 return newSVhek(HeKEY_hek(temphe));
7932             }
7933         }
7934     }
7935     return NULL;
7936 }
7937
7938 SV*
7939 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7940 {
7941     SV *ret;
7942     AV *av;
7943     SSize_t length;
7944     struct regexp *const rx = ReANY(r);
7945
7946     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7947
7948     if (rx && RXp_PAREN_NAMES(rx)) {
7949         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7950             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7951         } else if (flags & RXapif_ONE) {
7952             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7953             av = MUTABLE_AV(SvRV(ret));
7954             length = av_tindex(av);
7955             SvREFCNT_dec_NN(ret);
7956             return newSViv(length + 1);
7957         } else {
7958             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7959                                                 (int)flags);
7960             return NULL;
7961         }
7962     }
7963     return &PL_sv_undef;
7964 }
7965
7966 SV*
7967 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7968 {
7969     struct regexp *const rx = ReANY(r);
7970     AV *av = newAV();
7971
7972     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7973
7974     if (rx && RXp_PAREN_NAMES(rx)) {
7975         HV *hv= RXp_PAREN_NAMES(rx);
7976         HE *temphe;
7977         (void)hv_iterinit(hv);
7978         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7979             IV i;
7980             IV parno = 0;
7981             SV* sv_dat = HeVAL(temphe);
7982             I32 *nums = (I32*)SvPVX(sv_dat);
7983             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7984                 if ((I32)(rx->lastparen) >= nums[i] &&
7985                     rx->offs[nums[i]].start != -1 &&
7986                     rx->offs[nums[i]].end != -1)
7987                 {
7988                     parno = nums[i];
7989                     break;
7990                 }
7991             }
7992             if (parno || flags & RXapif_ALL) {
7993                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7994             }
7995         }
7996     }
7997
7998     return newRV_noinc(MUTABLE_SV(av));
7999 }
8000
8001 void
8002 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8003                              SV * const sv)
8004 {
8005     struct regexp *const rx = ReANY(r);
8006     char *s = NULL;
8007     SSize_t i = 0;
8008     SSize_t s1, t1;
8009     I32 n = paren;
8010
8011     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8012
8013     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8014            || n == RX_BUFF_IDX_CARET_FULLMATCH
8015            || n == RX_BUFF_IDX_CARET_POSTMATCH
8016        )
8017     {
8018         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8019         if (!keepcopy) {
8020             /* on something like
8021              *    $r = qr/.../;
8022              *    /$qr/p;
8023              * the KEEPCOPY is set on the PMOP rather than the regex */
8024             if (PL_curpm && r == PM_GETRE(PL_curpm))
8025                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8026         }
8027         if (!keepcopy)
8028             goto ret_undef;
8029     }
8030
8031     if (!rx->subbeg)
8032         goto ret_undef;
8033
8034     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8035         /* no need to distinguish between them any more */
8036         n = RX_BUFF_IDX_FULLMATCH;
8037
8038     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8039         && rx->offs[0].start != -1)
8040     {
8041         /* $`, ${^PREMATCH} */
8042         i = rx->offs[0].start;
8043         s = rx->subbeg;
8044     }
8045     else
8046     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8047         && rx->offs[0].end != -1)
8048     {
8049         /* $', ${^POSTMATCH} */
8050         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8051         i = rx->sublen + rx->suboffset - rx->offs[0].end;
8052     }
8053     else
8054     if ( 0 <= n && n <= (I32)rx->nparens &&
8055         (s1 = rx->offs[n].start) != -1 &&
8056         (t1 = rx->offs[n].end) != -1)
8057     {
8058         /* $&, ${^MATCH},  $1 ... */
8059         i = t1 - s1;
8060         s = rx->subbeg + s1 - rx->suboffset;
8061     } else {
8062         goto ret_undef;
8063     }
8064
8065     assert(s >= rx->subbeg);
8066     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8067     if (i >= 0) {
8068 #ifdef NO_TAINT_SUPPORT
8069         sv_setpvn(sv, s, i);
8070 #else
8071         const int oldtainted = TAINT_get;
8072         TAINT_NOT;
8073         sv_setpvn(sv, s, i);
8074         TAINT_set(oldtainted);
8075 #endif
8076         if (RXp_MATCH_UTF8(rx))
8077             SvUTF8_on(sv);
8078         else
8079             SvUTF8_off(sv);
8080         if (TAINTING_get) {
8081             if (RXp_MATCH_TAINTED(rx)) {
8082                 if (SvTYPE(sv) >= SVt_PVMG) {
8083                     MAGIC* const mg = SvMAGIC(sv);
8084                     MAGIC* mgt;
8085                     TAINT;
8086                     SvMAGIC_set(sv, mg->mg_moremagic);
8087                     SvTAINT(sv);
8088                     if ((mgt = SvMAGIC(sv))) {
8089                         mg->mg_moremagic = mgt;
8090                         SvMAGIC_set(sv, mg);
8091                     }
8092                 } else {
8093                     TAINT;
8094                     SvTAINT(sv);
8095                 }
8096             } else
8097                 SvTAINTED_off(sv);
8098         }
8099     } else {
8100       ret_undef:
8101         sv_setsv(sv,&PL_sv_undef);
8102         return;
8103     }
8104 }
8105
8106 void
8107 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8108                                                          SV const * const value)
8109 {
8110     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8111
8112     PERL_UNUSED_ARG(rx);
8113     PERL_UNUSED_ARG(paren);
8114     PERL_UNUSED_ARG(value);
8115
8116     if (!PL_localizing)
8117         Perl_croak_no_modify();
8118 }
8119
8120 I32
8121 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8122                               const I32 paren)
8123 {
8124     struct regexp *const rx = ReANY(r);
8125     I32 i;
8126     I32 s1, t1;
8127
8128     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8129
8130     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8131         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8132         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8133     )
8134     {
8135         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8136         if (!keepcopy) {
8137             /* on something like
8138              *    $r = qr/.../;
8139              *    /$qr/p;
8140              * the KEEPCOPY is set on the PMOP rather than the regex */
8141             if (PL_curpm && r == PM_GETRE(PL_curpm))
8142                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8143         }
8144         if (!keepcopy)
8145             goto warn_undef;
8146     }
8147
8148     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8149     switch (paren) {
8150       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8151       case RX_BUFF_IDX_PREMATCH:       /* $` */
8152         if (rx->offs[0].start != -1) {
8153                         i = rx->offs[0].start;
8154                         if (i > 0) {
8155                                 s1 = 0;
8156                                 t1 = i;
8157                                 goto getlen;
8158                         }
8159             }
8160         return 0;
8161
8162       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8163       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8164             if (rx->offs[0].end != -1) {
8165                         i = rx->sublen - rx->offs[0].end;
8166                         if (i > 0) {
8167                                 s1 = rx->offs[0].end;
8168                                 t1 = rx->sublen;
8169                                 goto getlen;
8170                         }
8171             }
8172         return 0;
8173
8174       default: /* $& / ${^MATCH}, $1, $2, ... */
8175             if (paren <= (I32)rx->nparens &&
8176             (s1 = rx->offs[paren].start) != -1 &&
8177             (t1 = rx->offs[paren].end) != -1)
8178             {
8179             i = t1 - s1;
8180             goto getlen;
8181         } else {
8182           warn_undef:
8183             if (ckWARN(WARN_UNINITIALIZED))
8184                 report_uninit((const SV *)sv);
8185             return 0;
8186         }
8187     }
8188   getlen:
8189     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8190         const char * const s = rx->subbeg - rx->suboffset + s1;
8191         const U8 *ep;
8192         STRLEN el;
8193
8194         i = t1 - s1;
8195         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8196                         i = el;
8197     }
8198     return i;
8199 }
8200
8201 SV*
8202 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8203 {
8204     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8205         PERL_UNUSED_ARG(rx);
8206         if (0)
8207             return NULL;
8208         else
8209             return newSVpvs("Regexp");
8210 }
8211
8212 /* Scans the name of a named buffer from the pattern.
8213  * If flags is REG_RSN_RETURN_NULL returns null.
8214  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8215  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8216  * to the parsed name as looked up in the RExC_paren_names hash.
8217  * If there is an error throws a vFAIL().. type exception.
8218  */
8219
8220 #define REG_RSN_RETURN_NULL    0
8221 #define REG_RSN_RETURN_NAME    1
8222 #define REG_RSN_RETURN_DATA    2
8223
8224 STATIC SV*
8225 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8226 {
8227     char *name_start = RExC_parse;
8228
8229     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8230
8231     assert (RExC_parse <= RExC_end);
8232     if (RExC_parse == RExC_end) NOOP;
8233     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
8234          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8235           * using do...while */
8236         if (UTF)
8237             do {
8238                 RExC_parse += UTF8SKIP(RExC_parse);
8239             } while (isWORDCHAR_utf8((U8*)RExC_parse));
8240         else
8241             do {
8242                 RExC_parse++;
8243             } while (isWORDCHAR(*RExC_parse));
8244     } else {
8245         RExC_parse++; /* so the <- from the vFAIL is after the offending
8246                          character */
8247         vFAIL("Group name must start with a non-digit word character");
8248     }
8249     if ( flags ) {
8250         SV* sv_name
8251             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8252                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8253         if ( flags == REG_RSN_RETURN_NAME)
8254             return sv_name;
8255         else if (flags==REG_RSN_RETURN_DATA) {
8256             HE *he_str = NULL;
8257             SV *sv_dat = NULL;
8258             if ( ! sv_name )      /* should not happen*/
8259                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8260             if (RExC_paren_names)
8261                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8262             if ( he_str )
8263                 sv_dat = HeVAL(he_str);
8264             if ( ! sv_dat )
8265                 vFAIL("Reference to nonexistent named group");
8266             return sv_dat;
8267         }
8268         else {
8269             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8270                        (unsigned long) flags);
8271         }
8272         NOT_REACHED; /* NOTREACHED */
8273     }
8274     return NULL;
8275 }
8276
8277 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8278     int num;                                                    \
8279     if (RExC_lastparse!=RExC_parse) {                           \
8280         Perl_re_printf( aTHX_  "%s",                                        \
8281             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8282                 RExC_end - RExC_parse, 16,                      \
8283                 "", "",                                         \
8284                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8285                 PERL_PV_PRETTY_ELLIPSES   |                     \
8286                 PERL_PV_PRETTY_LTGT       |                     \
8287                 PERL_PV_ESCAPE_RE         |                     \
8288                 PERL_PV_PRETTY_EXACTSIZE                        \
8289             )                                                   \
8290         );                                                      \
8291     } else                                                      \
8292         Perl_re_printf( aTHX_ "%16s","");                                   \
8293                                                                 \
8294     if (SIZE_ONLY)                                              \
8295        num = RExC_size + 1;                                     \
8296     else                                                        \
8297        num=REG_NODE_NUM(RExC_emit);                             \
8298     if (RExC_lastnum!=num)                                      \
8299        Perl_re_printf( aTHX_ "|%4d",num);                                   \
8300     else                                                        \
8301        Perl_re_printf( aTHX_ "|%4s","");                                    \
8302     Perl_re_printf( aTHX_ "|%*s%-4s",                                       \
8303         (int)((depth*2)), "",                                   \
8304         (funcname)                                              \
8305     );                                                          \
8306     RExC_lastnum=num;                                           \
8307     RExC_lastparse=RExC_parse;                                  \
8308 })
8309
8310
8311
8312 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8313     DEBUG_PARSE_MSG((funcname));                            \
8314     Perl_re_printf( aTHX_ "%4s","\n");                                  \
8315 })
8316 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
8317     DEBUG_PARSE_MSG((funcname));                            \
8318     Perl_re_printf( aTHX_ fmt "\n",args);                               \
8319 })
8320
8321 /* This section of code defines the inversion list object and its methods.  The
8322  * interfaces are highly subject to change, so as much as possible is static to
8323  * this file.  An inversion list is here implemented as a malloc'd C UV array
8324  * as an SVt_INVLIST scalar.
8325  *
8326  * An inversion list for Unicode is an array of code points, sorted by ordinal
8327  * number.  Each element gives the code point that begins a range that extends
8328  * up-to but not including the code point given by the next element.  The final
8329  * element gives the first code point of a range that extends to the platform's
8330  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
8331  * ...) give ranges whose code points are all in the inversion list.  We say
8332  * that those ranges are in the set.  The odd-numbered elements give ranges
8333  * whose code points are not in the inversion list, and hence not in the set.
8334  * Thus, element [0] is the first code point in the list.  Element [1]
8335  * is the first code point beyond that not in the list; and element [2] is the
8336  * first code point beyond that that is in the list.  In other words, the first
8337  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8338  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
8339  * all code points in that range are not in the inversion list.  The third
8340  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8341  * list, and so forth.  Thus every element whose index is divisible by two
8342  * gives the beginning of a range that is in the list, and every element whose
8343  * index is not divisible by two gives the beginning of a range not in the
8344  * list.  If the final element's index is divisible by two, the inversion list
8345  * extends to the platform's infinity; otherwise the highest code point in the
8346  * inversion list is the contents of that element minus 1.
8347  *
8348  * A range that contains just a single code point N will look like
8349  *  invlist[i]   == N
8350  *  invlist[i+1] == N+1
8351  *
8352  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8353  * impossible to represent, so element [i+1] is omitted.  The single element
8354  * inversion list
8355  *  invlist[0] == UV_MAX
8356  * contains just UV_MAX, but is interpreted as matching to infinity.
8357  *
8358  * Taking the complement (inverting) an inversion list is quite simple, if the
8359  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8360  * This implementation reserves an element at the beginning of each inversion
8361  * list to always contain 0; there is an additional flag in the header which
8362  * indicates if the list begins at the 0, or is offset to begin at the next
8363  * element.  This means that the inversion list can be inverted without any
8364  * copying; just flip the flag.
8365  *
8366  * More about inversion lists can be found in "Unicode Demystified"
8367  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8368  *
8369  * The inversion list data structure is currently implemented as an SV pointing
8370  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8371  * array of UV whose memory management is automatically handled by the existing
8372  * facilities for SV's.
8373  *
8374  * Some of the methods should always be private to the implementation, and some
8375  * should eventually be made public */
8376
8377 /* The header definitions are in F<invlist_inline.h> */
8378
8379 #ifndef PERL_IN_XSUB_RE
8380
8381 PERL_STATIC_INLINE UV*
8382 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8383 {
8384     /* Returns a pointer to the first element in the inversion list's array.
8385      * This is called upon initialization of an inversion list.  Where the
8386      * array begins depends on whether the list has the code point U+0000 in it
8387      * or not.  The other parameter tells it whether the code that follows this
8388      * call is about to put a 0 in the inversion list or not.  The first
8389      * element is either the element reserved for 0, if TRUE, or the element
8390      * after it, if FALSE */
8391
8392     bool* offset = get_invlist_offset_addr(invlist);
8393     UV* zero_addr = (UV *) SvPVX(invlist);
8394
8395     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8396
8397     /* Must be empty */
8398     assert(! _invlist_len(invlist));
8399
8400     *zero_addr = 0;
8401
8402     /* 1^1 = 0; 1^0 = 1 */
8403     *offset = 1 ^ will_have_0;
8404     return zero_addr + *offset;
8405 }
8406
8407 #endif
8408
8409 PERL_STATIC_INLINE void
8410 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8411 {
8412     /* Sets the current number of elements stored in the inversion list.
8413      * Updates SvCUR correspondingly */
8414     PERL_UNUSED_CONTEXT;
8415     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8416
8417     assert(SvTYPE(invlist) == SVt_INVLIST);
8418
8419     SvCUR_set(invlist,
8420               (len == 0)
8421                ? 0
8422                : TO_INTERNAL_SIZE(len + offset));
8423     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8424 }
8425
8426 #ifndef PERL_IN_XSUB_RE
8427
8428 STATIC void
8429 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
8430 {
8431     /* Replaces the inversion list in 'src' with the one in 'dest'.  It steals
8432      * the list from 'src', so 'src' is made to have a NULL list.  This is
8433      * similar to what SvSetMagicSV() would do, if it were implemented on
8434      * inversion lists, though this routine avoids a copy */
8435
8436     const UV src_len          = _invlist_len(src);
8437     const bool src_offset     = *get_invlist_offset_addr(src);
8438     const STRLEN src_byte_len = SvLEN(src);
8439     char * array              = SvPVX(src);
8440
8441     const int oldtainted = TAINT_get;
8442
8443     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
8444
8445     assert(SvTYPE(src) == SVt_INVLIST);
8446     assert(SvTYPE(dest) == SVt_INVLIST);
8447     assert(! invlist_is_iterating(src));
8448     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
8449
8450     /* Make sure it ends in the right place with a NUL, as our inversion list
8451      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
8452      * asserts it */
8453     array[src_byte_len - 1] = '\0';
8454
8455     TAINT_NOT;      /* Otherwise it breaks */
8456     sv_usepvn_flags(dest,
8457                     (char *) array,
8458                     src_byte_len - 1,
8459
8460                     /* This flag is documented to cause a copy to be avoided */
8461                     SV_HAS_TRAILING_NUL);
8462     TAINT_set(oldtainted);
8463     SvPV_set(src, 0);
8464     SvLEN_set(src, 0);
8465     SvCUR_set(src, 0);
8466
8467     /* Finish up copying over the other fields in an inversion list */
8468     *get_invlist_offset_addr(dest) = src_offset;
8469     invlist_set_len(dest, src_len, src_offset);
8470     *get_invlist_previous_index_addr(dest) = 0;
8471     invlist_iterfinish(dest);
8472 }
8473
8474 PERL_STATIC_INLINE IV*
8475 S_get_invlist_previous_index_addr(SV* invlist)
8476 {
8477     /* Return the address of the IV that is reserved to hold the cached index
8478      * */
8479     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8480
8481     assert(SvTYPE(invlist) == SVt_INVLIST);
8482
8483     return &(((XINVLIST*) SvANY(invlist))->prev_index);
8484 }
8485
8486 PERL_STATIC_INLINE IV
8487 S_invlist_previous_index(SV* const invlist)
8488 {
8489     /* Returns cached index of previous search */
8490
8491     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8492
8493     return *get_invlist_previous_index_addr(invlist);
8494 }
8495
8496 PERL_STATIC_INLINE void
8497 S_invlist_set_previous_index(SV* const invlist, const IV index)
8498 {
8499     /* Caches <index> for later retrieval */
8500
8501     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8502
8503     assert(index == 0 || index < (int) _invlist_len(invlist));
8504
8505     *get_invlist_previous_index_addr(invlist) = index;
8506 }
8507
8508 PERL_STATIC_INLINE void
8509 S_invlist_trim(SV* invlist)
8510 {
8511     /* Free the not currently-being-used space in an inversion list */
8512
8513     /* But don't free up the space needed for the 0 UV that is always at the
8514      * beginning of the list, nor the trailing NUL */
8515     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
8516
8517     PERL_ARGS_ASSERT_INVLIST_TRIM;
8518
8519     assert(SvTYPE(invlist) == SVt_INVLIST);
8520
8521     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
8522 }
8523
8524 PERL_STATIC_INLINE void
8525 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
8526 {
8527     PERL_ARGS_ASSERT_INVLIST_CLEAR;
8528
8529     assert(SvTYPE(invlist) == SVt_INVLIST);
8530
8531     invlist_set_len(invlist, 0, 0);
8532     invlist_trim(invlist);
8533 }
8534
8535 #endif /* ifndef PERL_IN_XSUB_RE */
8536
8537 PERL_STATIC_INLINE bool
8538 S_invlist_is_iterating(SV* const invlist)
8539 {
8540     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8541
8542     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8543 }
8544
8545 #ifndef PERL_IN_XSUB_RE
8546
8547 PERL_STATIC_INLINE UV
8548 S_invlist_max(SV* const invlist)
8549 {
8550     /* Returns the maximum number of elements storable in the inversion list's
8551      * array, without having to realloc() */
8552
8553     PERL_ARGS_ASSERT_INVLIST_MAX;
8554
8555     assert(SvTYPE(invlist) == SVt_INVLIST);
8556
8557     /* Assumes worst case, in which the 0 element is not counted in the
8558      * inversion list, so subtracts 1 for that */
8559     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8560            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8561            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8562 }
8563 SV*
8564 Perl__new_invlist(pTHX_ IV initial_size)
8565 {
8566
8567     /* Return a pointer to a newly constructed inversion list, with enough
8568      * space to store 'initial_size' elements.  If that number is negative, a
8569      * system default is used instead */
8570
8571     SV* new_list;
8572
8573     if (initial_size < 0) {
8574         initial_size = 10;
8575     }
8576
8577     /* Allocate the initial space */
8578     new_list = newSV_type(SVt_INVLIST);
8579
8580     /* First 1 is in case the zero element isn't in the list; second 1 is for
8581      * trailing NUL */
8582     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8583     invlist_set_len(new_list, 0, 0);
8584
8585     /* Force iterinit() to be used to get iteration to work */
8586     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8587
8588     *get_invlist_previous_index_addr(new_list) = 0;
8589
8590     return new_list;
8591 }
8592
8593 SV*
8594 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8595 {
8596     /* Return a pointer to a newly constructed inversion list, initialized to
8597      * point to <list>, which has to be in the exact correct inversion list
8598      * form, including internal fields.  Thus this is a dangerous routine that
8599      * should not be used in the wrong hands.  The passed in 'list' contains
8600      * several header fields at the beginning that are not part of the
8601      * inversion list body proper */
8602
8603     const STRLEN length = (STRLEN) list[0];
8604     const UV version_id =          list[1];
8605     const bool offset   =    cBOOL(list[2]);
8606 #define HEADER_LENGTH 3
8607     /* If any of the above changes in any way, you must change HEADER_LENGTH
8608      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8609      *      perl -E 'say int(rand 2**31-1)'
8610      */
8611 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8612                                         data structure type, so that one being
8613                                         passed in can be validated to be an
8614                                         inversion list of the correct vintage.
8615                                        */
8616
8617     SV* invlist = newSV_type(SVt_INVLIST);
8618
8619     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8620
8621     if (version_id != INVLIST_VERSION_ID) {
8622         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8623     }
8624
8625     /* The generated array passed in includes header elements that aren't part
8626      * of the list proper, so start it just after them */
8627     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8628
8629     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8630                                shouldn't touch it */
8631
8632     *(get_invlist_offset_addr(invlist)) = offset;
8633
8634     /* The 'length' passed to us is the physical number of elements in the
8635      * inversion list.  But if there is an offset the logical number is one
8636      * less than that */
8637     invlist_set_len(invlist, length  - offset, offset);
8638
8639     invlist_set_previous_index(invlist, 0);
8640
8641     /* Initialize the iteration pointer. */
8642     invlist_iterfinish(invlist);
8643
8644     SvREADONLY_on(invlist);
8645
8646     return invlist;
8647 }
8648
8649 STATIC void
8650 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8651 {
8652     /* Grow the maximum size of an inversion list */
8653
8654     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8655
8656     assert(SvTYPE(invlist) == SVt_INVLIST);
8657
8658     /* Add one to account for the zero element at the beginning which may not
8659      * be counted by the calling parameters */
8660     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8661 }
8662
8663 STATIC void
8664 S__append_range_to_invlist(pTHX_ SV* const invlist,
8665                                  const UV start, const UV end)
8666 {
8667    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8668     * the end of the inversion list.  The range must be above any existing
8669     * ones. */
8670
8671     UV* array;
8672     UV max = invlist_max(invlist);
8673     UV len = _invlist_len(invlist);
8674     bool offset;
8675
8676     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8677
8678     if (len == 0) { /* Empty lists must be initialized */
8679         offset = start != 0;
8680         array = _invlist_array_init(invlist, ! offset);
8681     }
8682     else {
8683         /* Here, the existing list is non-empty. The current max entry in the
8684          * list is generally the first value not in the set, except when the
8685          * set extends to the end of permissible values, in which case it is
8686          * the first entry in that final set, and so this call is an attempt to
8687          * append out-of-order */
8688
8689         UV final_element = len - 1;
8690         array = invlist_array(invlist);
8691         if (   array[final_element] > start
8692             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8693         {
8694             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8695                      array[final_element], start,
8696                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8697         }
8698
8699         /* Here, it is a legal append.  If the new range begins 1 above the end
8700          * of the range below it, it is extending the range below it, so the
8701          * new first value not in the set is one greater than the newly
8702          * extended range.  */
8703         offset = *get_invlist_offset_addr(invlist);
8704         if (array[final_element] == start) {
8705             if (end != UV_MAX) {
8706                 array[final_element] = end + 1;
8707             }
8708             else {
8709                 /* But if the end is the maximum representable on the machine,
8710                  * assume that infinity was actually what was meant.  Just let
8711                  * the range that this would extend to have no end */
8712                 invlist_set_len(invlist, len - 1, offset);
8713             }
8714             return;
8715         }
8716     }
8717
8718     /* Here the new range doesn't extend any existing set.  Add it */
8719
8720     len += 2;   /* Includes an element each for the start and end of range */
8721
8722     /* If wll overflow the existing space, extend, which may cause the array to
8723      * be moved */
8724     if (max < len) {
8725         invlist_extend(invlist, len);
8726
8727         /* Have to set len here to avoid assert failure in invlist_array() */
8728         invlist_set_len(invlist, len, offset);
8729
8730         array = invlist_array(invlist);
8731     }
8732     else {
8733         invlist_set_len(invlist, len, offset);
8734     }
8735
8736     /* The next item on the list starts the range, the one after that is
8737      * one past the new range.  */
8738     array[len - 2] = start;
8739     if (end != UV_MAX) {
8740         array[len - 1] = end + 1;
8741     }
8742     else {
8743         /* But if the end is the maximum representable on the machine, just let
8744          * the range have no end */
8745         invlist_set_len(invlist, len - 1, offset);
8746     }
8747 }
8748
8749 SSize_t
8750 Perl__invlist_search(SV* const invlist, const UV cp)
8751 {
8752     /* Searches the inversion list for the entry that contains the input code
8753      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8754      * return value is the index into the list's array of the range that
8755      * contains <cp>, that is, 'i' such that
8756      *  array[i] <= cp < array[i+1]
8757      */
8758
8759     IV low = 0;
8760     IV mid;
8761     IV high = _invlist_len(invlist);
8762     const IV highest_element = high - 1;
8763     const UV* array;
8764
8765     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8766
8767     /* If list is empty, return failure. */
8768     if (high == 0) {
8769         return -1;
8770     }
8771
8772     /* (We can't get the array unless we know the list is non-empty) */
8773     array = invlist_array(invlist);
8774
8775     mid = invlist_previous_index(invlist);
8776     assert(mid >=0);
8777     if (mid > highest_element) {
8778         mid = highest_element;
8779     }
8780
8781     /* <mid> contains the cache of the result of the previous call to this
8782      * function (0 the first time).  See if this call is for the same result,
8783      * or if it is for mid-1.  This is under the theory that calls to this
8784      * function will often be for related code points that are near each other.
8785      * And benchmarks show that caching gives better results.  We also test
8786      * here if the code point is within the bounds of the list.  These tests
8787      * replace others that would have had to be made anyway to make sure that
8788      * the array bounds were not exceeded, and these give us extra information
8789      * at the same time */
8790     if (cp >= array[mid]) {
8791         if (cp >= array[highest_element]) {
8792             return highest_element;
8793         }
8794
8795         /* Here, array[mid] <= cp < array[highest_element].  This means that
8796          * the final element is not the answer, so can exclude it; it also
8797          * means that <mid> is not the final element, so can refer to 'mid + 1'
8798          * safely */
8799         if (cp < array[mid + 1]) {
8800             return mid;
8801         }
8802         high--;
8803         low = mid + 1;
8804     }
8805     else { /* cp < aray[mid] */
8806         if (cp < array[0]) { /* Fail if outside the array */
8807             return -1;
8808         }
8809         high = mid;
8810         if (cp >= array[mid - 1]) {
8811             goto found_entry;
8812         }
8813     }
8814
8815     /* Binary search.  What we are looking for is <i> such that
8816      *  array[i] <= cp < array[i+1]
8817      * The loop below converges on the i+1.  Note that there may not be an
8818      * (i+1)th element in the array, and things work nonetheless */
8819     while (low < high) {
8820         mid = (low + high) / 2;
8821         assert(mid <= highest_element);
8822         if (array[mid] <= cp) { /* cp >= array[mid] */
8823             low = mid + 1;
8824
8825             /* We could do this extra test to exit the loop early.
8826             if (cp < array[low]) {
8827                 return mid;
8828             }
8829             */
8830         }
8831         else { /* cp < array[mid] */
8832             high = mid;
8833         }
8834     }
8835
8836   found_entry:
8837     high--;
8838     invlist_set_previous_index(invlist, high);
8839     return high;
8840 }
8841
8842 void
8843 Perl__invlist_populate_swatch(SV* const invlist,
8844                               const UV start, const UV end, U8* swatch)
8845 {
8846     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8847      * but is used when the swash has an inversion list.  This makes this much
8848      * faster, as it uses a binary search instead of a linear one.  This is
8849      * intimately tied to that function, and perhaps should be in utf8.c,
8850      * except it is intimately tied to inversion lists as well.  It assumes
8851      * that <swatch> is all 0's on input */
8852
8853     UV current = start;
8854     const IV len = _invlist_len(invlist);
8855     IV i;
8856     const UV * array;
8857
8858     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8859
8860     if (len == 0) { /* Empty inversion list */
8861         return;
8862     }
8863
8864     array = invlist_array(invlist);
8865
8866     /* Find which element it is */
8867     i = _invlist_search(invlist, start);
8868
8869     /* We populate from <start> to <end> */
8870     while (current < end) {
8871         UV upper;
8872
8873         /* The inversion list gives the results for every possible code point
8874          * after the first one in the list.  Only those ranges whose index is
8875          * even are ones that the inversion list matches.  For the odd ones,
8876          * and if the initial code point is not in the list, we have to skip
8877          * forward to the next element */
8878         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8879             i++;
8880             if (i >= len) { /* Finished if beyond the end of the array */
8881                 return;
8882             }
8883             current = array[i];
8884             if (current >= end) {   /* Finished if beyond the end of what we
8885                                        are populating */
8886                 if (LIKELY(end < UV_MAX)) {
8887                     return;
8888                 }
8889
8890                 /* We get here when the upper bound is the maximum
8891                  * representable on the machine, and we are looking for just
8892                  * that code point.  Have to special case it */
8893                 i = len;
8894                 goto join_end_of_list;
8895             }
8896         }
8897         assert(current >= start);
8898
8899         /* The current range ends one below the next one, except don't go past
8900          * <end> */
8901         i++;
8902         upper = (i < len && array[i] < end) ? array[i] : end;
8903
8904         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8905          * for each code point in it */
8906         for (; current < upper; current++) {
8907             const STRLEN offset = (STRLEN)(current - start);
8908             swatch[offset >> 3] |= 1 << (offset & 7);
8909         }
8910
8911       join_end_of_list:
8912
8913         /* Quit if at the end of the list */
8914         if (i >= len) {
8915
8916             /* But first, have to deal with the highest possible code point on
8917              * the platform.  The previous code assumes that <end> is one
8918              * beyond where we want to populate, but that is impossible at the
8919              * platform's infinity, so have to handle it specially */
8920             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8921             {
8922                 const STRLEN offset = (STRLEN)(end - start);
8923                 swatch[offset >> 3] |= 1 << (offset & 7);
8924             }
8925             return;
8926         }
8927
8928         /* Advance to the next range, which will be for code points not in the
8929          * inversion list */
8930         current = array[i];
8931     }
8932
8933     return;
8934 }
8935
8936 void
8937 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8938                                          const bool complement_b, SV** output)
8939 {
8940     /* Take the union of two inversion lists and point <output> to it.  *output
8941      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8942      * the reference count to that list will be decremented if not already a
8943      * temporary (mortal); otherwise just its contents will be modified to be
8944      * the union.  The first list, <a>, may be NULL, in which case a copy of
8945      * the second list is returned.  If <complement_b> is TRUE, the union is
8946      * taken of the complement (inversion) of <b> instead of b itself.
8947      *
8948      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8949      * Richard Gillam, published by Addison-Wesley, and explained at some
8950      * length there.  The preface says to incorporate its examples into your
8951      * code at your own risk.
8952      *
8953      * The algorithm is like a merge sort. */
8954
8955     const UV* array_a;    /* a's array */
8956     const UV* array_b;
8957     UV len_a;       /* length of a's array */
8958     UV len_b;
8959
8960     SV* u;                      /* the resulting union */
8961     UV* array_u;
8962     UV len_u = 0;
8963
8964     UV i_a = 0;             /* current index into a's array */
8965     UV i_b = 0;
8966     UV i_u = 0;
8967
8968     /* running count, as explained in the algorithm source book; items are
8969      * stopped accumulating and are output when the count changes to/from 0.
8970      * The count is incremented when we start a range that's in an input's set,
8971      * and decremented when we start a range that's not in a set.  So this
8972      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
8973      * and hence nothing goes into the union; 1, just one of the inputs is in
8974      * its set (and its current range gets added to the union); and 2 when both
8975      * inputs are in their sets.  */
8976     UV count = 0;
8977
8978     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8979     assert(a != b);
8980
8981     len_b = _invlist_len(b);
8982     if (len_b == 0) {
8983
8984         /* Here, 'b' is empty.  If the output is the complement of 'b', the
8985          * union is all possible code points, and we need not even look at 'a'.
8986          * It's easiest to create a new inversion list that matches everything.
8987          * */
8988         if (complement_b) {
8989             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
8990
8991             /* If the output didn't exist, just point it at the new list */
8992             if (*output == NULL) {
8993                 *output = everything;
8994                 return;
8995             }
8996
8997             /* Otherwise, replace its contents with the new list */
8998             invlist_replace_list_destroys_src(*output, everything);
8999             SvREFCNT_dec_NN(everything);
9000             return;
9001         }
9002
9003         /* Here, we don't want the complement of 'b', and since it is empty,
9004          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9005          * output will be empty */
9006
9007         if (a == NULL) {
9008             *output = _new_invlist(0);
9009             return;
9010         }
9011
9012         if (_invlist_len(a) == 0) {
9013             invlist_clear(*output);
9014             return;
9015         }
9016
9017         /* Here, 'a' is not empty, and entirely determines the union.  If the
9018          * output is not to overwrite 'b', we can just return 'a'. */
9019         if (*output != b) {
9020
9021             /* If the output is to overwrite 'a', we have a no-op, as it's
9022              * already in 'a' */
9023             if (*output == a) {
9024                 return;
9025             }
9026
9027             /* But otherwise we have to copy 'a' to the output */
9028             *output = invlist_clone(a);
9029             return;
9030         }
9031
9032         /* Here, 'b' is to be overwritten by the output, which will be 'a' */
9033         u = invlist_clone(a);
9034         invlist_replace_list_destroys_src(*output, u);
9035         SvREFCNT_dec_NN(u);
9036
9037         return;
9038     }
9039
9040     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9041
9042         /* Here, 'a' is empty (and b is not).  That means the union will come
9043          * entirely from 'b'.  If the output is not to overwrite 'a', we can
9044          * just return what's in 'b'.  */
9045         if (*output != a) {
9046
9047             /* If the output is to overwrite 'b', it's already in 'b', but
9048              * otherwise we have to copy 'b' to the output */
9049             if (*output != b) {
9050                 *output = invlist_clone(b);
9051             }
9052
9053             /* And if the output is to be the inversion of 'b', do that */
9054             if (complement_b) {
9055                 _invlist_invert(*output);
9056             }
9057
9058             return;
9059         }
9060
9061         /* Here, 'a', which is empty or even NULL, is to be overwritten by the
9062          * output, which will either be 'b' or the complement of 'b' */
9063
9064         if (a == NULL) {
9065             *output = invlist_clone(b);
9066         }
9067         else {
9068             u = invlist_clone(b);
9069             invlist_replace_list_destroys_src(*output, u);
9070             SvREFCNT_dec_NN(u);
9071         }
9072
9073         if (complement_b) {
9074             _invlist_invert(*output);
9075         }
9076
9077         return;
9078     }
9079
9080     /* Here both lists exist and are non-empty */
9081     array_a = invlist_array(a);
9082     array_b = invlist_array(b);
9083
9084     /* If are to take the union of 'a' with the complement of b, set it
9085      * up so are looking at b's complement. */
9086     if (complement_b) {
9087
9088         /* To complement, we invert: if the first element is 0, remove it.  To
9089          * do this, we just pretend the array starts one later */
9090         if (array_b[0] == 0) {
9091             array_b++;
9092             len_b--;
9093         }
9094         else {
9095
9096             /* But if the first element is not zero, we pretend the list starts
9097              * at the 0 that is always stored immediately before the array. */
9098             array_b--;
9099             len_b++;
9100         }
9101     }
9102
9103     /* Size the union for the worst case: that the sets are completely
9104      * disjoint */
9105     u = _new_invlist(len_a + len_b);
9106
9107     /* Will contain U+0000 if either component does */
9108     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9109                                       || (len_b > 0 && array_b[0] == 0));
9110
9111     /* Go through each input list item by item, stopping when exhausted one of
9112      * them */
9113     while (i_a < len_a && i_b < len_b) {
9114         UV cp;      /* The element to potentially add to the union's array */
9115         bool cp_in_set;   /* is it in the the input list's set or not */
9116
9117         /* We need to take one or the other of the two inputs for the union.
9118          * Since we are merging two sorted lists, we take the smaller of the
9119          * next items.  In case of a tie, we take first the one that is in its
9120          * set.  If we first took the one not in its set, it would decrement
9121          * the count, possibly to 0 which would cause it to be output as ending
9122          * the range, and the next time through we would take the same number,
9123          * and output it again as beginning the next range.  By doing it the
9124          * opposite way, there is no possibility that the count will be
9125          * momentarily decremented to 0, and thus the two adjoining ranges will
9126          * be seamlessly merged.  (In a tie and both are in the set or both not
9127          * in the set, it doesn't matter which we take first.) */
9128         if (       array_a[i_a] < array_b[i_b]
9129             || (   array_a[i_a] == array_b[i_b]
9130                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9131         {
9132             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9133             cp = array_a[i_a++];
9134         }
9135         else {
9136             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9137             cp = array_b[i_b++];
9138         }
9139
9140         /* Here, have chosen which of the two inputs to look at.  Only output
9141          * if the running count changes to/from 0, which marks the
9142          * beginning/end of a range that's in the set */
9143         if (cp_in_set) {
9144             if (count == 0) {
9145                 array_u[i_u++] = cp;
9146             }
9147             count++;
9148         }
9149         else {
9150             count--;
9151             if (count == 0) {
9152                 array_u[i_u++] = cp;
9153             }
9154         }
9155     }
9156
9157
9158     /* The loop above increments the index into exactly one of the input lists
9159      * each iteration, and ends when either index gets to its list end.  That
9160      * means the other index is lower than its end, and so something is
9161      * remaining in that one.  We decrement 'count', as explained below, if
9162      * that list is in its set.  (i_a and i_b each currently index the element
9163      * beyond the one we care about.) */
9164     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9165         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9166     {
9167         count--;
9168     }
9169
9170     /* Above we decremented 'count' if the list that had unexamined elements in
9171      * it was in its set.  This has made it so that 'count' being non-zero
9172      * means there isn't anything left to output; and 'count' equal to 0 means
9173      * that what is left to output is precisely that which is left in the
9174      * non-exhausted input list.
9175      *
9176      * To see why, note first that the exhausted input obviously has nothing
9177      * left to add to the union.  If it was in its set at its end, that means
9178      * the set extends from here to the platform's infinity, and hence so does
9179      * the union and the non-exhausted set is irrelevant.  The exhausted set
9180      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9181      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9182      * 'count' remains at 1.  This is consistent with the decremented 'count'
9183      * != 0 meaning there's nothing left to add to the union.
9184      *
9185      * But if the exhausted input wasn't in its set, it contributed 0 to
9186      * 'count', and the rest of the union will be whatever the other input is.
9187      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9188      * otherwise it gets decremented to 0.  This is consistent with 'count'
9189      * == 0 meaning the remainder of the union is whatever is left in the
9190      * non-exhausted list. */
9191     if (count != 0) {
9192         len_u = i_u;
9193     }
9194     else {
9195         IV copy_count = len_a - i_a;
9196         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9197             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9198         }
9199         else { /* The non-exhausted input is b */
9200             copy_count = len_b - i_b;
9201             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9202         }
9203         len_u = i_u + copy_count;
9204     }
9205
9206     /* Set the result to the final length, which can change the pointer to
9207      * array_u, so re-find it.  (Note that it is unlikely that this will
9208      * change, as we are shrinking the space, not enlarging it) */
9209     if (len_u != _invlist_len(u)) {
9210         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9211         invlist_trim(u);
9212         array_u = invlist_array(u);
9213     }
9214
9215     /* If the output is not to overwrite either of the inputs, just return the
9216      * calculated union */
9217     if (a != *output && b != *output) {
9218         *output = u;
9219     }
9220     else {
9221         /*  Here, the output is to be the same as one of the input scalars,
9222          *  hence replacing it.  The simple thing to do is to free the input
9223          *  scalar, making it instead be the output one.  But experience has
9224          *  shown [perl #127392] that if the input is a mortal, we can get a
9225          *  huge build-up of these during regex compilation before they get
9226          *  freed.  So for that case, replace just the input's interior with
9227          *  the union's, and then free the union */
9228
9229         assert(! invlist_is_iterating(*output));
9230
9231         if (! SvTEMP(*output)) {
9232             SvREFCNT_dec_NN(*output);
9233             *output = u;
9234         }
9235         else {
9236             invlist_replace_list_destroys_src(*output, u);
9237             SvREFCNT_dec_NN(u);
9238         }
9239     }
9240
9241     return;
9242 }
9243
9244 void
9245 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9246                                                const bool complement_b, SV** i)
9247 {
9248     /* Take the intersection of two inversion lists and point <i> to it.  *i
9249      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
9250      * the reference count to that list will be decremented if not already a
9251      * temporary (mortal); otherwise just its contents will be modified to be
9252      * the intersection.  The first list, <a>, may be NULL, in which case an
9253      * empty list is returned.  If <complement_b> is TRUE, the result will be
9254      * the intersection of <a> and the complement (or inversion) of <b> instead
9255      * of <b> directly.
9256      *
9257      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9258      * Richard Gillam, published by Addison-Wesley, and explained at some
9259      * length there.  The preface says to incorporate its examples into your
9260      * code at your own risk.  In fact, it had bugs
9261      *
9262      * The algorithm is like a merge sort, and is essentially the same as the
9263      * union above
9264      */
9265
9266     const UV* array_a;          /* a's array */
9267     const UV* array_b;
9268     UV len_a;   /* length of a's array */
9269     UV len_b;
9270
9271     SV* r;                   /* the resulting intersection */
9272     UV* array_r;
9273     UV len_r = 0;
9274
9275     UV i_a = 0;             /* current index into a's array */
9276     UV i_b = 0;
9277     UV i_r = 0;
9278
9279     /* running count of how many of the two inputs are postitioned at ranges
9280      * that are in their sets.  As explained in the algorithm source book,
9281      * items are stopped accumulating and are output when the count changes
9282      * to/from 2.  The count is incremented when we start a range that's in an
9283      * input's set, and decremented when we start a range that's not in a set.
9284      * Only when it is 2 are we in the intersection. */
9285     UV count = 0;
9286
9287     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9288     assert(a != b);
9289
9290     /* Special case if either one is empty */
9291     len_a = (a == NULL) ? 0 : _invlist_len(a);
9292     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9293         if (len_a != 0 && complement_b) {
9294
9295             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9296              * must be empty.  Here, also we are using 'b's complement, which
9297              * hence must be every possible code point.  Thus the intersection
9298              * is simply 'a'. */
9299
9300             if (*i == a) {  /* No-op */
9301                 return;
9302             }
9303
9304             /* If not overwriting either input, just make a copy of 'a' */
9305             if (*i != b) {
9306                 *i = invlist_clone(a);
9307                 return;
9308             }
9309
9310             /* Here we are overwriting 'b' with 'a's contents */
9311             r = invlist_clone(a);
9312             invlist_replace_list_destroys_src(*i, r);
9313             SvREFCNT_dec_NN(r);
9314             return;
9315         }
9316
9317         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9318          * intersection must be empty */
9319         if (*i == NULL) {
9320             *i = _new_invlist(0);
9321             return;
9322         }
9323
9324         invlist_clear(*i);
9325         return;
9326     }
9327
9328     /* Here both lists exist and are non-empty */
9329     array_a = invlist_array(a);
9330     array_b = invlist_array(b);
9331
9332     /* If are to take the intersection of 'a' with the complement of b, set it
9333      * up so are looking at b's complement. */
9334     if (complement_b) {
9335
9336         /* To complement, we invert: if the first element is 0, remove it.  To
9337          * do this, we just pretend the array starts one later */
9338         if (array_b[0] == 0) {
9339             array_b++;
9340             len_b--;
9341         }
9342         else {
9343
9344             /* But if the first element is not zero, we pretend the list starts
9345              * at the 0 that is always stored immediately before the array. */
9346             array_b--;
9347             len_b++;
9348         }
9349     }
9350
9351     /* Size the intersection for the worst case: that the intersection ends up
9352      * fragmenting everything to be completely disjoint */
9353     r= _new_invlist(len_a + len_b);
9354
9355     /* Will contain U+0000 iff both components do */
9356     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9357                                      && len_b > 0 && array_b[0] == 0);
9358
9359     /* Go through each list item by item, stopping when exhausted one of
9360      * them */
9361     while (i_a < len_a && i_b < len_b) {
9362         UV cp;      /* The element to potentially add to the intersection's
9363                        array */
9364         bool cp_in_set; /* Is it in the input list's set or not */
9365
9366         /* We need to take one or the other of the two inputs for the
9367          * intersection.  Since we are merging two sorted lists, we take the
9368          * smaller of the next items.  In case of a tie, we take first the one
9369          * that is not in its set (a difference from the union algorithm).  If
9370          * we first took the one in its set, it would increment the count,
9371          * possibly to 2 which would cause it to be output as starting a range
9372          * in the intersection, and the next time through we would take that
9373          * same number, and output it again as ending the set.  By doing the
9374          * opposite of this, there is no possibility that the count will be
9375          * momentarily incremented to 2.  (In a tie and both are in the set or
9376          * both not in the set, it doesn't matter which we take first.) */
9377         if (       array_a[i_a] < array_b[i_b]
9378             || (   array_a[i_a] == array_b[i_b]
9379                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9380         {
9381             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9382             cp = array_a[i_a++];
9383         }
9384         else {
9385             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9386             cp= array_b[i_b++];
9387         }
9388
9389         /* Here, have chosen which of the two inputs to look at.  Only output
9390          * if the running count changes to/from 2, which marks the
9391          * beginning/end of a range that's in the intersection */
9392         if (cp_in_set) {
9393             count++;
9394             if (count == 2) {
9395                 array_r[i_r++] = cp;
9396             }
9397         }
9398         else {
9399             if (count == 2) {
9400                 array_r[i_r++] = cp;
9401             }
9402             count--;
9403         }
9404
9405     }
9406
9407     /* The loop above increments the index into exactly one of the input lists
9408      * each iteration, and ends when either index gets to its list end.  That
9409      * means the other index is lower than its end, and so something is
9410      * remaining in that one.  We increment 'count', as explained below, if the
9411      * exhausted list was in its set.  (i_a and i_b each currently index the
9412      * element beyond the one we care about.) */
9413     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9414         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9415     {
9416         count++;
9417     }
9418
9419     /* Above we incremented 'count' if the exhausted list was in its set.  This
9420      * has made it so that 'count' being below 2 means there is nothing left to
9421      * output; otheriwse what's left to add to the intersection is precisely
9422      * that which is left in the non-exhausted input list.
9423      *
9424      * To see why, note first that the exhausted input obviously has nothing
9425      * left to affect the intersection.  If it was in its set at its end, that
9426      * means the set extends from here to the platform's infinity, and hence
9427      * anything in the non-exhausted's list will be in the intersection, and
9428      * anything not in it won't be.  Hence, the rest of the intersection is
9429      * precisely what's in the non-exhausted list  The exhausted set also
9430      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
9431      * it means 'count' is now at least 2.  This is consistent with the
9432      * incremented 'count' being >= 2 means to add the non-exhausted list to
9433      * the intersection.
9434      *
9435      * But if the exhausted input wasn't in its set, it contributed 0 to
9436      * 'count', and the intersection can't include anything further; the
9437      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
9438      * incremented.  This is consistent with 'count' being < 2 meaning nothing
9439      * further to add to the intersection. */
9440     if (count < 2) { /* Nothing left to put in the intersection. */
9441         len_r = i_r;
9442     }
9443     else { /* copy the non-exhausted list, unchanged. */
9444         IV copy_count = len_a - i_a;
9445         if (copy_count > 0) {   /* a is the one with stuff left */
9446             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9447         }
9448         else {  /* b is the one with stuff left */
9449             copy_count = len_b - i_b;
9450             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9451         }
9452         len_r = i_r + copy_count;
9453     }
9454
9455     /* Set the result to the final length, which can change the pointer to
9456      * array_r, so re-find it.  (Note that it is unlikely that this will
9457      * change, as we are shrinking the space, not enlarging it) */
9458     if (len_r != _invlist_len(r)) {
9459         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9460         invlist_trim(r);
9461         array_r = invlist_array(r);
9462     }
9463
9464     /* Finish outputting any remaining */
9465     if (count >= 2) { /* At most one will have a non-zero copy count */
9466         IV copy_count;
9467         if ((copy_count = len_a - i_a) > 0) {
9468             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9469         }
9470         else if ((copy_count = len_b - i_b) > 0) {
9471             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9472         }
9473     }
9474
9475     /* If the output is not to overwrite either of the inputs, just return the
9476      * calculated intersection */
9477     if (a != *i && b != *i) {
9478         *i = r;
9479     }
9480     else {
9481         /*  Here, the output is to be the same as one of the input scalars,
9482          *  hence replacing it.  The simple thing to do is to free the input
9483          *  scalar, making it instead be the output one.  But experience has
9484          *  shown [perl #127392] that if the input is a mortal, we can get a
9485          *  huge build-up of these during regex compilation before they get
9486          *  freed.  So for that case, replace just the input's interior with
9487          *  the output's, and then free the output.  A short-cut in this case
9488          *  is if the output is empty, we can just set the input to be empty */
9489
9490         assert(! invlist_is_iterating(*i));
9491
9492         if (! SvTEMP(*i)) {
9493             SvREFCNT_dec_NN(*i);
9494             *i = r;
9495         }
9496         else {
9497             if (len_r) {
9498                 invlist_replace_list_destroys_src(*i, r);
9499             }
9500             else {
9501                 invlist_clear(*i);
9502             }
9503             SvREFCNT_dec_NN(r);
9504         }
9505     }
9506
9507     return;
9508 }
9509
9510 SV*
9511 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9512 {
9513     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9514      * set.  A pointer to the inversion list is returned.  This may actually be
9515      * a new list, in which case the passed in one has been destroyed.  The
9516      * passed-in inversion list can be NULL, in which case a new one is created
9517      * with just the one range in it.  The new list is not necessarily
9518      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
9519      * result of this function.  The gain would not be large, and in many
9520      * cases, this is called multiple times on a single inversion list, so
9521      * anything freed may almost immediately be needed again.
9522      *
9523      * This used to mostly call the 'union' routine, but that is much more
9524      * heavyweight than really needed for a single range addition */
9525
9526     UV* array;              /* The array implementing the inversion list */
9527     UV len;                 /* How many elements in 'array' */
9528     SSize_t i_s;            /* index into the invlist array where 'start'
9529                                should go */
9530     SSize_t i_e = 0;        /* And the index where 'end' should go */
9531     UV cur_highest;         /* The highest code point in the inversion list
9532                                upon entry to this function */
9533
9534     /* This range becomes the whole inversion list if none already existed */
9535     if (invlist == NULL) {
9536         invlist = _new_invlist(2);
9537         _append_range_to_invlist(invlist, start, end);
9538         return invlist;
9539     }
9540
9541     /* Likewise, if the inversion list is currently empty */
9542     len = _invlist_len(invlist);
9543     if (len == 0) {
9544         _append_range_to_invlist(invlist, start, end);
9545         return invlist;
9546     }
9547
9548     /* Starting here, we have to know the internals of the list */
9549     array = invlist_array(invlist);
9550
9551     /* If the new range ends higher than the current highest ... */
9552     cur_highest = invlist_highest(invlist);
9553     if (end > cur_highest) {
9554
9555         /* If the whole range is higher, we can just append it */
9556         if (start > cur_highest) {
9557             _append_range_to_invlist(invlist, start, end);
9558             return invlist;
9559         }
9560
9561         /* Otherwise, add the portion that is higher ... */
9562         _append_range_to_invlist(invlist, cur_highest + 1, end);
9563
9564         /* ... and continue on below to handle the rest.  As a result of the
9565          * above append, we know that the index of the end of the range is the
9566          * final even numbered one of the array.  Recall that the final element
9567          * always starts a range that extends to infinity.  If that range is in
9568          * the set (meaning the set goes from here to infinity), it will be an
9569          * even index, but if it isn't in the set, it's odd, and the final
9570          * range in the set is one less, which is even. */
9571         if (end == UV_MAX) {
9572             i_e = len;
9573         }
9574         else {
9575             i_e = len - 2;
9576         }
9577     }
9578
9579     /* We have dealt with appending, now see about prepending.  If the new
9580      * range starts lower than the current lowest ... */
9581     if (start < array[0]) {
9582
9583         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
9584          * Let the union code handle it, rather than having to know the
9585          * trickiness in two code places.  */
9586         if (UNLIKELY(start == 0)) {
9587             SV* range_invlist;
9588
9589             range_invlist = _new_invlist(2);
9590             _append_range_to_invlist(range_invlist, start, end);
9591
9592             _invlist_union(invlist, range_invlist, &invlist);
9593
9594             SvREFCNT_dec_NN(range_invlist);
9595
9596             return invlist;
9597         }
9598
9599         /* If the whole new range comes before the first entry, and doesn't
9600          * extend it, we have to insert it as an additional range */
9601         if (end < array[0] - 1) {
9602             i_s = i_e = -1;
9603             goto splice_in_new_range;
9604         }
9605
9606         /* Here the new range adjoins the existing first range, extending it
9607          * downwards. */
9608         array[0] = start;
9609
9610         /* And continue on below to handle the rest.  We know that the index of
9611          * the beginning of the range is the first one of the array */
9612         i_s = 0;
9613     }
9614     else { /* Not prepending any part of the new range to the existing list.
9615             * Find where in the list it should go.  This finds i_s, such that:
9616             *     invlist[i_s] <= start < array[i_s+1]
9617             */
9618         i_s = _invlist_search(invlist, start);
9619     }
9620
9621     /* At this point, any extending before the beginning of the inversion list
9622      * and/or after the end has been done.  This has made it so that, in the
9623      * code below, each endpoint of the new range is either in a range that is
9624      * in the set, or is in a gap between two ranges that are.  This means we
9625      * don't have to worry about exceeding the array bounds.
9626      *
9627      * Find where in the list the new range ends (but we can skip this if we
9628      * have already determined what it is, or if it will be the same as i_s,
9629      * which we already have computed) */
9630     if (i_e == 0) {
9631         i_e = (start == end)
9632               ? i_s
9633               : _invlist_search(invlist, end);
9634     }
9635
9636     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
9637      * is a range that goes to infinity there is no element at invlist[i_e+1],
9638      * so only the first relation holds. */
9639
9640     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
9641
9642         /* Here, the ranges on either side of the beginning of the new range
9643          * are in the set, and this range starts in the gap between them.
9644          *
9645          * The new range extends the range above it downwards if the new range
9646          * ends at or above that range's start */
9647         const bool extends_the_range_above = (   end == UV_MAX
9648                                               || end + 1 >= array[i_s+1]);
9649
9650         /* The new range extends the range below it upwards if it begins just
9651          * after where that range ends */
9652         if (start == array[i_s]) {
9653
9654             /* If the new range fills the entire gap between the other ranges,
9655              * they will get merged together.  Other ranges may also get
9656              * merged, depending on how many of them the new range spans.  In
9657              * the general case, we do the merge later, just once, after we
9658              * figure out how many to merge.  But in the case where the new
9659              * range exactly spans just this one gap (possibly extending into
9660              * the one above), we do the merge here, and an early exit.  This
9661              * is done here to avoid having to special case later. */
9662             if (i_e - i_s <= 1) {
9663
9664                 /* If i_e - i_s == 1, it means that the new range terminates
9665                  * within the range above, and hence 'extends_the_range_above'
9666                  * must be true.  (If the range above it extends to infinity,
9667                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
9668                  * will be 0, so no harm done.) */
9669                 if (extends_the_range_above) {
9670                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
9671                     invlist_set_len(invlist,
9672                                     len - 2,
9673                                     *(get_invlist_offset_addr(invlist)));
9674                     return invlist;
9675                 }
9676
9677                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
9678                  * to the same range, and below we are about to decrement i_s
9679                  * */
9680                 i_e--;
9681             }
9682
9683             /* Here, the new range is adjacent to the one below.  (It may also
9684              * span beyond the range above, but that will get resolved later.)
9685              * Extend the range below to include this one. */
9686             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
9687             i_s--;
9688             start = array[i_s];
9689         }
9690         else if (extends_the_range_above) {
9691
9692             /* Here the new range only extends the range above it, but not the
9693              * one below.  It merges with the one above.  Again, we keep i_e
9694              * and i_s in sync if they point to the same range */
9695             if (i_e == i_s) {
9696                 i_e++;
9697             }
9698             i_s++;
9699             array[i_s] = start;
9700         }
9701     }
9702
9703     /* Here, we've dealt with the new range start extending any adjoining
9704      * existing ranges.
9705      *
9706      * If the new range extends to infinity, it is now the final one,
9707      * regardless of what was there before */
9708     if (UNLIKELY(end == UV_MAX)) {
9709         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
9710         return invlist;
9711     }
9712
9713     /* If i_e started as == i_s, it has also been dealt with,
9714      * and been updated to the new i_s, which will fail the following if */
9715     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
9716
9717         /* Here, the ranges on either side of the end of the new range are in
9718          * the set, and this range ends in the gap between them.
9719          *
9720          * If this range is adjacent to (hence extends) the range above it, it
9721          * becomes part of that range; likewise if it extends the range below,
9722          * it becomes part of that range */
9723         if (end + 1 == array[i_e+1]) {
9724             i_e++;
9725             array[i_e] = start;
9726         }
9727         else if (start <= array[i_e]) {
9728             array[i_e] = end + 1;
9729             i_e--;
9730         }
9731     }
9732
9733     if (i_s == i_e) {
9734
9735         /* If the range fits entirely in an existing range (as possibly already
9736          * extended above), it doesn't add anything new */
9737         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
9738             return invlist;
9739         }
9740
9741         /* Here, no part of the range is in the list.  Must add it.  It will
9742          * occupy 2 more slots */
9743       splice_in_new_range:
9744
9745         invlist_extend(invlist, len + 2);
9746         array = invlist_array(invlist);
9747         /* Move the rest of the array down two slots. Don't include any
9748          * trailing NUL */
9749         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
9750
9751         /* Do the actual splice */
9752         array[i_e+1] = start;
9753         array[i_e+2] = end + 1;
9754         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
9755         return invlist;
9756     }
9757
9758     /* Here the new range crossed the boundaries of a pre-existing range.  The
9759      * code above has adjusted things so that both ends are in ranges that are
9760      * in the set.  This means everything in between must also be in the set.
9761      * Just squash things together */
9762     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
9763     invlist_set_len(invlist,
9764                     len - i_e + i_s,
9765                     *(get_invlist_offset_addr(invlist)));
9766
9767     return invlist;
9768 }
9769
9770 SV*
9771 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9772                                  UV** other_elements_ptr)
9773 {
9774     /* Create and return an inversion list whose contents are to be populated
9775      * by the caller.  The caller gives the number of elements (in 'size') and
9776      * the very first element ('element0').  This function will set
9777      * '*other_elements_ptr' to an array of UVs, where the remaining elements
9778      * are to be placed.
9779      *
9780      * Obviously there is some trust involved that the caller will properly
9781      * fill in the other elements of the array.
9782      *
9783      * (The first element needs to be passed in, as the underlying code does
9784      * things differently depending on whether it is zero or non-zero) */
9785
9786     SV* invlist = _new_invlist(size);
9787     bool offset;
9788
9789     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9790
9791     invlist = add_cp_to_invlist(invlist, element0);
9792     offset = *get_invlist_offset_addr(invlist);
9793
9794     invlist_set_len(invlist, size, offset);
9795     *other_elements_ptr = invlist_array(invlist) + 1;
9796     return invlist;
9797 }
9798
9799 #endif
9800
9801 PERL_STATIC_INLINE SV*
9802 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9803     return _add_range_to_invlist(invlist, cp, cp);
9804 }
9805
9806 #ifndef PERL_IN_XSUB_RE
9807 void
9808 Perl__invlist_invert(pTHX_ SV* const invlist)
9809 {
9810     /* Complement the input inversion list.  This adds a 0 if the list didn't
9811      * have a zero; removes it otherwise.  As described above, the data
9812      * structure is set up so that this is very efficient */
9813
9814     PERL_ARGS_ASSERT__INVLIST_INVERT;
9815
9816     assert(! invlist_is_iterating(invlist));
9817
9818     /* The inverse of matching nothing is matching everything */
9819     if (_invlist_len(invlist) == 0) {
9820         _append_range_to_invlist(invlist, 0, UV_MAX);
9821         return;
9822     }
9823
9824     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9825 }
9826
9827 #endif
9828
9829 PERL_STATIC_INLINE SV*
9830 S_invlist_clone(pTHX_ SV* const invlist)
9831 {
9832
9833     /* Return a new inversion list that is a copy of the input one, which is
9834      * unchanged.  The new list will not be mortal even if the old one was. */
9835
9836     /* Need to allocate extra space to accommodate Perl's addition of a
9837      * trailing NUL to SvPV's, since it thinks they are always strings */
9838     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9839     STRLEN physical_length = SvCUR(invlist);
9840     bool offset = *(get_invlist_offset_addr(invlist));
9841
9842     PERL_ARGS_ASSERT_INVLIST_CLONE;
9843
9844     *(get_invlist_offset_addr(new_invlist)) = offset;
9845     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9846     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9847
9848     return new_invlist;
9849 }
9850
9851 PERL_STATIC_INLINE STRLEN*
9852 S_get_invlist_iter_addr(SV* invlist)
9853 {
9854     /* Return the address of the UV that contains the current iteration
9855      * position */
9856
9857     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9858
9859     assert(SvTYPE(invlist) == SVt_INVLIST);
9860
9861     return &(((XINVLIST*) SvANY(invlist))->iterator);
9862 }
9863
9864 PERL_STATIC_INLINE void
9865 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9866 {
9867     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9868
9869     *get_invlist_iter_addr(invlist) = 0;
9870 }
9871
9872 PERL_STATIC_INLINE void
9873 S_invlist_iterfinish(SV* invlist)
9874 {
9875     /* Terminate iterator for invlist.  This is to catch development errors.
9876      * Any iteration that is interrupted before completed should call this
9877      * function.  Functions that add code points anywhere else but to the end
9878      * of an inversion list assert that they are not in the middle of an
9879      * iteration.  If they were, the addition would make the iteration
9880      * problematical: if the iteration hadn't reached the place where things
9881      * were being added, it would be ok */
9882
9883     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9884
9885     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9886 }
9887
9888 STATIC bool
9889 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9890 {
9891     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9892      * This call sets in <*start> and <*end>, the next range in <invlist>.
9893      * Returns <TRUE> if successful and the next call will return the next
9894      * range; <FALSE> if was already at the end of the list.  If the latter,
9895      * <*start> and <*end> are unchanged, and the next call to this function
9896      * will start over at the beginning of the list */
9897
9898     STRLEN* pos = get_invlist_iter_addr(invlist);
9899     UV len = _invlist_len(invlist);
9900     UV *array;
9901
9902     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9903
9904     if (*pos >= len) {
9905         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9906         return FALSE;
9907     }
9908
9909     array = invlist_array(invlist);
9910
9911     *start = array[(*pos)++];
9912
9913     if (*pos >= len) {
9914         *end = UV_MAX;
9915     }
9916     else {
9917         *end = array[(*pos)++] - 1;
9918     }
9919
9920     return TRUE;
9921 }
9922
9923 PERL_STATIC_INLINE UV
9924 S_invlist_highest(SV* const invlist)
9925 {
9926     /* Returns the highest code point that matches an inversion list.  This API
9927      * has an ambiguity, as it returns 0 under either the highest is actually
9928      * 0, or if the list is empty.  If this distinction matters to you, check
9929      * for emptiness before calling this function */
9930
9931     UV len = _invlist_len(invlist);
9932     UV *array;
9933
9934     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9935
9936     if (len == 0) {
9937         return 0;
9938     }
9939
9940     array = invlist_array(invlist);
9941
9942     /* The last element in the array in the inversion list always starts a
9943      * range that goes to infinity.  That range may be for code points that are
9944      * matched in the inversion list, or it may be for ones that aren't
9945      * matched.  In the latter case, the highest code point in the set is one
9946      * less than the beginning of this range; otherwise it is the final element
9947      * of this range: infinity */
9948     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9949            ? UV_MAX
9950            : array[len - 1] - 1;
9951 }
9952
9953 STATIC SV *
9954 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
9955 {
9956     /* Get the contents of an inversion list into a string SV so that they can
9957      * be printed out.  If 'traditional_style' is TRUE, it uses the format
9958      * traditionally done for debug tracing; otherwise it uses a format
9959      * suitable for just copying to the output, with blanks between ranges and
9960      * a dash between range components */
9961
9962     UV start, end;
9963     SV* output;
9964     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
9965     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
9966
9967     if (traditional_style) {
9968         output = newSVpvs("\n");
9969     }
9970     else {
9971         output = newSVpvs("");
9972     }
9973
9974     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
9975
9976     assert(! invlist_is_iterating(invlist));
9977
9978     invlist_iterinit(invlist);
9979     while (invlist_iternext(invlist, &start, &end)) {
9980         if (end == UV_MAX) {
9981             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c",
9982                                           start, intra_range_delimiter,
9983                                                  inter_range_delimiter);
9984         }
9985         else if (end != start) {
9986             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c",
9987                                           start,
9988                                                    intra_range_delimiter,
9989                                                   end, inter_range_delimiter);
9990         }
9991         else {
9992             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c",
9993                                           start, inter_range_delimiter);
9994         }
9995     }
9996
9997     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
9998         SvCUR_set(output, SvCUR(output) - 1);
9999     }
10000
10001     return output;
10002 }
10003
10004 #ifndef PERL_IN_XSUB_RE
10005 void
10006 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10007                          const char * const indent, SV* const invlist)
10008 {
10009     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10010      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10011      * the string 'indent'.  The output looks like this:
10012          [0] 0x000A .. 0x000D
10013          [2] 0x0085
10014          [4] 0x2028 .. 0x2029
10015          [6] 0x3104 .. INFINITY
10016      * This means that the first range of code points matched by the list are
10017      * 0xA through 0xD; the second range contains only the single code point
10018      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10019      * are used to define each range (except if the final range extends to
10020      * infinity, only a single element is needed).  The array index of the
10021      * first element for the corresponding range is given in brackets. */
10022
10023     UV start, end;
10024     STRLEN count = 0;
10025
10026     PERL_ARGS_ASSERT__INVLIST_DUMP;
10027
10028     if (invlist_is_iterating(invlist)) {
10029         Perl_dump_indent(aTHX_ level, file,
10030              "%sCan't dump inversion list because is in middle of iterating\n",
10031              indent);
10032         return;
10033     }
10034
10035     invlist_iterinit(invlist);
10036     while (invlist_iternext(invlist, &start, &end)) {
10037         if (end == UV_MAX) {
10038             Perl_dump_indent(aTHX_ level, file,
10039                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
10040                                    indent, (UV)count, start);
10041         }
10042         else if (end != start) {
10043             Perl_dump_indent(aTHX_ level, file,
10044                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
10045                                 indent, (UV)count, start,         end);
10046         }
10047         else {
10048             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
10049                                             indent, (UV)count, start);
10050         }
10051         count += 2;
10052     }
10053 }
10054
10055 void
10056 Perl__load_PL_utf8_foldclosures (pTHX)
10057 {
10058     assert(! PL_utf8_foldclosures);
10059
10060     /* If the folds haven't been read in, call a fold function
10061      * to force that */
10062     if (! PL_utf8_tofold) {
10063         U8 dummy[UTF8_MAXBYTES_CASE+1];
10064
10065         /* This string is just a short named one above \xff */
10066         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
10067         assert(PL_utf8_tofold); /* Verify that worked */
10068     }
10069     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10070 }
10071 #endif
10072
10073 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10074 bool
10075 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10076 {
10077     /* Return a boolean as to if the two passed in inversion lists are
10078      * identical.  The final argument, if TRUE, says to take the complement of
10079      * the second inversion list before doing the comparison */
10080
10081     const UV* array_a = invlist_array(a);
10082     const UV* array_b = invlist_array(b);
10083     UV len_a = _invlist_len(a);
10084     UV len_b = _invlist_len(b);
10085
10086     UV i = 0;               /* current index into the arrays */
10087     bool retval = TRUE;     /* Assume are identical until proven otherwise */
10088
10089     PERL_ARGS_ASSERT__INVLISTEQ;
10090
10091     /* If are to compare 'a' with the complement of b, set it
10092      * up so are looking at b's complement. */
10093     if (complement_b) {
10094
10095         /* The complement of nothing is everything, so <a> would have to have
10096          * just one element, starting at zero (ending at infinity) */
10097         if (len_b == 0) {
10098             return (len_a == 1 && array_a[0] == 0);
10099         }
10100         else if (array_b[0] == 0) {
10101
10102             /* Otherwise, to complement, we invert.  Here, the first element is
10103              * 0, just remove it.  To do this, we just pretend the array starts
10104              * one later */
10105
10106             array_b++;
10107             len_b--;
10108         }
10109         else {
10110
10111             /* But if the first element is not zero, we pretend the list starts
10112              * at the 0 that is always stored immediately before the array. */
10113             array_b--;
10114             len_b++;
10115         }
10116     }
10117
10118     /* Make sure that the lengths are the same, as well as the final element
10119      * before looping through the remainder.  (Thus we test the length, final,
10120      * and first elements right off the bat) */
10121     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
10122         retval = FALSE;
10123     }
10124     else for (i = 0; i < len_a - 1; i++) {
10125         if (array_a[i] != array_b[i]) {
10126             retval = FALSE;
10127             break;
10128         }
10129     }
10130
10131     return retval;
10132 }
10133 #endif
10134
10135 /*
10136  * As best we can, determine the characters that can match the start of
10137  * the given EXACTF-ish node.
10138  *
10139  * Returns the invlist as a new SV*; it is the caller's responsibility to
10140  * call SvREFCNT_dec() when done with it.
10141  */
10142 STATIC SV*
10143 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10144 {
10145     const U8 * s = (U8*)STRING(node);
10146     SSize_t bytelen = STR_LEN(node);
10147     UV uc;
10148     /* Start out big enough for 2 separate code points */
10149     SV* invlist = _new_invlist(4);
10150
10151     PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
10152
10153     if (! UTF) {
10154         uc = *s;
10155
10156         /* We punt and assume can match anything if the node begins
10157          * with a multi-character fold.  Things are complicated.  For
10158          * example, /ffi/i could match any of:
10159          *  "\N{LATIN SMALL LIGATURE FFI}"
10160          *  "\N{LATIN SMALL LIGATURE FF}I"
10161          *  "F\N{LATIN SMALL LIGATURE FI}"
10162          *  plus several other things; and making sure we have all the
10163          *  possibilities is hard. */
10164         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10165             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10166         }
10167         else {
10168             /* Any Latin1 range character can potentially match any
10169              * other depending on the locale */
10170             if (OP(node) == EXACTFL) {
10171                 _invlist_union(invlist, PL_Latin1, &invlist);
10172             }
10173             else {
10174                 /* But otherwise, it matches at least itself.  We can
10175                  * quickly tell if it has a distinct fold, and if so,
10176                  * it matches that as well */
10177                 invlist = add_cp_to_invlist(invlist, uc);
10178                 if (IS_IN_SOME_FOLD_L1(uc))
10179                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10180             }
10181
10182             /* Some characters match above-Latin1 ones under /i.  This
10183              * is true of EXACTFL ones when the locale is UTF-8 */
10184             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10185                 && (! isASCII(uc) || (OP(node) != EXACTFA
10186                                     && OP(node) != EXACTFA_NO_TRIE)))
10187             {
10188                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10189             }
10190         }
10191     }
10192     else {  /* Pattern is UTF-8 */
10193         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10194         STRLEN foldlen = UTF8SKIP(s);
10195         const U8* e = s + bytelen;
10196         SV** listp;
10197
10198         uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10199
10200         /* The only code points that aren't folded in a UTF EXACTFish
10201          * node are are the problematic ones in EXACTFL nodes */
10202         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10203             /* We need to check for the possibility that this EXACTFL
10204              * node begins with a multi-char fold.  Therefore we fold
10205              * the first few characters of it so that we can make that
10206              * check */
10207             U8 *d = folded;
10208             int i;
10209
10210             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10211                 if (isASCII(*s)) {
10212                     *(d++) = (U8) toFOLD(*s);
10213                     s++;
10214                 }
10215                 else {
10216                     STRLEN len;
10217                     to_utf8_fold(s, d, &len);
10218                     d += len;
10219                     s += UTF8SKIP(s);
10220                 }
10221             }
10222
10223             /* And set up so the code below that looks in this folded
10224              * buffer instead of the node's string */
10225             e = d;
10226             foldlen = UTF8SKIP(folded);
10227             s = folded;
10228         }
10229
10230         /* When we reach here 's' points to the fold of the first
10231          * character(s) of the node; and 'e' points to far enough along
10232          * the folded string to be just past any possible multi-char
10233          * fold. 'foldlen' is the length in bytes of the first
10234          * character in 's'
10235          *
10236          * Unlike the non-UTF-8 case, the macro for determining if a
10237          * string is a multi-char fold requires all the characters to
10238          * already be folded.  This is because of all the complications
10239          * if not.  Note that they are folded anyway, except in EXACTFL
10240          * nodes.  Like the non-UTF case above, we punt if the node
10241          * begins with a multi-char fold  */
10242
10243         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10244             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10245         }
10246         else {  /* Single char fold */
10247
10248             /* It matches all the things that fold to it, which are
10249              * found in PL_utf8_foldclosures (including itself) */
10250             invlist = add_cp_to_invlist(invlist, uc);
10251             if (! PL_utf8_foldclosures)
10252                 _load_PL_utf8_foldclosures();
10253             if ((listp = hv_fetch(PL_utf8_foldclosures,
10254                                 (char *) s, foldlen, FALSE)))
10255             {
10256                 AV* list = (AV*) *listp;
10257                 IV k;
10258                 for (k = 0; k <= av_tindex_nomg(list); k++) {
10259                     SV** c_p = av_fetch(list, k, FALSE);
10260                     UV c;
10261                     assert(c_p);
10262
10263                     c = SvUV(*c_p);
10264
10265                     /* /aa doesn't allow folds between ASCII and non- */
10266                     if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
10267                         && isASCII(c) != isASCII(uc))
10268                     {
10269                         continue;
10270                     }
10271
10272                     invlist = add_cp_to_invlist(invlist, c);
10273                 }
10274             }
10275         }
10276     }
10277
10278     return invlist;
10279 }
10280
10281 #undef HEADER_LENGTH
10282 #undef TO_INTERNAL_SIZE
10283 #undef FROM_INTERNAL_SIZE
10284 #undef INVLIST_VERSION_ID
10285
10286 /* End of inversion list object */
10287
10288 STATIC void
10289 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10290 {
10291     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10292      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10293      * should point to the first flag; it is updated on output to point to the
10294      * final ')' or ':'.  There needs to be at least one flag, or this will
10295      * abort */
10296
10297     /* for (?g), (?gc), and (?o) warnings; warning
10298        about (?c) will warn about (?g) -- japhy    */
10299
10300 #define WASTED_O  0x01
10301 #define WASTED_G  0x02
10302 #define WASTED_C  0x04
10303 #define WASTED_GC (WASTED_G|WASTED_C)
10304     I32 wastedflags = 0x00;
10305     U32 posflags = 0, negflags = 0;
10306     U32 *flagsp = &posflags;
10307     char has_charset_modifier = '\0';
10308     regex_charset cs;
10309     bool has_use_defaults = FALSE;
10310     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10311     int x_mod_count = 0;
10312
10313     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10314
10315     /* '^' as an initial flag sets certain defaults */
10316     if (UCHARAT(RExC_parse) == '^') {
10317         RExC_parse++;
10318         has_use_defaults = TRUE;
10319         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10320         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
10321                                         ? REGEX_UNICODE_CHARSET
10322                                         : REGEX_DEPENDS_CHARSET);
10323     }
10324
10325     cs = get_regex_charset(RExC_flags);
10326     if (cs == REGEX_DEPENDS_CHARSET
10327         && (RExC_utf8 || RExC_uni_semantics))
10328     {
10329         cs = REGEX_UNICODE_CHARSET;
10330     }
10331
10332     while (RExC_parse < RExC_end) {
10333         /* && strchr("iogcmsx", *RExC_parse) */
10334         /* (?g), (?gc) and (?o) are useless here
10335            and must be globally applied -- japhy */
10336         switch (*RExC_parse) {
10337
10338             /* Code for the imsxn flags */
10339             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10340
10341             case LOCALE_PAT_MOD:
10342                 if (has_charset_modifier) {
10343                     goto excess_modifier;
10344                 }
10345                 else if (flagsp == &negflags) {
10346                     goto neg_modifier;
10347                 }
10348                 cs = REGEX_LOCALE_CHARSET;
10349                 has_charset_modifier = LOCALE_PAT_MOD;
10350                 break;
10351             case UNICODE_PAT_MOD:
10352                 if (has_charset_modifier) {
10353                     goto excess_modifier;
10354                 }
10355                 else if (flagsp == &negflags) {
10356                     goto neg_modifier;
10357                 }
10358                 cs = REGEX_UNICODE_CHARSET;
10359                 has_charset_modifier = UNICODE_PAT_MOD;
10360                 break;
10361             case ASCII_RESTRICT_PAT_MOD:
10362                 if (flagsp == &negflags) {
10363                     goto neg_modifier;
10364                 }
10365                 if (has_charset_modifier) {
10366                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10367                         goto excess_modifier;
10368                     }
10369                     /* Doubled modifier implies more restricted */
10370                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10371                 }
10372                 else {
10373                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10374                 }
10375                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10376                 break;
10377             case DEPENDS_PAT_MOD:
10378                 if (has_use_defaults) {
10379                     goto fail_modifiers;
10380                 }
10381                 else if (flagsp == &negflags) {
10382                     goto neg_modifier;
10383                 }
10384                 else if (has_charset_modifier) {
10385                     goto excess_modifier;
10386                 }
10387
10388                 /* The dual charset means unicode semantics if the
10389                  * pattern (or target, not known until runtime) are
10390                  * utf8, or something in the pattern indicates unicode
10391                  * semantics */
10392                 cs = (RExC_utf8 || RExC_uni_semantics)
10393                      ? REGEX_UNICODE_CHARSET
10394                      : REGEX_DEPENDS_CHARSET;
10395                 has_charset_modifier = DEPENDS_PAT_MOD;
10396                 break;
10397               excess_modifier:
10398                 RExC_parse++;
10399                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10400                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10401                 }
10402                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10403                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10404                                         *(RExC_parse - 1));
10405                 }
10406                 else {
10407                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10408                 }
10409                 NOT_REACHED; /*NOTREACHED*/
10410               neg_modifier:
10411                 RExC_parse++;
10412                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10413                                     *(RExC_parse - 1));
10414                 NOT_REACHED; /*NOTREACHED*/
10415             case ONCE_PAT_MOD: /* 'o' */
10416             case GLOBAL_PAT_MOD: /* 'g' */
10417                 if (PASS2 && ckWARN(WARN_REGEXP)) {
10418                     const I32 wflagbit = *RExC_parse == 'o'
10419                                          ? WASTED_O
10420                                          : WASTED_G;
10421                     if (! (wastedflags & wflagbit) ) {
10422                         wastedflags |= wflagbit;
10423                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10424                         vWARN5(
10425                             RExC_parse + 1,
10426                             "Useless (%s%c) - %suse /%c modifier",
10427                             flagsp == &negflags ? "?-" : "?",
10428                             *RExC_parse,
10429                             flagsp == &negflags ? "don't " : "",
10430                             *RExC_parse
10431                         );
10432                     }
10433                 }
10434                 break;
10435
10436             case CONTINUE_PAT_MOD: /* 'c' */
10437                 if (PASS2 && ckWARN(WARN_REGEXP)) {
10438                     if (! (wastedflags & WASTED_C) ) {
10439                         wastedflags |= WASTED_GC;
10440                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10441                         vWARN3(
10442                             RExC_parse + 1,
10443                             "Useless (%sc) - %suse /gc modifier",
10444                             flagsp == &negflags ? "?-" : "?",
10445                             flagsp == &negflags ? "don't " : ""
10446                         );
10447                     }
10448                 }
10449                 break;
10450             case KEEPCOPY_PAT_MOD: /* 'p' */
10451                 if (flagsp == &negflags) {
10452                     if (PASS2)
10453                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10454                 } else {
10455                     *flagsp |= RXf_PMf_KEEPCOPY;
10456                 }
10457                 break;
10458             case '-':
10459                 /* A flag is a default iff it is following a minus, so
10460                  * if there is a minus, it means will be trying to
10461                  * re-specify a default which is an error */
10462                 if (has_use_defaults || flagsp == &negflags) {
10463                     goto fail_modifiers;
10464                 }
10465                 flagsp = &negflags;
10466                 wastedflags = 0;  /* reset so (?g-c) warns twice */
10467                 break;
10468             case ':':
10469             case ')':
10470                 RExC_flags |= posflags;
10471                 RExC_flags &= ~negflags;
10472                 set_regex_charset(&RExC_flags, cs);
10473                 if (RExC_flags & RXf_PMf_FOLD) {
10474                     RExC_contains_i = 1;
10475                 }
10476
10477                 if (UNLIKELY((x_mod_count) > 1)) {
10478                     vFAIL("Only one /x regex modifier is allowed");
10479                 }
10480                 return;
10481                 /*NOTREACHED*/
10482             default:
10483               fail_modifiers:
10484                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10485                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10486                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
10487                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10488                 NOT_REACHED; /*NOTREACHED*/
10489         }
10490
10491         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10492     }
10493
10494     vFAIL("Sequence (?... not terminated");
10495 }
10496
10497 /*
10498  - reg - regular expression, i.e. main body or parenthesized thing
10499  *
10500  * Caller must absorb opening parenthesis.
10501  *
10502  * Combining parenthesis handling with the base level of regular expression
10503  * is a trifle forced, but the need to tie the tails of the branches to what
10504  * follows makes it hard to avoid.
10505  */
10506 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10507 #ifdef DEBUGGING
10508 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10509 #else
10510 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10511 #endif
10512
10513 PERL_STATIC_INLINE regnode *
10514 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10515                              I32 *flagp,
10516                              char * parse_start,
10517                              char ch
10518                       )
10519 {
10520     regnode *ret;
10521     char* name_start = RExC_parse;
10522     U32 num = 0;
10523     SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
10524                                             ? REG_RSN_RETURN_NULL
10525                                             : REG_RSN_RETURN_DATA);
10526     GET_RE_DEBUG_FLAGS_DECL;
10527
10528     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10529
10530     if (RExC_parse == name_start || *RExC_parse != ch) {
10531         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
10532         vFAIL2("Sequence %.3s... not terminated",parse_start);
10533     }
10534
10535     if (!SIZE_ONLY) {
10536         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10537         RExC_rxi->data->data[num]=(void*)sv_dat;
10538         SvREFCNT_inc_simple_void(sv_dat);
10539     }
10540     RExC_sawback = 1;
10541     ret = reganode(pRExC_state,
10542                    ((! FOLD)
10543                      ? NREF
10544                      : (ASCII_FOLD_RESTRICTED)
10545                        ? NREFFA
10546                        : (AT_LEAST_UNI_SEMANTICS)
10547                          ? NREFFU
10548                          : (LOC)
10549                            ? NREFFL
10550                            : NREFF),
10551                     num);
10552     *flagp |= HASWIDTH;
10553
10554     Set_Node_Offset(ret, parse_start+1);
10555     Set_Node_Cur_Length(ret, parse_start);
10556
10557     nextchar(pRExC_state);
10558     return ret;
10559 }
10560
10561 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
10562    flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
10563    needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
10564    upgraded to UTF-8.  Otherwise would only return NULL if regbranch() returns
10565    NULL, which cannot happen.  */
10566 STATIC regnode *
10567 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
10568     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
10569      * 2 is like 1, but indicates that nextchar() has been called to advance
10570      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
10571      * this flag alerts us to the need to check for that */
10572 {
10573     regnode *ret;               /* Will be the head of the group. */
10574     regnode *br;
10575     regnode *lastbr;
10576     regnode *ender = NULL;
10577     I32 parno = 0;
10578     I32 flags;
10579     U32 oregflags = RExC_flags;
10580     bool have_branch = 0;
10581     bool is_open = 0;
10582     I32 freeze_paren = 0;
10583     I32 after_freeze = 0;
10584     I32 num; /* numeric backreferences */
10585
10586     char * parse_start = RExC_parse; /* MJD */
10587     char * const oregcomp_parse = RExC_parse;
10588
10589     GET_RE_DEBUG_FLAGS_DECL;
10590
10591     PERL_ARGS_ASSERT_REG;
10592     DEBUG_PARSE("reg ");
10593
10594     *flagp = 0;                         /* Tentatively. */
10595
10596     /* Having this true makes it feasible to have a lot fewer tests for the
10597      * parse pointer being in scope.  For example, we can write
10598      *      while(isFOO(*RExC_parse)) RExC_parse++;
10599      * instead of
10600      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
10601      */
10602     assert(*RExC_end == '\0');
10603
10604     /* Make an OPEN node, if parenthesized. */
10605     if (paren) {
10606
10607         /* Under /x, space and comments can be gobbled up between the '(' and
10608          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
10609          * intervening space, as the sequence is a token, and a token should be
10610          * indivisible */
10611         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
10612
10613         if (RExC_parse >= RExC_end) {
10614             vFAIL("Unmatched (");
10615         }
10616
10617         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
10618             char *start_verb = RExC_parse + 1;
10619             STRLEN verb_len;
10620             char *start_arg = NULL;
10621             unsigned char op = 0;
10622             int arg_required = 0;
10623             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
10624
10625             if (has_intervening_patws) {
10626                 RExC_parse++;   /* past the '*' */
10627                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
10628             }
10629             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
10630                 if ( *RExC_parse == ':' ) {
10631                     start_arg = RExC_parse + 1;
10632                     break;
10633                 }
10634                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10635             }
10636             verb_len = RExC_parse - start_verb;
10637             if ( start_arg ) {
10638                 if (RExC_parse >= RExC_end) {
10639                     goto unterminated_verb_pattern;
10640                 }
10641                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10642                 while ( RExC_parse < RExC_end && *RExC_parse != ')' )
10643                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10644                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10645                   unterminated_verb_pattern:
10646                     vFAIL("Unterminated verb pattern argument");
10647                 if ( RExC_parse == start_arg )
10648                     start_arg = NULL;
10649             } else {
10650                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10651                     vFAIL("Unterminated verb pattern");
10652             }
10653
10654             /* Here, we know that RExC_parse < RExC_end */
10655
10656             switch ( *start_verb ) {
10657             case 'A':  /* (*ACCEPT) */
10658                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
10659                     op = ACCEPT;
10660                     internal_argval = RExC_nestroot;
10661                 }
10662                 break;
10663             case 'C':  /* (*COMMIT) */
10664                 if ( memEQs(start_verb,verb_len,"COMMIT") )
10665                     op = COMMIT;
10666                 break;
10667             case 'F':  /* (*FAIL) */
10668                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
10669                     op = OPFAIL;
10670                 }
10671                 break;
10672             case ':':  /* (*:NAME) */
10673             case 'M':  /* (*MARK:NAME) */
10674                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
10675                     op = MARKPOINT;
10676                     arg_required = 1;
10677                 }
10678                 break;
10679             case 'P':  /* (*PRUNE) */
10680                 if ( memEQs(start_verb,verb_len,"PRUNE") )
10681                     op = PRUNE;
10682                 break;
10683             case 'S':   /* (*SKIP) */
10684                 if ( memEQs(start_verb,verb_len,"SKIP") )
10685                     op = SKIP;
10686                 break;
10687             case 'T':  /* (*THEN) */
10688                 /* [19:06] <TimToady> :: is then */
10689                 if ( memEQs(start_verb,verb_len,"THEN") ) {
10690                     op = CUTGROUP;
10691                     RExC_seen |= REG_CUTGROUP_SEEN;
10692                 }
10693                 break;
10694             }
10695             if ( ! op ) {
10696                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10697                 vFAIL2utf8f(
10698                     "Unknown verb pattern '%"UTF8f"'",
10699                     UTF8fARG(UTF, verb_len, start_verb));
10700             }
10701             if ( arg_required && !start_arg ) {
10702                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
10703                     verb_len, start_verb);
10704             }
10705             if (internal_argval == -1) {
10706                 ret = reganode(pRExC_state, op, 0);
10707             } else {
10708                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
10709             }
10710             RExC_seen |= REG_VERBARG_SEEN;
10711             if ( ! SIZE_ONLY ) {
10712                 if (start_arg) {
10713                     SV *sv = newSVpvn( start_arg,
10714                                        RExC_parse - start_arg);
10715                     ARG(ret) = add_data( pRExC_state,
10716                                          STR_WITH_LEN("S"));
10717                     RExC_rxi->data->data[ARG(ret)]=(void*)sv;
10718                     ret->flags = 1;
10719                 } else {
10720                     ret->flags = 0;
10721                 }
10722                 if ( internal_argval != -1 )
10723                     ARG2L_SET(ret, internal_argval);
10724             }
10725             nextchar(pRExC_state);
10726             return ret;
10727         }
10728         else if (*RExC_parse == '?') { /* (?...) */
10729             bool is_logical = 0;
10730             const char * const seqstart = RExC_parse;
10731             const char * endptr;
10732             if (has_intervening_patws) {
10733                 RExC_parse++;
10734                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
10735             }
10736
10737             RExC_parse++;           /* past the '?' */
10738             paren = *RExC_parse;    /* might be a trailing NUL, if not
10739                                        well-formed */
10740             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10741             if (RExC_parse > RExC_end) {
10742                 paren = '\0';
10743             }
10744             ret = NULL;                 /* For look-ahead/behind. */
10745             switch (paren) {
10746
10747             case 'P':   /* (?P...) variants for those used to PCRE/Python */
10748                 paren = *RExC_parse;
10749                 if ( paren == '<') {    /* (?P<...>) named capture */
10750                     RExC_parse++;
10751                     if (RExC_parse >= RExC_end) {
10752                         vFAIL("Sequence (?P<... not terminated");
10753                     }
10754                     goto named_capture;
10755                 }
10756                 else if (paren == '>') {   /* (?P>name) named recursion */
10757                     RExC_parse++;
10758                     if (RExC_parse >= RExC_end) {
10759                         vFAIL("Sequence (?P>... not terminated");
10760                     }
10761                     goto named_recursion;
10762                 }
10763                 else if (paren == '=') {   /* (?P=...)  named backref */
10764                     RExC_parse++;
10765                     return handle_named_backref(pRExC_state, flagp,
10766                                                 parse_start, ')');
10767                 }
10768                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10769                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10770                 vFAIL3("Sequence (%.*s...) not recognized",
10771                                 RExC_parse-seqstart, seqstart);
10772                 NOT_REACHED; /*NOTREACHED*/
10773             case '<':           /* (?<...) */
10774                 if (*RExC_parse == '!')
10775                     paren = ',';
10776                 else if (*RExC_parse != '=')
10777               named_capture:
10778                 {               /* (?<...>) */
10779                     char *name_start;
10780                     SV *svname;
10781                     paren= '>';
10782                 /* FALLTHROUGH */
10783             case '\'':          /* (?'...') */
10784                     name_start = RExC_parse;
10785                     svname = reg_scan_name(pRExC_state,
10786                         SIZE_ONLY    /* reverse test from the others */
10787                         ? REG_RSN_RETURN_NAME
10788                         : REG_RSN_RETURN_NULL);
10789                     if (   RExC_parse == name_start
10790                         || RExC_parse >= RExC_end
10791                         || *RExC_parse != paren)
10792                     {
10793                         vFAIL2("Sequence (?%c... not terminated",
10794                             paren=='>' ? '<' : paren);
10795                     }
10796                     if (SIZE_ONLY) {
10797                         HE *he_str;
10798                         SV *sv_dat = NULL;
10799                         if (!svname) /* shouldn't happen */
10800                             Perl_croak(aTHX_
10801                                 "panic: reg_scan_name returned NULL");
10802                         if (!RExC_paren_names) {
10803                             RExC_paren_names= newHV();
10804                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
10805 #ifdef DEBUGGING
10806                             RExC_paren_name_list= newAV();
10807                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10808 #endif
10809                         }
10810                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10811                         if ( he_str )
10812                             sv_dat = HeVAL(he_str);
10813                         if ( ! sv_dat ) {
10814                             /* croak baby croak */
10815                             Perl_croak(aTHX_
10816                                 "panic: paren_name hash element allocation failed");
10817                         } else if ( SvPOK(sv_dat) ) {
10818                             /* (?|...) can mean we have dupes so scan to check
10819                                its already been stored. Maybe a flag indicating
10820                                we are inside such a construct would be useful,
10821                                but the arrays are likely to be quite small, so
10822                                for now we punt -- dmq */
10823                             IV count = SvIV(sv_dat);
10824                             I32 *pv = (I32*)SvPVX(sv_dat);
10825                             IV i;
10826                             for ( i = 0 ; i < count ; i++ ) {
10827                                 if ( pv[i] == RExC_npar ) {
10828                                     count = 0;
10829                                     break;
10830                                 }
10831                             }
10832                             if ( count ) {
10833                                 pv = (I32*)SvGROW(sv_dat,
10834                                                 SvCUR(sv_dat) + sizeof(I32)+1);
10835                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10836                                 pv[count] = RExC_npar;
10837                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10838                             }
10839                         } else {
10840                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
10841                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
10842                                                                 sizeof(I32));
10843                             SvIOK_on(sv_dat);
10844                             SvIV_set(sv_dat, 1);
10845                         }
10846 #ifdef DEBUGGING
10847                         /* Yes this does cause a memory leak in debugging Perls
10848                          * */
10849                         if (!av_store(RExC_paren_name_list,
10850                                       RExC_npar, SvREFCNT_inc(svname)))
10851                             SvREFCNT_dec_NN(svname);
10852 #endif
10853
10854                         /*sv_dump(sv_dat);*/
10855                     }
10856                     nextchar(pRExC_state);
10857                     paren = 1;
10858                     goto capturing_parens;
10859                 }
10860                 RExC_seen |= REG_LOOKBEHIND_SEEN;
10861                 RExC_in_lookbehind++;
10862                 RExC_parse++;
10863                 if (RExC_parse >= RExC_end) {
10864                     vFAIL("Sequence (?... not terminated");
10865                 }
10866
10867                 /* FALLTHROUGH */
10868             case '=':           /* (?=...) */
10869                 RExC_seen_zerolen++;
10870                 break;
10871             case '!':           /* (?!...) */
10872                 RExC_seen_zerolen++;
10873                 /* check if we're really just a "FAIL" assertion */
10874                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10875                                         FALSE /* Don't force to /x */ );
10876                 if (*RExC_parse == ')') {
10877                     ret=reganode(pRExC_state, OPFAIL, 0);
10878                     nextchar(pRExC_state);
10879                     return ret;
10880                 }
10881                 break;
10882             case '|':           /* (?|...) */
10883                 /* branch reset, behave like a (?:...) except that
10884                    buffers in alternations share the same numbers */
10885                 paren = ':';
10886                 after_freeze = freeze_paren = RExC_npar;
10887                 break;
10888             case ':':           /* (?:...) */
10889             case '>':           /* (?>...) */
10890                 break;
10891             case '$':           /* (?$...) */
10892             case '@':           /* (?@...) */
10893                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10894                 break;
10895             case '0' :           /* (?0) */
10896             case 'R' :           /* (?R) */
10897                 if (RExC_parse == RExC_end || *RExC_parse != ')')
10898                     FAIL("Sequence (?R) not terminated");
10899                 num = 0;
10900                 RExC_seen |= REG_RECURSE_SEEN;
10901                 *flagp |= POSTPONED;
10902                 goto gen_recurse_regop;
10903                 /*notreached*/
10904             /* named and numeric backreferences */
10905             case '&':            /* (?&NAME) */
10906                 parse_start = RExC_parse - 1;
10907               named_recursion:
10908                 {
10909                     SV *sv_dat = reg_scan_name(pRExC_state,
10910                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10911                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10912                 }
10913                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
10914                     vFAIL("Sequence (?&... not terminated");
10915                 goto gen_recurse_regop;
10916                 /* NOTREACHED */
10917             case '+':
10918                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10919                     RExC_parse++;
10920                     vFAIL("Illegal pattern");
10921                 }
10922                 goto parse_recursion;
10923                 /* NOTREACHED*/
10924             case '-': /* (?-1) */
10925                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10926                     RExC_parse--; /* rewind to let it be handled later */
10927                     goto parse_flags;
10928                 }
10929                 /* FALLTHROUGH */
10930             case '1': case '2': case '3': case '4': /* (?1) */
10931             case '5': case '6': case '7': case '8': case '9':
10932                 RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
10933               parse_recursion:
10934                 {
10935                     bool is_neg = FALSE;
10936                     UV unum;
10937                     parse_start = RExC_parse - 1; /* MJD */
10938                     if (*RExC_parse == '-') {
10939                         RExC_parse++;
10940                         is_neg = TRUE;
10941                     }
10942                     if (grok_atoUV(RExC_parse, &unum, &endptr)
10943                         && unum <= I32_MAX
10944                     ) {
10945                         num = (I32)unum;
10946                         RExC_parse = (char*)endptr;
10947                     } else
10948                         num = I32_MAX;
10949                     if (is_neg) {
10950                         /* Some limit for num? */
10951                         num = -num;
10952                     }
10953                 }
10954                 if (*RExC_parse!=')')
10955                     vFAIL("Expecting close bracket");
10956
10957               gen_recurse_regop:
10958                 if ( paren == '-' ) {
10959                     /*
10960                     Diagram of capture buffer numbering.
10961                     Top line is the normal capture buffer numbers
10962                     Bottom line is the negative indexing as from
10963                     the X (the (?-2))
10964
10965                     +   1 2    3 4 5 X          6 7
10966                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10967                     -   5 4    3 2 1 X          x x
10968
10969                     */
10970                     num = RExC_npar + num;
10971                     if (num < 1)  {
10972                         RExC_parse++;
10973                         vFAIL("Reference to nonexistent group");
10974                     }
10975                 } else if ( paren == '+' ) {
10976                     num = RExC_npar + num - 1;
10977                 }
10978                 /* We keep track how many GOSUB items we have produced.
10979                    To start off the ARG2L() of the GOSUB holds its "id",
10980                    which is used later in conjunction with RExC_recurse
10981                    to calculate the offset we need to jump for the GOSUB,
10982                    which it will store in the final representation.
10983                    We have to defer the actual calculation until much later
10984                    as the regop may move.
10985                  */
10986
10987                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10988                 if (!SIZE_ONLY) {
10989                     if (num > (I32)RExC_rx->nparens) {
10990                         RExC_parse++;
10991                         vFAIL("Reference to nonexistent group");
10992                     }
10993                     RExC_recurse_count++;
10994                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
10995                         "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10996                               22, "|    |", (int)(depth * 2 + 1), "",
10997                               (UV)ARG(ret), (IV)ARG2L(ret)));
10998                 }
10999                 RExC_seen |= REG_RECURSE_SEEN;
11000
11001                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
11002                 Set_Node_Offset(ret, parse_start); /* MJD */
11003
11004                 *flagp |= POSTPONED;
11005                 assert(*RExC_parse == ')');
11006                 nextchar(pRExC_state);
11007                 return ret;
11008
11009             /* NOTREACHED */
11010
11011             case '?':           /* (??...) */
11012                 is_logical = 1;
11013                 if (*RExC_parse != '{') {
11014                     RExC_parse += SKIP_IF_CHAR(RExC_parse);
11015                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11016                     vFAIL2utf8f(
11017                         "Sequence (%"UTF8f"...) not recognized",
11018                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11019                     NOT_REACHED; /*NOTREACHED*/
11020                 }
11021                 *flagp |= POSTPONED;
11022                 paren = '{';
11023                 RExC_parse++;
11024                 /* FALLTHROUGH */
11025             case '{':           /* (?{...}) */
11026             {
11027                 U32 n = 0;
11028                 struct reg_code_block *cb;
11029
11030                 RExC_seen_zerolen++;
11031
11032                 if (   !pRExC_state->num_code_blocks
11033                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
11034                     || pRExC_state->code_blocks[pRExC_state->code_index].start
11035                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11036                             - RExC_start)
11037                 ) {
11038                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
11039                         FAIL("panic: Sequence (?{...}): no code block found\n");
11040                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
11041                 }
11042                 /* this is a pre-compiled code block (?{...}) */
11043                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
11044                 RExC_parse = RExC_start + cb->end;
11045                 if (!SIZE_ONLY) {
11046                     OP *o = cb->block;
11047                     if (cb->src_regex) {
11048                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11049                         RExC_rxi->data->data[n] =
11050                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
11051                         RExC_rxi->data->data[n+1] = (void*)o;
11052                     }
11053                     else {
11054                         n = add_data(pRExC_state,
11055                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11056                         RExC_rxi->data->data[n] = (void*)o;
11057                     }
11058                 }
11059                 pRExC_state->code_index++;
11060                 nextchar(pRExC_state);
11061
11062                 if (is_logical) {
11063                     regnode *eval;
11064                     ret = reg_node(pRExC_state, LOGICAL);
11065
11066                     eval = reg2Lanode(pRExC_state, EVAL,
11067                                        n,
11068
11069                                        /* for later propagation into (??{})
11070                                         * return value */
11071                                        RExC_flags & RXf_PMf_COMPILETIME
11072                                       );
11073                     if (!SIZE_ONLY) {
11074                         ret->flags = 2;
11075                     }
11076                     REGTAIL(pRExC_state, ret, eval);
11077                     /* deal with the length of this later - MJD */
11078                     return ret;
11079                 }
11080                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11081                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
11082                 Set_Node_Offset(ret, parse_start);
11083                 return ret;
11084             }
11085             case '(':           /* (?(?{...})...) and (?(?=...)...) */
11086             {
11087                 int is_define= 0;
11088                 const int DEFINE_len = sizeof("DEFINE") - 1;
11089                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
11090                     if (   RExC_parse < RExC_end - 1
11091                         && (   RExC_parse[1] == '='
11092                             || RExC_parse[1] == '!'
11093                             || RExC_parse[1] == '<'
11094                             || RExC_parse[1] == '{')
11095                     ) { /* Lookahead or eval. */
11096                         I32 flag;
11097                         regnode *tail;
11098
11099                         ret = reg_node(pRExC_state, LOGICAL);
11100                         if (!SIZE_ONLY)
11101                             ret->flags = 1;
11102
11103                         tail = reg(pRExC_state, 1, &flag, depth+1);
11104                         if (flag & (RESTART_PASS1|NEED_UTF8)) {
11105                             *flagp = flag & (RESTART_PASS1|NEED_UTF8);
11106                             return NULL;
11107                         }
11108                         REGTAIL(pRExC_state, ret, tail);
11109                         goto insert_if;
11110                     }
11111                     /* Fall through to ‘Unknown switch condition’ at the
11112                        end of the if/else chain. */
11113                 }
11114                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11115                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11116                 {
11117                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
11118                     char *name_start= RExC_parse++;
11119                     U32 num = 0;
11120                     SV *sv_dat=reg_scan_name(pRExC_state,
11121                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11122                     if (   RExC_parse == name_start
11123                         || RExC_parse >= RExC_end
11124                         || *RExC_parse != ch)
11125                     {
11126                         vFAIL2("Sequence (?(%c... not terminated",
11127                             (ch == '>' ? '<' : ch));
11128                     }
11129                     RExC_parse++;
11130                     if (!SIZE_ONLY) {
11131                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11132                         RExC_rxi->data->data[num]=(void*)sv_dat;
11133                         SvREFCNT_inc_simple_void(sv_dat);
11134                     }
11135                     ret = reganode(pRExC_state,NGROUPP,num);
11136                     goto insert_if_check_paren;
11137                 }
11138                 else if (RExC_end - RExC_parse >= DEFINE_len
11139                         && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
11140                 {
11141                     ret = reganode(pRExC_state,DEFINEP,0);
11142                     RExC_parse += DEFINE_len;
11143                     is_define = 1;
11144                     goto insert_if_check_paren;
11145                 }
11146                 else if (RExC_parse[0] == 'R') {
11147                     RExC_parse++;
11148                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
11149                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11150                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11151                      */
11152                     parno = 0;
11153                     if (RExC_parse[0] == '0') {
11154                         parno = 1;
11155                         RExC_parse++;
11156                     }
11157                     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11158                         UV uv;
11159                         if (grok_atoUV(RExC_parse, &uv, &endptr)
11160                             && uv <= I32_MAX
11161                         ) {
11162                             parno = (I32)uv + 1;
11163                             RExC_parse = (char*)endptr;
11164                         }
11165                         /* else "Switch condition not recognized" below */
11166                     } else if (RExC_parse[0] == '&') {
11167                         SV *sv_dat;
11168                         RExC_parse++;
11169                         sv_dat = reg_scan_name(pRExC_state,
11170                             SIZE_ONLY
11171                             ? REG_RSN_RETURN_NULL
11172                             : REG_RSN_RETURN_DATA);
11173
11174                         /* we should only have a false sv_dat when
11175                          * SIZE_ONLY is true, and we always have false
11176                          * sv_dat when SIZE_ONLY is true.
11177                          * reg_scan_name() will VFAIL() if the name is
11178                          * unknown when SIZE_ONLY is false, and otherwise
11179                          * will return something, and when SIZE_ONLY is
11180                          * true, reg_scan_name() just parses the string,
11181                          * and doesnt return anything. (in theory) */
11182                         assert(SIZE_ONLY ? !sv_dat : !!sv_dat);
11183
11184                         if (sv_dat)
11185                             parno = 1 + *((I32 *)SvPVX(sv_dat));
11186                     }
11187                     ret = reganode(pRExC_state,INSUBP,parno);
11188                     goto insert_if_check_paren;
11189                 }
11190                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11191                     /* (?(1)...) */
11192                     char c;
11193                     UV uv;
11194                     if (grok_atoUV(RExC_parse, &uv, &endptr)
11195                         && uv <= I32_MAX
11196                     ) {
11197                         parno = (I32)uv;
11198                         RExC_parse = (char*)endptr;
11199                     }
11200                     else {
11201                         vFAIL("panic: grok_atoUV returned FALSE");
11202                     }
11203                     ret = reganode(pRExC_state, GROUPP, parno);
11204
11205                  insert_if_check_paren:
11206                     if (UCHARAT(RExC_parse) != ')') {
11207                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11208                         vFAIL("Switch condition not recognized");
11209                     }
11210                     nextchar(pRExC_state);
11211                   insert_if:
11212                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
11213                     br = regbranch(pRExC_state, &flags, 1,depth+1);
11214                     if (br == NULL) {
11215                         if (flags & (RESTART_PASS1|NEED_UTF8)) {
11216                             *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11217                             return NULL;
11218                         }
11219                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
11220                               (UV) flags);
11221                     } else
11222                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
11223                                                           LONGJMP, 0));
11224                     c = UCHARAT(RExC_parse);
11225                     nextchar(pRExC_state);
11226                     if (flags&HASWIDTH)
11227                         *flagp |= HASWIDTH;
11228                     if (c == '|') {
11229                         if (is_define)
11230                             vFAIL("(?(DEFINE)....) does not allow branches");
11231
11232                         /* Fake one for optimizer.  */
11233                         lastbr = reganode(pRExC_state, IFTHEN, 0);
11234
11235                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
11236                             if (flags & (RESTART_PASS1|NEED_UTF8)) {
11237                                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11238                                 return NULL;
11239                             }
11240                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
11241                                   (UV) flags);
11242                         }
11243                         REGTAIL(pRExC_state, ret, lastbr);
11244                         if (flags&HASWIDTH)
11245                             *flagp |= HASWIDTH;
11246                         c = UCHARAT(RExC_parse);
11247                         nextchar(pRExC_state);
11248                     }
11249                     else
11250                         lastbr = NULL;
11251                     if (c != ')') {
11252                         if (RExC_parse >= RExC_end)
11253                             vFAIL("Switch (?(condition)... not terminated");
11254                         else
11255                             vFAIL("Switch (?(condition)... contains too many branches");
11256                     }
11257                     ender = reg_node(pRExC_state, TAIL);
11258                     REGTAIL(pRExC_state, br, ender);
11259                     if (lastbr) {
11260                         REGTAIL(pRExC_state, lastbr, ender);
11261                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11262                     }
11263                     else
11264                         REGTAIL(pRExC_state, ret, ender);
11265                     RExC_size++; /* XXX WHY do we need this?!!
11266                                     For large programs it seems to be required
11267                                     but I can't figure out why. -- dmq*/
11268                     return ret;
11269                 }
11270                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11271                 vFAIL("Unknown switch condition (?(...))");
11272             }
11273             case '[':           /* (?[ ... ]) */
11274                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
11275                                          oregcomp_parse);
11276             case 0: /* A NUL */
11277                 RExC_parse--; /* for vFAIL to print correctly */
11278                 vFAIL("Sequence (? incomplete");
11279                 break;
11280             default: /* e.g., (?i) */
11281                 RExC_parse = (char *) seqstart + 1;
11282               parse_flags:
11283                 parse_lparen_question_flags(pRExC_state);
11284                 if (UCHARAT(RExC_parse) != ':') {
11285                     if (RExC_parse < RExC_end)
11286                         nextchar(pRExC_state);
11287                     *flagp = TRYAGAIN;
11288                     return NULL;
11289                 }
11290                 paren = ':';
11291                 nextchar(pRExC_state);
11292                 ret = NULL;
11293                 goto parse_rest;
11294             } /* end switch */
11295         }
11296         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
11297           capturing_parens:
11298             parno = RExC_npar;
11299             RExC_npar++;
11300
11301             ret = reganode(pRExC_state, OPEN, parno);
11302             if (!SIZE_ONLY ){
11303                 if (!RExC_nestroot)
11304                     RExC_nestroot = parno;
11305                 if (RExC_open_parens && !RExC_open_parens[parno])
11306                 {
11307                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11308                         "%*s%*s Setting open paren #%"IVdf" to %d\n",
11309                         22, "|    |", (int)(depth * 2 + 1), "",
11310                         (IV)parno, REG_NODE_NUM(ret)));
11311                     RExC_open_parens[parno]= ret;
11312                 }
11313             }
11314             Set_Node_Length(ret, 1); /* MJD */
11315             Set_Node_Offset(ret, RExC_parse); /* MJD */
11316             is_open = 1;
11317         } else {
11318             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
11319             paren = ':';
11320             ret = NULL;
11321         }
11322     }
11323     else                        /* ! paren */
11324         ret = NULL;
11325
11326    parse_rest:
11327     /* Pick up the branches, linking them together. */
11328     parse_start = RExC_parse;   /* MJD */
11329     br = regbranch(pRExC_state, &flags, 1,depth+1);
11330
11331     /*     branch_len = (paren != 0); */
11332
11333     if (br == NULL) {
11334         if (flags & (RESTART_PASS1|NEED_UTF8)) {
11335             *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11336             return NULL;
11337         }
11338         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
11339     }
11340     if (*RExC_parse == '|') {
11341         if (!SIZE_ONLY && RExC_extralen) {
11342             reginsert(pRExC_state, BRANCHJ, br, depth+1);
11343         }
11344         else {                  /* MJD */
11345             reginsert(pRExC_state, BRANCH, br, depth+1);
11346             Set_Node_Length(br, paren != 0);
11347             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
11348         }
11349         have_branch = 1;
11350         if (SIZE_ONLY)
11351             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
11352     }
11353     else if (paren == ':') {
11354         *flagp |= flags&SIMPLE;
11355     }
11356     if (is_open) {                              /* Starts with OPEN. */
11357         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
11358     }
11359     else if (paren != '?')              /* Not Conditional */
11360         ret = br;
11361     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11362     lastbr = br;
11363     while (*RExC_parse == '|') {
11364         if (!SIZE_ONLY && RExC_extralen) {
11365             ender = reganode(pRExC_state, LONGJMP,0);
11366
11367             /* Append to the previous. */
11368             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11369         }
11370         if (SIZE_ONLY)
11371             RExC_extralen += 2;         /* Account for LONGJMP. */
11372         nextchar(pRExC_state);
11373         if (freeze_paren) {
11374             if (RExC_npar > after_freeze)
11375                 after_freeze = RExC_npar;
11376             RExC_npar = freeze_paren;
11377         }
11378         br = regbranch(pRExC_state, &flags, 0, depth+1);
11379
11380         if (br == NULL) {
11381             if (flags & (RESTART_PASS1|NEED_UTF8)) {
11382                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11383                 return NULL;
11384             }
11385             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
11386         }
11387         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
11388         lastbr = br;
11389         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11390     }
11391
11392     if (have_branch || paren != ':') {
11393         /* Make a closing node, and hook it on the end. */
11394         switch (paren) {
11395         case ':':
11396             ender = reg_node(pRExC_state, TAIL);
11397             break;
11398         case 1: case 2:
11399             ender = reganode(pRExC_state, CLOSE, parno);
11400             if ( RExC_close_parens ) {
11401                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11402                         "%*s%*s Setting close paren #%"IVdf" to %d\n",
11403                         22, "|    |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
11404                 RExC_close_parens[parno]= ender;
11405                 if (RExC_nestroot == parno)
11406                     RExC_nestroot = 0;
11407             }
11408             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
11409             Set_Node_Length(ender,1); /* MJD */
11410             break;
11411         case '<':
11412         case ',':
11413         case '=':
11414         case '!':
11415             *flagp &= ~HASWIDTH;
11416             /* FALLTHROUGH */
11417         case '>':
11418             ender = reg_node(pRExC_state, SUCCEED);
11419             break;
11420         case 0:
11421             ender = reg_node(pRExC_state, END);
11422             if (!SIZE_ONLY) {
11423                 assert(!RExC_end_op); /* there can only be one! */
11424                 RExC_end_op = ender;
11425                 if (RExC_close_parens) {
11426                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11427                         "%*s%*s Setting close paren #0 (END) to %d\n",
11428                         22, "|    |", (int)(depth * 2 + 1), "", REG_NODE_NUM(ender)));
11429
11430                     RExC_close_parens[0]= ender;
11431                 }
11432             }
11433             break;
11434         }
11435         DEBUG_PARSE_r(if (!SIZE_ONLY) {
11436             DEBUG_PARSE_MSG("lsbr");
11437             regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
11438             regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11439             Perl_re_printf( aTHX_  "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
11440                           SvPV_nolen_const(RExC_mysv1),
11441                           (IV)REG_NODE_NUM(lastbr),
11442                           SvPV_nolen_const(RExC_mysv2),
11443                           (IV)REG_NODE_NUM(ender),
11444                           (IV)(ender - lastbr)
11445             );
11446         });
11447         REGTAIL(pRExC_state, lastbr, ender);
11448
11449         if (have_branch && !SIZE_ONLY) {
11450             char is_nothing= 1;
11451             if (depth==1)
11452                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
11453
11454             /* Hook the tails of the branches to the closing node. */
11455             for (br = ret; br; br = regnext(br)) {
11456                 const U8 op = PL_regkind[OP(br)];
11457                 if (op == BRANCH) {
11458                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
11459                     if ( OP(NEXTOPER(br)) != NOTHING
11460                          || regnext(NEXTOPER(br)) != ender)
11461                         is_nothing= 0;
11462                 }
11463                 else if (op == BRANCHJ) {
11464                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
11465                     /* for now we always disable this optimisation * /
11466                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
11467                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
11468                     */
11469                         is_nothing= 0;
11470                 }
11471             }
11472             if (is_nothing) {
11473                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
11474                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
11475                     DEBUG_PARSE_MSG("NADA");
11476                     regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
11477                     regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11478                     Perl_re_printf( aTHX_  "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
11479                                   SvPV_nolen_const(RExC_mysv1),
11480                                   (IV)REG_NODE_NUM(ret),
11481                                   SvPV_nolen_const(RExC_mysv2),
11482                                   (IV)REG_NODE_NUM(ender),
11483                                   (IV)(ender - ret)
11484                     );
11485                 });
11486                 OP(br)= NOTHING;
11487                 if (OP(ender) == TAIL) {
11488                     NEXT_OFF(br)= 0;
11489                     RExC_emit= br + 1;
11490                 } else {
11491                     regnode *opt;
11492                     for ( opt= br + 1; opt < ender ; opt++ )
11493                         OP(opt)= OPTIMIZED;
11494                     NEXT_OFF(br)= ender - br;
11495                 }
11496             }
11497         }
11498     }
11499
11500     {
11501         const char *p;
11502         static const char parens[] = "=!<,>";
11503
11504         if (paren && (p = strchr(parens, paren))) {
11505             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
11506             int flag = (p - parens) > 1;
11507
11508             if (paren == '>')
11509                 node = SUSPEND, flag = 0;
11510             reginsert(pRExC_state, node,ret, depth+1);
11511             Set_Node_Cur_Length(ret, parse_start);
11512             Set_Node_Offset(ret, parse_start + 1);
11513             ret->flags = flag;
11514             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
11515         }
11516     }
11517
11518     /* Check for proper termination. */
11519     if (paren) {
11520         /* restore original flags, but keep (?p) and, if we've changed from /d
11521          * rules to /u, keep the /u */
11522         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
11523         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
11524             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
11525         }
11526         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
11527             RExC_parse = oregcomp_parse;
11528             vFAIL("Unmatched (");
11529         }
11530         nextchar(pRExC_state);
11531     }
11532     else if (!paren && RExC_parse < RExC_end) {
11533         if (*RExC_parse == ')') {
11534             RExC_parse++;
11535             vFAIL("Unmatched )");
11536         }
11537         else
11538             FAIL("Junk on end of regexp");      /* "Can't happen". */
11539         NOT_REACHED; /* NOTREACHED */
11540     }
11541
11542     if (RExC_in_lookbehind) {
11543         RExC_in_lookbehind--;
11544     }
11545     if (after_freeze > RExC_npar)
11546         RExC_npar = after_freeze;
11547     return(ret);
11548 }
11549
11550 /*
11551  - regbranch - one alternative of an | operator
11552  *
11553  * Implements the concatenation operator.
11554  *
11555  * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11556  * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11557  */
11558 STATIC regnode *
11559 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
11560 {
11561     regnode *ret;
11562     regnode *chain = NULL;
11563     regnode *latest;
11564     I32 flags = 0, c = 0;
11565     GET_RE_DEBUG_FLAGS_DECL;
11566
11567     PERL_ARGS_ASSERT_REGBRANCH;
11568
11569     DEBUG_PARSE("brnc");
11570
11571     if (first)
11572         ret = NULL;
11573     else {
11574         if (!SIZE_ONLY && RExC_extralen)
11575             ret = reganode(pRExC_state, BRANCHJ,0);
11576         else {
11577             ret = reg_node(pRExC_state, BRANCH);
11578             Set_Node_Length(ret, 1);
11579         }
11580     }
11581
11582     if (!first && SIZE_ONLY)
11583         RExC_extralen += 1;                     /* BRANCHJ */
11584
11585     *flagp = WORST;                     /* Tentatively. */
11586
11587     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11588                             FALSE /* Don't force to /x */ );
11589     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
11590         flags &= ~TRYAGAIN;
11591         latest = regpiece(pRExC_state, &flags,depth+1);
11592         if (latest == NULL) {
11593             if (flags & TRYAGAIN)
11594                 continue;
11595             if (flags & (RESTART_PASS1|NEED_UTF8)) {
11596                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11597                 return NULL;
11598             }
11599             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
11600         }
11601         else if (ret == NULL)
11602             ret = latest;
11603         *flagp |= flags&(HASWIDTH|POSTPONED);
11604         if (chain == NULL)      /* First piece. */
11605             *flagp |= flags&SPSTART;
11606         else {
11607             /* FIXME adding one for every branch after the first is probably
11608              * excessive now we have TRIE support. (hv) */
11609             MARK_NAUGHTY(1);
11610             REGTAIL(pRExC_state, chain, latest);
11611         }
11612         chain = latest;
11613         c++;
11614     }
11615     if (chain == NULL) {        /* Loop ran zero times. */
11616         chain = reg_node(pRExC_state, NOTHING);
11617         if (ret == NULL)
11618             ret = chain;
11619     }
11620     if (c == 1) {
11621         *flagp |= flags&SIMPLE;
11622     }
11623
11624     return ret;
11625 }
11626
11627 /*
11628  - regpiece - something followed by possible [*+?]
11629  *
11630  * Note that the branching code sequences used for ? and the general cases
11631  * of * and + are somewhat optimized:  they use the same NOTHING node as
11632  * both the endmarker for their branch list and the body of the last branch.
11633  * It might seem that this node could be dispensed with entirely, but the
11634  * endmarker role is not redundant.
11635  *
11636  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
11637  * TRYAGAIN.
11638  * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11639  * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11640  */
11641 STATIC regnode *
11642 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11643 {
11644     regnode *ret;
11645     char op;
11646     char *next;
11647     I32 flags;
11648     const char * const origparse = RExC_parse;
11649     I32 min;
11650     I32 max = REG_INFTY;
11651 #ifdef RE_TRACK_PATTERN_OFFSETS
11652     char *parse_start;
11653 #endif
11654     const char *maxpos = NULL;
11655     UV uv;
11656
11657     /* Save the original in case we change the emitted regop to a FAIL. */
11658     regnode * const orig_emit = RExC_emit;
11659
11660     GET_RE_DEBUG_FLAGS_DECL;
11661
11662     PERL_ARGS_ASSERT_REGPIECE;
11663
11664     DEBUG_PARSE("piec");
11665
11666     ret = regatom(pRExC_state, &flags,depth+1);
11667     if (ret == NULL) {
11668         if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
11669             *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
11670         else
11671             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
11672         return(NULL);
11673     }
11674
11675     op = *RExC_parse;
11676
11677     if (op == '{' && regcurly(RExC_parse)) {
11678         maxpos = NULL;
11679 #ifdef RE_TRACK_PATTERN_OFFSETS
11680         parse_start = RExC_parse; /* MJD */
11681 #endif
11682         next = RExC_parse + 1;
11683         while (isDIGIT(*next) || *next == ',') {
11684             if (*next == ',') {
11685                 if (maxpos)
11686                     break;
11687                 else
11688                     maxpos = next;
11689             }
11690             next++;
11691         }
11692         if (*next == '}') {             /* got one */
11693             const char* endptr;
11694             if (!maxpos)
11695                 maxpos = next;
11696             RExC_parse++;
11697             if (isDIGIT(*RExC_parse)) {
11698                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
11699                     vFAIL("Invalid quantifier in {,}");
11700                 if (uv >= REG_INFTY)
11701                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11702                 min = (I32)uv;
11703             } else {
11704                 min = 0;
11705             }
11706             if (*maxpos == ',')
11707                 maxpos++;
11708             else
11709                 maxpos = RExC_parse;
11710             if (isDIGIT(*maxpos)) {
11711                 if (!grok_atoUV(maxpos, &uv, &endptr))
11712                     vFAIL("Invalid quantifier in {,}");
11713                 if (uv >= REG_INFTY)
11714                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11715                 max = (I32)uv;
11716             } else {
11717                 max = REG_INFTY;                /* meaning "infinity" */
11718             }
11719             RExC_parse = next;
11720             nextchar(pRExC_state);
11721             if (max < min) {    /* If can't match, warn and optimize to fail
11722                                    unconditionally */
11723                 if (SIZE_ONLY) {
11724
11725                     /* We can't back off the size because we have to reserve
11726                      * enough space for all the things we are about to throw
11727                      * away, but we can shrink it by the amount we are about
11728                      * to re-use here */
11729                     RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
11730                 }
11731                 else {
11732                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
11733                     RExC_emit = orig_emit;
11734                 }
11735                 ret = reganode(pRExC_state, OPFAIL, 0);
11736                 return ret;
11737             }
11738             else if (min == max && *RExC_parse == '?')
11739             {
11740                 if (PASS2) {
11741                     ckWARN2reg(RExC_parse + 1,
11742                                "Useless use of greediness modifier '%c'",
11743                                *RExC_parse);
11744                 }
11745             }
11746
11747           do_curly:
11748             if ((flags&SIMPLE)) {
11749                 if (min == 0 && max == REG_INFTY) {
11750                     reginsert(pRExC_state, STAR, ret, depth+1);
11751                     ret->flags = 0;
11752                     MARK_NAUGHTY(4);
11753                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11754                     goto nest_check;
11755                 }
11756                 if (min == 1 && max == REG_INFTY) {
11757                     reginsert(pRExC_state, PLUS, ret, depth+1);
11758                     ret->flags = 0;
11759                     MARK_NAUGHTY(3);
11760                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11761                     goto nest_check;
11762                 }
11763                 MARK_NAUGHTY_EXP(2, 2);
11764                 reginsert(pRExC_state, CURLY, ret, depth+1);
11765                 Set_Node_Offset(ret, parse_start+1); /* MJD */
11766                 Set_Node_Cur_Length(ret, parse_start);
11767             }
11768             else {
11769                 regnode * const w = reg_node(pRExC_state, WHILEM);
11770
11771                 w->flags = 0;
11772                 REGTAIL(pRExC_state, ret, w);
11773                 if (!SIZE_ONLY && RExC_extralen) {
11774                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
11775                     reginsert(pRExC_state, NOTHING,ret, depth+1);
11776                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
11777                 }
11778                 reginsert(pRExC_state, CURLYX,ret, depth+1);
11779                                 /* MJD hk */
11780                 Set_Node_Offset(ret, parse_start+1);
11781                 Set_Node_Length(ret,
11782                                 op == '{' ? (RExC_parse - parse_start) : 1);
11783
11784                 if (!SIZE_ONLY && RExC_extralen)
11785                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
11786                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
11787                 if (SIZE_ONLY)
11788                     RExC_whilem_seen++, RExC_extralen += 3;
11789                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
11790             }
11791             ret->flags = 0;
11792
11793             if (min > 0)
11794                 *flagp = WORST;
11795             if (max > 0)
11796                 *flagp |= HASWIDTH;
11797             if (!SIZE_ONLY) {
11798                 ARG1_SET(ret, (U16)min);
11799                 ARG2_SET(ret, (U16)max);
11800             }
11801             if (max == REG_INFTY)
11802                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11803
11804             goto nest_check;
11805         }
11806     }
11807
11808     if (!ISMULT1(op)) {
11809         *flagp = flags;
11810         return(ret);
11811     }
11812
11813 #if 0                           /* Now runtime fix should be reliable. */
11814
11815     /* if this is reinstated, don't forget to put this back into perldiag:
11816
11817             =item Regexp *+ operand could be empty at {#} in regex m/%s/
11818
11819            (F) The part of the regexp subject to either the * or + quantifier
11820            could match an empty string. The {#} shows in the regular
11821            expression about where the problem was discovered.
11822
11823     */
11824
11825     if (!(flags&HASWIDTH) && op != '?')
11826       vFAIL("Regexp *+ operand could be empty");
11827 #endif
11828
11829 #ifdef RE_TRACK_PATTERN_OFFSETS
11830     parse_start = RExC_parse;
11831 #endif
11832     nextchar(pRExC_state);
11833
11834     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
11835
11836     if (op == '*') {
11837         min = 0;
11838         goto do_curly;
11839     }
11840     else if (op == '+') {
11841         min = 1;
11842         goto do_curly;
11843     }
11844     else if (op == '?') {
11845         min = 0; max = 1;
11846         goto do_curly;
11847     }
11848   nest_check:
11849     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
11850         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11851         ckWARN2reg(RExC_parse,
11852                    "%"UTF8f" matches null string many times",
11853                    UTF8fARG(UTF, (RExC_parse >= origparse
11854                                  ? RExC_parse - origparse
11855                                  : 0),
11856                    origparse));
11857         (void)ReREFCNT_inc(RExC_rx_sv);
11858     }
11859
11860     if (*RExC_parse == '?') {
11861         nextchar(pRExC_state);
11862         reginsert(pRExC_state, MINMOD, ret, depth+1);
11863         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11864     }
11865     else if (*RExC_parse == '+') {
11866         regnode *ender;
11867         nextchar(pRExC_state);
11868         ender = reg_node(pRExC_state, SUCCEED);
11869         REGTAIL(pRExC_state, ret, ender);
11870         reginsert(pRExC_state, SUSPEND, ret, depth+1);
11871         ret->flags = 0;
11872         ender = reg_node(pRExC_state, TAIL);
11873         REGTAIL(pRExC_state, ret, ender);
11874     }
11875
11876     if (ISMULT2(RExC_parse)) {
11877         RExC_parse++;
11878         vFAIL("Nested quantifiers");
11879     }
11880
11881     return(ret);
11882 }
11883
11884 STATIC bool
11885 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11886                 regnode ** node_p,
11887                 UV * code_point_p,
11888                 int * cp_count,
11889                 I32 * flagp,
11890                 const bool strict,
11891                 const U32 depth
11892     )
11893 {
11894  /* This routine teases apart the various meanings of \N and returns
11895   * accordingly.  The input parameters constrain which meaning(s) is/are valid
11896   * in the current context.
11897   *
11898   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11899   *
11900   * If <code_point_p> is not NULL, the context is expecting the result to be a
11901   * single code point.  If this \N instance turns out to a single code point,
11902   * the function returns TRUE and sets *code_point_p to that code point.
11903   *
11904   * If <node_p> is not NULL, the context is expecting the result to be one of
11905   * the things representable by a regnode.  If this \N instance turns out to be
11906   * one such, the function generates the regnode, returns TRUE and sets *node_p
11907   * to point to that regnode.
11908   *
11909   * If this instance of \N isn't legal in any context, this function will
11910   * generate a fatal error and not return.
11911   *
11912   * On input, RExC_parse should point to the first char following the \N at the
11913   * time of the call.  On successful return, RExC_parse will have been updated
11914   * to point to just after the sequence identified by this routine.  Also
11915   * *flagp has been updated as needed.
11916   *
11917   * When there is some problem with the current context and this \N instance,
11918   * the function returns FALSE, without advancing RExC_parse, nor setting
11919   * *node_p, nor *code_point_p, nor *flagp.
11920   *
11921   * If <cp_count> is not NULL, the caller wants to know the length (in code
11922   * points) that this \N sequence matches.  This is set even if the function
11923   * returns FALSE, as detailed below.
11924   *
11925   * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11926   *
11927   * Probably the most common case is for the \N to specify a single code point.
11928   * *cp_count will be set to 1, and *code_point_p will be set to that code
11929   * point.
11930   *
11931   * Another possibility is for the input to be an empty \N{}, which for
11932   * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
11933   * will be set to a generated NOTHING node.
11934   *
11935   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11936   * set to 0. *node_p will be set to a generated REG_ANY node.
11937   *
11938   * The fourth possibility is that \N resolves to a sequence of more than one
11939   * code points.  *cp_count will be set to the number of code points in the
11940   * sequence. *node_p * will be set to a generated node returned by this
11941   * function calling S_reg().
11942   *
11943   * The final possibility is that it is premature to be calling this function;
11944   * that pass1 needs to be restarted.  This can happen when this changes from
11945   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
11946   * latter occurs only when the fourth possibility would otherwise be in
11947   * effect, and is because one of those code points requires the pattern to be
11948   * recompiled as UTF-8.  The function returns FALSE, and sets the
11949   * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate.  When this
11950   * happens, the caller needs to desist from continuing parsing, and return
11951   * this information to its caller.  This is not set for when there is only one
11952   * code point, as this can be called as part of an ANYOF node, and they can
11953   * store above-Latin1 code points without the pattern having to be in UTF-8.
11954   *
11955   * For non-single-quoted regexes, the tokenizer has resolved character and
11956   * sequence names inside \N{...} into their Unicode values, normalizing the
11957   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11958   * hex-represented code points in the sequence.  This is done there because
11959   * the names can vary based on what charnames pragma is in scope at the time,
11960   * so we need a way to take a snapshot of what they resolve to at the time of
11961   * the original parse. [perl #56444].
11962   *
11963   * That parsing is skipped for single-quoted regexes, so we may here get
11964   * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
11965   * parser.  But if the single-quoted regex is something like '\N{U+41}', that
11966   * is legal and handled here.  The code point is Unicode, and has to be
11967   * translated into the native character set for non-ASCII platforms.
11968   */
11969
11970     char * endbrace;    /* points to '}' following the name */
11971     char *endchar;      /* Points to '.' or '}' ending cur char in the input
11972                            stream */
11973     char* p = RExC_parse; /* Temporary */
11974
11975     GET_RE_DEBUG_FLAGS_DECL;
11976
11977     PERL_ARGS_ASSERT_GROK_BSLASH_N;
11978
11979     GET_RE_DEBUG_FLAGS;
11980
11981     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
11982     assert(! (node_p && cp_count));               /* At most 1 should be set */
11983
11984     if (cp_count) {     /* Initialize return for the most common case */
11985         *cp_count = 1;
11986     }
11987
11988     /* The [^\n] meaning of \N ignores spaces and comments under the /x
11989      * modifier.  The other meanings do not, so use a temporary until we find
11990      * out which we are being called with */
11991     skip_to_be_ignored_text(pRExC_state, &p,
11992                             FALSE /* Don't force to /x */ );
11993
11994     /* Disambiguate between \N meaning a named character versus \N meaning
11995      * [^\n].  The latter is assumed when the {...} following the \N is a legal
11996      * quantifier, or there is no '{' at all */
11997     if (*p != '{' || regcurly(p)) {
11998         RExC_parse = p;
11999         if (cp_count) {
12000             *cp_count = -1;
12001         }
12002
12003         if (! node_p) {
12004             return FALSE;
12005         }
12006
12007         *node_p = reg_node(pRExC_state, REG_ANY);
12008         *flagp |= HASWIDTH|SIMPLE;
12009         MARK_NAUGHTY(1);
12010         Set_Node_Length(*node_p, 1); /* MJD */
12011         return TRUE;
12012     }
12013
12014     /* Here, we have decided it should be a named character or sequence */
12015
12016     /* The test above made sure that the next real character is a '{', but
12017      * under the /x modifier, it could be separated by space (or a comment and
12018      * \n) and this is not allowed (for consistency with \x{...} and the
12019      * tokenizer handling of \N{NAME}). */
12020     if (*RExC_parse != '{') {
12021         vFAIL("Missing braces on \\N{}");
12022     }
12023
12024     RExC_parse++;       /* Skip past the '{' */
12025
12026     if (! (endbrace = strchr(RExC_parse, '}'))  /* no trailing brace */
12027         || ! (endbrace == RExC_parse            /* nothing between the {} */
12028               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked... */
12029                   && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
12030                                                        error msg) */
12031     {
12032         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
12033         vFAIL("\\N{NAME} must be resolved by the lexer");
12034     }
12035
12036     REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
12037                                         semantics */
12038
12039     if (endbrace == RExC_parse) {   /* empty: \N{} */
12040         if (strict) {
12041             RExC_parse++;   /* Position after the "}" */
12042             vFAIL("Zero length \\N{}");
12043         }
12044         if (cp_count) {
12045             *cp_count = 0;
12046         }
12047         nextchar(pRExC_state);
12048         if (! node_p) {
12049             return FALSE;
12050         }
12051
12052         *node_p = reg_node(pRExC_state,NOTHING);
12053         return TRUE;
12054     }
12055
12056     RExC_parse += 2;    /* Skip past the 'U+' */
12057
12058     /* Because toke.c has generated a special construct for us guaranteed not
12059      * to have NULs, we can use a str function */
12060     endchar = RExC_parse + strcspn(RExC_parse, ".}");
12061
12062     /* Code points are separated by dots.  If none, there is only one code
12063      * point, and is terminated by the brace */
12064
12065     if (endchar >= endbrace) {
12066         STRLEN length_of_hex;
12067         I32 grok_hex_flags;
12068
12069         /* Here, exactly one code point.  If that isn't what is wanted, fail */
12070         if (! code_point_p) {
12071             RExC_parse = p;
12072             return FALSE;
12073         }
12074
12075         /* Convert code point from hex */
12076         length_of_hex = (STRLEN)(endchar - RExC_parse);
12077         grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
12078                            | PERL_SCAN_DISALLOW_PREFIX
12079
12080                              /* No errors in the first pass (See [perl
12081                               * #122671].)  We let the code below find the
12082                               * errors when there are multiple chars. */
12083                            | ((SIZE_ONLY)
12084                               ? PERL_SCAN_SILENT_ILLDIGIT
12085                               : 0);
12086
12087         /* This routine is the one place where both single- and double-quotish
12088          * \N{U+xxxx} are evaluated.  The value is a Unicode code point which
12089          * must be converted to native. */
12090         *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
12091                                          &length_of_hex,
12092                                          &grok_hex_flags,
12093                                          NULL));
12094
12095         /* The tokenizer should have guaranteed validity, but it's possible to
12096          * bypass it by using single quoting, so check.  Don't do the check
12097          * here when there are multiple chars; we do it below anyway. */
12098         if (length_of_hex == 0
12099             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
12100         {
12101             RExC_parse += length_of_hex;        /* Includes all the valid */
12102             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
12103                             ? UTF8SKIP(RExC_parse)
12104                             : 1;
12105             /* Guard against malformed utf8 */
12106             if (RExC_parse >= endchar) {
12107                 RExC_parse = endchar;
12108             }
12109             vFAIL("Invalid hexadecimal number in \\N{U+...}");
12110         }
12111
12112         RExC_parse = endbrace + 1;
12113         return TRUE;
12114     }
12115     else {  /* Is a multiple character sequence */
12116         SV * substitute_parse;
12117         STRLEN len;
12118         char *orig_end = RExC_end;
12119         char *save_start = RExC_start;
12120         I32 flags;
12121
12122         /* Count the code points, if desired, in the sequence */
12123         if (cp_count) {
12124             *cp_count = 0;
12125             while (RExC_parse < endbrace) {
12126                 /* Point to the beginning of the next character in the sequence. */
12127                 RExC_parse = endchar + 1;
12128                 endchar = RExC_parse + strcspn(RExC_parse, ".}");
12129                 (*cp_count)++;
12130             }
12131         }
12132
12133         /* Fail if caller doesn't want to handle a multi-code-point sequence.
12134          * But don't backup up the pointer if the caller want to know how many
12135          * code points there are (they can then handle things) */
12136         if (! node_p) {
12137             if (! cp_count) {
12138                 RExC_parse = p;
12139             }
12140             return FALSE;
12141         }
12142
12143         /* What is done here is to convert this to a sub-pattern of the form
12144          * \x{char1}\x{char2}...  and then call reg recursively to parse it
12145          * (enclosing in "(?: ... )" ).  That way, it retains its atomicness,
12146          * while not having to worry about special handling that some code
12147          * points may have. */
12148
12149         substitute_parse = newSVpvs("?:");
12150
12151         while (RExC_parse < endbrace) {
12152
12153             /* Convert to notation the rest of the code understands */
12154             sv_catpv(substitute_parse, "\\x{");
12155             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
12156             sv_catpv(substitute_parse, "}");
12157
12158             /* Point to the beginning of the next character in the sequence. */
12159             RExC_parse = endchar + 1;
12160             endchar = RExC_parse + strcspn(RExC_parse, ".}");
12161
12162         }
12163         sv_catpv(substitute_parse, ")");
12164
12165         RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
12166                                                              len);
12167
12168         /* Don't allow empty number */
12169         if (len < (STRLEN) 8) {
12170             RExC_parse = endbrace;
12171             vFAIL("Invalid hexadecimal number in \\N{U+...}");
12172         }
12173         RExC_end = RExC_parse + len;
12174
12175         /* The values are Unicode, and therefore not subject to recoding, but
12176          * have to be converted to native on a non-Unicode (meaning non-ASCII)
12177          * platform. */
12178         RExC_override_recoding = 1;
12179 #ifdef EBCDIC
12180         RExC_recode_x_to_native = 1;
12181 #endif
12182
12183         if (node_p) {
12184             if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
12185                 if (flags & (RESTART_PASS1|NEED_UTF8)) {
12186                     *flagp = flags & (RESTART_PASS1|NEED_UTF8);
12187                     return FALSE;
12188                 }
12189                 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
12190                     (UV) flags);
12191             }
12192             *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12193         }
12194
12195         /* Restore the saved values */
12196         RExC_start = RExC_adjusted_start = save_start;
12197         RExC_parse = endbrace;
12198         RExC_end = orig_end;
12199         RExC_override_recoding = 0;
12200 #ifdef EBCDIC
12201         RExC_recode_x_to_native = 0;
12202 #endif
12203
12204         SvREFCNT_dec_NN(substitute_parse);
12205         nextchar(pRExC_state);
12206
12207         return TRUE;
12208     }
12209 }
12210
12211
12212 PERL_STATIC_INLINE U8
12213 S_compute_EXACTish(RExC_state_t *pRExC_state)
12214 {
12215     U8 op;
12216
12217     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
12218
12219     if (! FOLD) {
12220         return (LOC)
12221                 ? EXACTL
12222                 : EXACT;
12223     }
12224
12225     op = get_regex_charset(RExC_flags);
12226     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
12227         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
12228                  been, so there is no hole */
12229     }
12230
12231     return op + EXACTF;
12232 }
12233
12234 PERL_STATIC_INLINE void
12235 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
12236                          regnode *node, I32* flagp, STRLEN len, UV code_point,
12237                          bool downgradable)
12238 {
12239     /* This knows the details about sizing an EXACTish node, setting flags for
12240      * it (by setting <*flagp>, and potentially populating it with a single
12241      * character.
12242      *
12243      * If <len> (the length in bytes) is non-zero, this function assumes that
12244      * the node has already been populated, and just does the sizing.  In this
12245      * case <code_point> should be the final code point that has already been
12246      * placed into the node.  This value will be ignored except that under some
12247      * circumstances <*flagp> is set based on it.
12248      *
12249      * If <len> is zero, the function assumes that the node is to contain only
12250      * the single character given by <code_point> and calculates what <len>
12251      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
12252      * additionally will populate the node's STRING with <code_point> or its
12253      * fold if folding.
12254      *
12255      * In both cases <*flagp> is appropriately set
12256      *
12257      * It knows that under FOLD, the Latin Sharp S and UTF characters above
12258      * 255, must be folded (the former only when the rules indicate it can
12259      * match 'ss')
12260      *
12261      * When it does the populating, it looks at the flag 'downgradable'.  If
12262      * true with a node that folds, it checks if the single code point
12263      * participates in a fold, and if not downgrades the node to an EXACT.
12264      * This helps the optimizer */
12265
12266     bool len_passed_in = cBOOL(len != 0);
12267     U8 character[UTF8_MAXBYTES_CASE+1];
12268
12269     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
12270
12271     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
12272      * sizing difference, and is extra work that is thrown away */
12273     if (downgradable && ! PASS2) {
12274         downgradable = FALSE;
12275     }
12276
12277     if (! len_passed_in) {
12278         if (UTF) {
12279             if (UVCHR_IS_INVARIANT(code_point)) {
12280                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
12281                     *character = (U8) code_point;
12282                 }
12283                 else { /* Here is /i and not /l. (toFOLD() is defined on just
12284                           ASCII, which isn't the same thing as INVARIANT on
12285                           EBCDIC, but it works there, as the extra invariants
12286                           fold to themselves) */
12287                     *character = toFOLD((U8) code_point);
12288
12289                     /* We can downgrade to an EXACT node if this character
12290                      * isn't a folding one.  Note that this assumes that
12291                      * nothing above Latin1 folds to some other invariant than
12292                      * one of these alphabetics; otherwise we would also have
12293                      * to check:
12294                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12295                      *      || ASCII_FOLD_RESTRICTED))
12296                      */
12297                     if (downgradable && PL_fold[code_point] == code_point) {
12298                         OP(node) = EXACT;
12299                     }
12300                 }
12301                 len = 1;
12302             }
12303             else if (FOLD && (! LOC
12304                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
12305             {   /* Folding, and ok to do so now */
12306                 UV folded = _to_uni_fold_flags(
12307                                    code_point,
12308                                    character,
12309                                    &len,
12310                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12311                                                       ? FOLD_FLAGS_NOMIX_ASCII
12312                                                       : 0));
12313                 if (downgradable
12314                     && folded == code_point /* This quickly rules out many
12315                                                cases, avoiding the
12316                                                _invlist_contains_cp() overhead
12317                                                for those.  */
12318                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
12319                 {
12320                     OP(node) = (LOC)
12321                                ? EXACTL
12322                                : EXACT;
12323                 }
12324             }
12325             else if (code_point <= MAX_UTF8_TWO_BYTE) {
12326
12327                 /* Not folding this cp, and can output it directly */
12328                 *character = UTF8_TWO_BYTE_HI(code_point);
12329                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
12330                 len = 2;
12331             }
12332             else {
12333                 uvchr_to_utf8( character, code_point);
12334                 len = UTF8SKIP(character);
12335             }
12336         } /* Else pattern isn't UTF8.  */
12337         else if (! FOLD) {
12338             *character = (U8) code_point;
12339             len = 1;
12340         } /* Else is folded non-UTF8 */
12341 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
12342    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
12343                                       || UNICODE_DOT_DOT_VERSION > 0)
12344         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
12345 #else
12346         else if (1) {
12347 #endif
12348             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
12349              * comments at join_exact()); */
12350             *character = (U8) code_point;
12351             len = 1;
12352
12353             /* Can turn into an EXACT node if we know the fold at compile time,
12354              * and it folds to itself and doesn't particpate in other folds */
12355             if (downgradable
12356                 && ! LOC
12357                 && PL_fold_latin1[code_point] == code_point
12358                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12359                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
12360             {
12361                 OP(node) = EXACT;
12362             }
12363         } /* else is Sharp s.  May need to fold it */
12364         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
12365             *character = 's';
12366             *(character + 1) = 's';
12367             len = 2;
12368         }
12369         else {
12370             *character = LATIN_SMALL_LETTER_SHARP_S;
12371             len = 1;
12372         }
12373     }
12374
12375     if (SIZE_ONLY) {
12376         RExC_size += STR_SZ(len);
12377     }
12378     else {
12379         RExC_emit += STR_SZ(len);
12380         STR_LEN(node) = len;
12381         if (! len_passed_in) {
12382             Copy((char *) character, STRING(node), len, char);
12383         }
12384     }
12385
12386     *flagp |= HASWIDTH;
12387
12388     /* A single character node is SIMPLE, except for the special-cased SHARP S
12389      * under /di. */
12390     if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
12391 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
12392    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
12393                                       || UNICODE_DOT_DOT_VERSION > 0)
12394         && ( code_point != LATIN_SMALL_LETTER_SHARP_S
12395             || ! FOLD || ! DEPENDS_SEMANTICS)
12396 #endif
12397     ) {
12398         *flagp |= SIMPLE;
12399     }
12400
12401     /* The OP may not be well defined in PASS1 */
12402     if (PASS2 && OP(node) == EXACTFL) {
12403         RExC_contains_locale = 1;
12404     }
12405 }
12406
12407
12408 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
12409  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
12410
12411 static I32
12412 S_backref_value(char *p)
12413 {
12414     const char* endptr;
12415     UV val;
12416     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
12417         return (I32)val;
12418     return I32_MAX;
12419 }
12420
12421
12422 /*
12423  - regatom - the lowest level
12424
12425    Try to identify anything special at the start of the current parse position.
12426    If there is, then handle it as required. This may involve generating a
12427    single regop, such as for an assertion; or it may involve recursing, such as
12428    to handle a () structure.
12429
12430    If the string doesn't start with something special then we gobble up
12431    as much literal text as we can.  If we encounter a quantifier, we have to
12432    back off the final literal character, as that quantifier applies to just it
12433    and not to the whole string of literals.
12434
12435    Once we have been able to handle whatever type of thing started the
12436    sequence, we return.
12437
12438    Note: we have to be careful with escapes, as they can be both literal
12439    and special, and in the case of \10 and friends, context determines which.
12440
12441    A summary of the code structure is:
12442
12443    switch (first_byte) {
12444         cases for each special:
12445             handle this special;
12446             break;
12447         case '\\':
12448             switch (2nd byte) {
12449                 cases for each unambiguous special:
12450                     handle this special;
12451                     break;
12452                 cases for each ambigous special/literal:
12453                     disambiguate;
12454                     if (special)  handle here
12455                     else goto defchar;
12456                 default: // unambiguously literal:
12457                     goto defchar;
12458             }
12459         default:  // is a literal char
12460             // FALL THROUGH
12461         defchar:
12462             create EXACTish node for literal;
12463             while (more input and node isn't full) {
12464                 switch (input_byte) {
12465                    cases for each special;
12466                        make sure parse pointer is set so that the next call to
12467                            regatom will see this special first
12468                        goto loopdone; // EXACTish node terminated by prev. char
12469                    default:
12470                        append char to EXACTISH node;
12471                 }
12472                 get next input byte;
12473             }
12474         loopdone:
12475    }
12476    return the generated node;
12477
12478    Specifically there are two separate switches for handling
12479    escape sequences, with the one for handling literal escapes requiring
12480    a dummy entry for all of the special escapes that are actually handled
12481    by the other.
12482
12483    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
12484    TRYAGAIN.
12485    Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
12486    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
12487    Otherwise does not return NULL.
12488 */
12489
12490 STATIC regnode *
12491 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12492 {
12493     regnode *ret = NULL;
12494     I32 flags = 0;
12495     char *parse_start;
12496     U8 op;
12497     int invert = 0;
12498     U8 arg;
12499
12500     GET_RE_DEBUG_FLAGS_DECL;
12501
12502     *flagp = WORST;             /* Tentatively. */
12503
12504     DEBUG_PARSE("atom");
12505
12506     PERL_ARGS_ASSERT_REGATOM;
12507
12508   tryagain:
12509     parse_start = RExC_parse;
12510     assert(RExC_parse < RExC_end);
12511     switch ((U8)*RExC_parse) {
12512     case '^':
12513         RExC_seen_zerolen++;
12514         nextchar(pRExC_state);
12515         if (RExC_flags & RXf_PMf_MULTILINE)
12516             ret = reg_node(pRExC_state, MBOL);
12517         else
12518             ret = reg_node(pRExC_state, SBOL);
12519         Set_Node_Length(ret, 1); /* MJD */
12520         break;
12521     case '$':
12522         nextchar(pRExC_state);
12523         if (*RExC_parse)
12524             RExC_seen_zerolen++;
12525         if (RExC_flags & RXf_PMf_MULTILINE)
12526             ret = reg_node(pRExC_state, MEOL);
12527         else
12528             ret = reg_node(pRExC_state, SEOL);
12529         Set_Node_Length(ret, 1); /* MJD */
12530         break;
12531     case '.':
12532         nextchar(pRExC_state);
12533         if (RExC_flags & RXf_PMf_SINGLELINE)
12534             ret = reg_node(pRExC_state, SANY);
12535         else
12536             ret = reg_node(pRExC_state, REG_ANY);
12537         *flagp |= HASWIDTH|SIMPLE;
12538         MARK_NAUGHTY(1);
12539         Set_Node_Length(ret, 1); /* MJD */
12540         break;
12541     case '[':
12542     {
12543         char * const oregcomp_parse = ++RExC_parse;
12544         ret = regclass(pRExC_state, flagp,depth+1,
12545                        FALSE, /* means parse the whole char class */
12546                        TRUE, /* allow multi-char folds */
12547                        FALSE, /* don't silence non-portable warnings. */
12548                        (bool) RExC_strict,
12549                        TRUE, /* Allow an optimized regnode result */
12550                        NULL,
12551                        NULL);
12552         if (ret == NULL) {
12553             if (*flagp & (RESTART_PASS1|NEED_UTF8))
12554                 return NULL;
12555             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12556                   (UV) *flagp);
12557         }
12558         if (*RExC_parse != ']') {
12559             RExC_parse = oregcomp_parse;
12560             vFAIL("Unmatched [");
12561         }
12562         nextchar(pRExC_state);
12563         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
12564         break;
12565     }
12566     case '(':
12567         nextchar(pRExC_state);
12568         ret = reg(pRExC_state, 2, &flags,depth+1);
12569         if (ret == NULL) {
12570                 if (flags & TRYAGAIN) {
12571                     if (RExC_parse >= RExC_end) {
12572                          /* Make parent create an empty node if needed. */
12573                         *flagp |= TRYAGAIN;
12574                         return(NULL);
12575                     }
12576                     goto tryagain;
12577                 }
12578                 if (flags & (RESTART_PASS1|NEED_UTF8)) {
12579                     *flagp = flags & (RESTART_PASS1|NEED_UTF8);
12580                     return NULL;
12581                 }
12582                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
12583                                                                  (UV) flags);
12584         }
12585         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12586         break;
12587     case '|':
12588     case ')':
12589         if (flags & TRYAGAIN) {
12590             *flagp |= TRYAGAIN;
12591             return NULL;
12592         }
12593         vFAIL("Internal urp");
12594                                 /* Supposed to be caught earlier. */
12595         break;
12596     case '?':
12597     case '+':
12598     case '*':
12599         RExC_parse++;
12600         vFAIL("Quantifier follows nothing");
12601         break;
12602     case '\\':
12603         /* Special Escapes
12604
12605            This switch handles escape sequences that resolve to some kind
12606            of special regop and not to literal text. Escape sequnces that
12607            resolve to literal text are handled below in the switch marked
12608            "Literal Escapes".
12609
12610            Every entry in this switch *must* have a corresponding entry
12611            in the literal escape switch. However, the opposite is not
12612            required, as the default for this switch is to jump to the
12613            literal text handling code.
12614         */
12615         RExC_parse++;
12616         switch ((U8)*RExC_parse) {
12617         /* Special Escapes */
12618         case 'A':
12619             RExC_seen_zerolen++;
12620             ret = reg_node(pRExC_state, SBOL);
12621             /* SBOL is shared with /^/ so we set the flags so we can tell
12622              * /\A/ from /^/ in split. We check ret because first pass we
12623              * have no regop struct to set the flags on. */
12624             if (PASS2)
12625                 ret->flags = 1;
12626             *flagp |= SIMPLE;
12627             goto finish_meta_pat;
12628         case 'G':
12629             ret = reg_node(pRExC_state, GPOS);
12630             RExC_seen |= REG_GPOS_SEEN;
12631             *flagp |= SIMPLE;
12632             goto finish_meta_pat;
12633         case 'K':
12634             RExC_seen_zerolen++;
12635             ret = reg_node(pRExC_state, KEEPS);
12636             *flagp |= SIMPLE;
12637             /* XXX:dmq : disabling in-place substitution seems to
12638              * be necessary here to avoid cases of memory corruption, as
12639              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
12640              */
12641             RExC_seen |= REG_LOOKBEHIND_SEEN;
12642             goto finish_meta_pat;
12643         case 'Z':
12644             ret = reg_node(pRExC_state, SEOL);
12645             *flagp |= SIMPLE;
12646             RExC_seen_zerolen++;                /* Do not optimize RE away */
12647             goto finish_meta_pat;
12648         case 'z':
12649             ret = reg_node(pRExC_state, EOS);
12650             *flagp |= SIMPLE;
12651             RExC_seen_zerolen++;                /* Do not optimize RE away */
12652             goto finish_meta_pat;
12653         case 'C':
12654             vFAIL("\\C no longer supported");
12655         case 'X':
12656             ret = reg_node(pRExC_state, CLUMP);
12657             *flagp |= HASWIDTH;
12658             goto finish_meta_pat;
12659
12660         case 'W':
12661             invert = 1;
12662             /* FALLTHROUGH */
12663         case 'w':
12664             arg = ANYOF_WORDCHAR;
12665             goto join_posix;
12666
12667         case 'B':
12668             invert = 1;
12669             /* FALLTHROUGH */
12670         case 'b':
12671           {
12672             regex_charset charset = get_regex_charset(RExC_flags);
12673
12674             RExC_seen_zerolen++;
12675             RExC_seen |= REG_LOOKBEHIND_SEEN;
12676             op = BOUND + charset;
12677
12678             if (op == BOUNDL) {
12679                 RExC_contains_locale = 1;
12680             }
12681
12682             ret = reg_node(pRExC_state, op);
12683             *flagp |= SIMPLE;
12684             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
12685                 FLAGS(ret) = TRADITIONAL_BOUND;
12686                 if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
12687                     OP(ret) = BOUNDA;
12688                 }
12689             }
12690             else {
12691                 STRLEN length;
12692                 char name = *RExC_parse;
12693                 char * endbrace;
12694                 RExC_parse += 2;
12695                 endbrace = strchr(RExC_parse, '}');
12696
12697                 if (! endbrace) {
12698                     vFAIL2("Missing right brace on \\%c{}", name);
12699                 }
12700                 /* XXX Need to decide whether to take spaces or not.  Should be
12701                  * consistent with \p{}, but that currently is SPACE, which
12702                  * means vertical too, which seems wrong
12703                  * while (isBLANK(*RExC_parse)) {
12704                     RExC_parse++;
12705                 }*/
12706                 if (endbrace == RExC_parse) {
12707                     RExC_parse++;  /* After the '}' */
12708                     vFAIL2("Empty \\%c{}", name);
12709                 }
12710                 length = endbrace - RExC_parse;
12711                 /*while (isBLANK(*(RExC_parse + length - 1))) {
12712                     length--;
12713                 }*/
12714                 switch (*RExC_parse) {
12715                     case 'g':
12716                         if (length != 1
12717                             && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
12718                         {
12719                             goto bad_bound_type;
12720                         }
12721                         FLAGS(ret) = GCB_BOUND;
12722                         break;
12723                     case 'l':
12724                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12725                             goto bad_bound_type;
12726                         }
12727                         FLAGS(ret) = LB_BOUND;
12728                         break;
12729                     case 's':
12730                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12731                             goto bad_bound_type;
12732                         }
12733                         FLAGS(ret) = SB_BOUND;
12734                         break;
12735                     case 'w':
12736                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12737                             goto bad_bound_type;
12738                         }
12739                         FLAGS(ret) = WB_BOUND;
12740                         break;
12741                     default:
12742                       bad_bound_type:
12743                         RExC_parse = endbrace;
12744                         vFAIL2utf8f(
12745                             "'%"UTF8f"' is an unknown bound type",
12746                             UTF8fARG(UTF, length, endbrace - length));
12747                         NOT_REACHED; /*NOTREACHED*/
12748                 }
12749                 RExC_parse = endbrace;
12750                 REQUIRE_UNI_RULES(flagp, NULL);
12751
12752                 if (PASS2 && op >= BOUNDA) {  /* /aa is same as /a */
12753                     OP(ret) = BOUNDU;
12754                     length += 4;
12755
12756                     /* Don't have to worry about UTF-8, in this message because
12757                      * to get here the contents of the \b must be ASCII */
12758                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
12759                               "Using /u for '%.*s' instead of /%s",
12760                               (unsigned) length,
12761                               endbrace - length + 1,
12762                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
12763                               ? ASCII_RESTRICT_PAT_MODS
12764                               : ASCII_MORE_RESTRICT_PAT_MODS);
12765                 }
12766             }
12767
12768             if (PASS2 && invert) {
12769                 OP(ret) += NBOUND - BOUND;
12770             }
12771             goto finish_meta_pat;
12772           }
12773
12774         case 'D':
12775             invert = 1;
12776             /* FALLTHROUGH */
12777         case 'd':
12778             arg = ANYOF_DIGIT;
12779             if (! DEPENDS_SEMANTICS) {
12780                 goto join_posix;
12781             }
12782
12783             /* \d doesn't have any matches in the upper Latin1 range, hence /d
12784              * is equivalent to /u.  Changing to /u saves some branches at
12785              * runtime */
12786             op = POSIXU;
12787             goto join_posix_op_known;
12788
12789         case 'R':
12790             ret = reg_node(pRExC_state, LNBREAK);
12791             *flagp |= HASWIDTH|SIMPLE;
12792             goto finish_meta_pat;
12793
12794         case 'H':
12795             invert = 1;
12796             /* FALLTHROUGH */
12797         case 'h':
12798             arg = ANYOF_BLANK;
12799             op = POSIXU;
12800             goto join_posix_op_known;
12801
12802         case 'V':
12803             invert = 1;
12804             /* FALLTHROUGH */
12805         case 'v':
12806             arg = ANYOF_VERTWS;
12807             op = POSIXU;
12808             goto join_posix_op_known;
12809
12810         case 'S':
12811             invert = 1;
12812             /* FALLTHROUGH */
12813         case 's':
12814             arg = ANYOF_SPACE;
12815
12816           join_posix:
12817
12818             op = POSIXD + get_regex_charset(RExC_flags);
12819             if (op > POSIXA) {  /* /aa is same as /a */
12820                 op = POSIXA;
12821             }
12822             else if (op == POSIXL) {
12823                 RExC_contains_locale = 1;
12824             }
12825
12826           join_posix_op_known:
12827
12828             if (invert) {
12829                 op += NPOSIXD - POSIXD;
12830             }
12831
12832             ret = reg_node(pRExC_state, op);
12833             if (! SIZE_ONLY) {
12834                 FLAGS(ret) = namedclass_to_classnum(arg);
12835             }
12836
12837             *flagp |= HASWIDTH|SIMPLE;
12838             /* FALLTHROUGH */
12839
12840           finish_meta_pat:
12841             nextchar(pRExC_state);
12842             Set_Node_Length(ret, 2); /* MJD */
12843             break;
12844         case 'p':
12845         case 'P':
12846             RExC_parse--;
12847
12848             ret = regclass(pRExC_state, flagp,depth+1,
12849                            TRUE, /* means just parse this element */
12850                            FALSE, /* don't allow multi-char folds */
12851                            FALSE, /* don't silence non-portable warnings.  It
12852                                      would be a bug if these returned
12853                                      non-portables */
12854                            (bool) RExC_strict,
12855                            TRUE, /* Allow an optimized regnode result */
12856                            NULL,
12857                            NULL);
12858             if (*flagp & RESTART_PASS1)
12859                 return NULL;
12860             /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
12861              * multi-char folds are allowed.  */
12862             if (!ret)
12863                 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12864                       (UV) *flagp);
12865
12866             RExC_parse--;
12867
12868             Set_Node_Offset(ret, parse_start);
12869             Set_Node_Cur_Length(ret, parse_start - 2);
12870             nextchar(pRExC_state);
12871             break;
12872         case 'N':
12873             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12874              * \N{...} evaluates to a sequence of more than one code points).
12875              * The function call below returns a regnode, which is our result.
12876              * The parameters cause it to fail if the \N{} evaluates to a
12877              * single code point; we handle those like any other literal.  The
12878              * reason that the multicharacter case is handled here and not as
12879              * part of the EXACtish code is because of quantifiers.  In
12880              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12881              * this way makes that Just Happen. dmq.
12882              * join_exact() will join this up with adjacent EXACTish nodes
12883              * later on, if appropriate. */
12884             ++RExC_parse;
12885             if (grok_bslash_N(pRExC_state,
12886                               &ret,     /* Want a regnode returned */
12887                               NULL,     /* Fail if evaluates to a single code
12888                                            point */
12889                               NULL,     /* Don't need a count of how many code
12890                                            points */
12891                               flagp,
12892                               RExC_strict,
12893                               depth)
12894             ) {
12895                 break;
12896             }
12897
12898             if (*flagp & RESTART_PASS1)
12899                 return NULL;
12900
12901             /* Here, evaluates to a single code point.  Go get that */
12902             RExC_parse = parse_start;
12903             goto defchar;
12904
12905         case 'k':    /* Handle \k<NAME> and \k'NAME' */
12906       parse_named_seq:
12907         {
12908             char ch;
12909             if (   RExC_parse >= RExC_end - 1
12910                 || ((   ch = RExC_parse[1]) != '<'
12911                                       && ch != '\''
12912                                       && ch != '{'))
12913             {
12914                 RExC_parse++;
12915                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12916                 vFAIL2("Sequence %.2s... not terminated",parse_start);
12917             } else {
12918                 RExC_parse += 2;
12919                 ret = handle_named_backref(pRExC_state,
12920                                            flagp,
12921                                            parse_start,
12922                                            (ch == '<')
12923                                            ? '>'
12924                                            : (ch == '{')
12925                                              ? '}'
12926                                              : '\'');
12927             }
12928             break;
12929         }
12930         case 'g':
12931         case '1': case '2': case '3': case '4':
12932         case '5': case '6': case '7': case '8': case '9':
12933             {
12934                 I32 num;
12935                 bool hasbrace = 0;
12936
12937                 if (*RExC_parse == 'g') {
12938                     bool isrel = 0;
12939
12940                     RExC_parse++;
12941                     if (*RExC_parse == '{') {
12942                         RExC_parse++;
12943                         hasbrace = 1;
12944                     }
12945                     if (*RExC_parse == '-') {
12946                         RExC_parse++;
12947                         isrel = 1;
12948                     }
12949                     if (hasbrace && !isDIGIT(*RExC_parse)) {
12950                         if (isrel) RExC_parse--;
12951                         RExC_parse -= 2;
12952                         goto parse_named_seq;
12953                     }
12954
12955                     if (RExC_parse >= RExC_end) {
12956                         goto unterminated_g;
12957                     }
12958                     num = S_backref_value(RExC_parse);
12959                     if (num == 0)
12960                         vFAIL("Reference to invalid group 0");
12961                     else if (num == I32_MAX) {
12962                          if (isDIGIT(*RExC_parse))
12963                             vFAIL("Reference to nonexistent group");
12964                         else
12965                           unterminated_g:
12966                             vFAIL("Unterminated \\g... pattern");
12967                     }
12968
12969                     if (isrel) {
12970                         num = RExC_npar - num;
12971                         if (num < 1)
12972                             vFAIL("Reference to nonexistent or unclosed group");
12973                     }
12974                 }
12975                 else {
12976                     num = S_backref_value(RExC_parse);
12977                     /* bare \NNN might be backref or octal - if it is larger
12978                      * than or equal RExC_npar then it is assumed to be an
12979                      * octal escape. Note RExC_npar is +1 from the actual
12980                      * number of parens. */
12981                     /* Note we do NOT check if num == I32_MAX here, as that is
12982                      * handled by the RExC_npar check */
12983
12984                     if (
12985                         /* any numeric escape < 10 is always a backref */
12986                         num > 9
12987                         /* any numeric escape < RExC_npar is a backref */
12988                         && num >= RExC_npar
12989                         /* cannot be an octal escape if it starts with 8 */
12990                         && *RExC_parse != '8'
12991                         /* cannot be an octal escape it it starts with 9 */
12992                         && *RExC_parse != '9'
12993                     )
12994                     {
12995                         /* Probably not a backref, instead likely to be an
12996                          * octal character escape, e.g. \35 or \777.
12997                          * The above logic should make it obvious why using
12998                          * octal escapes in patterns is problematic. - Yves */
12999                         RExC_parse = parse_start;
13000                         goto defchar;
13001                     }
13002                 }
13003
13004                 /* At this point RExC_parse points at a numeric escape like
13005                  * \12 or \88 or something similar, which we should NOT treat
13006                  * as an octal escape. It may or may not be a valid backref
13007                  * escape. For instance \88888888 is unlikely to be a valid
13008                  * backref. */
13009                 while (isDIGIT(*RExC_parse))
13010                     RExC_parse++;
13011                 if (hasbrace) {
13012                     if (*RExC_parse != '}')
13013                         vFAIL("Unterminated \\g{...} pattern");
13014                     RExC_parse++;
13015                 }
13016                 if (!SIZE_ONLY) {
13017                     if (num > (I32)RExC_rx->nparens)
13018                         vFAIL("Reference to nonexistent group");
13019                 }
13020                 RExC_sawback = 1;
13021                 ret = reganode(pRExC_state,
13022                                ((! FOLD)
13023                                  ? REF
13024                                  : (ASCII_FOLD_RESTRICTED)
13025                                    ? REFFA
13026                                    : (AT_LEAST_UNI_SEMANTICS)
13027                                      ? REFFU
13028                                      : (LOC)
13029                                        ? REFFL
13030                                        : REFF),
13031                                 num);
13032                 *flagp |= HASWIDTH;
13033
13034                 /* override incorrect value set in reganode MJD */
13035                 Set_Node_Offset(ret, parse_start);
13036                 Set_Node_Cur_Length(ret, parse_start-1);
13037                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13038                                         FALSE /* Don't force to /x */ );
13039             }
13040             break;
13041         case '\0':
13042             if (RExC_parse >= RExC_end)
13043                 FAIL("Trailing \\");
13044             /* FALLTHROUGH */
13045         default:
13046             /* Do not generate "unrecognized" warnings here, we fall
13047                back into the quick-grab loop below */
13048             RExC_parse = parse_start;
13049             goto defchar;
13050         } /* end of switch on a \foo sequence */
13051         break;
13052
13053     case '#':
13054
13055         /* '#' comments should have been spaced over before this function was
13056          * called */
13057         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13058         /*
13059         if (RExC_flags & RXf_PMf_EXTENDED) {
13060             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13061             if (RExC_parse < RExC_end)
13062                 goto tryagain;
13063         }
13064         */
13065
13066         /* FALLTHROUGH */
13067
13068     default:
13069           defchar: {
13070
13071             /* Here, we have determined that the next thing is probably a
13072              * literal character.  RExC_parse points to the first byte of its
13073              * definition.  (It still may be an escape sequence that evaluates
13074              * to a single character) */
13075
13076             STRLEN len = 0;
13077             UV ender = 0;
13078             char *p;
13079             char *s;
13080 #define MAX_NODE_STRING_SIZE 127
13081             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
13082             char *s0;
13083             U8 upper_parse = MAX_NODE_STRING_SIZE;
13084             U8 node_type = compute_EXACTish(pRExC_state);
13085             bool next_is_quantifier;
13086             char * oldp = NULL;
13087
13088             /* We can convert EXACTF nodes to EXACTFU if they contain only
13089              * characters that match identically regardless of the target
13090              * string's UTF8ness.  The reason to do this is that EXACTF is not
13091              * trie-able, EXACTFU is.
13092              *
13093              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13094              * contain only above-Latin1 characters (hence must be in UTF8),
13095              * which don't participate in folds with Latin1-range characters,
13096              * as the latter's folds aren't known until runtime.  (We don't
13097              * need to figure this out until pass 2) */
13098             bool maybe_exactfu = PASS2
13099                                && (node_type == EXACTF || node_type == EXACTFL);
13100
13101             /* If a folding node contains only code points that don't
13102              * participate in folds, it can be changed into an EXACT node,
13103              * which allows the optimizer more things to look for */
13104             bool maybe_exact;
13105
13106             ret = reg_node(pRExC_state, node_type);
13107
13108             /* In pass1, folded, we use a temporary buffer instead of the
13109              * actual node, as the node doesn't exist yet */
13110             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
13111
13112             s0 = s;
13113
13114           reparse:
13115
13116             /* We look for the EXACTFish to EXACT node optimizaton only if
13117              * folding.  (And we don't need to figure this out until pass 2).
13118              * XXX It might actually make sense to split the node into portions
13119              * that are exact and ones that aren't, so that we could later use
13120              * the exact ones to find the longest fixed and floating strings.
13121              * One would want to join them back into a larger node.  One could
13122              * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
13123             maybe_exact = FOLD && PASS2;
13124
13125             /* XXX The node can hold up to 255 bytes, yet this only goes to
13126              * 127.  I (khw) do not know why.  Keeping it somewhat less than
13127              * 255 allows us to not have to worry about overflow due to
13128              * converting to utf8 and fold expansion, but that value is
13129              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
13130              * split up by this limit into a single one using the real max of
13131              * 255.  Even at 127, this breaks under rare circumstances.  If
13132              * folding, we do not want to split a node at a character that is a
13133              * non-final in a multi-char fold, as an input string could just
13134              * happen to want to match across the node boundary.  The join
13135              * would solve that problem if the join actually happens.  But a
13136              * series of more than two nodes in a row each of 127 would cause
13137              * the first join to succeed to get to 254, but then there wouldn't
13138              * be room for the next one, which could at be one of those split
13139              * multi-char folds.  I don't know of any fool-proof solution.  One
13140              * could back off to end with only a code point that isn't such a
13141              * non-final, but it is possible for there not to be any in the
13142              * entire node. */
13143
13144             assert(   ! UTF     /* Is at the beginning of a character */
13145                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13146                    || UTF8_IS_START(UCHARAT(RExC_parse)));
13147
13148             /* Here, we have a literal character.  Find the maximal string of
13149              * them in the input that we can fit into a single EXACTish node.
13150              * We quit at the first non-literal or when the node gets full */
13151             for (p = RExC_parse;
13152                  len < upper_parse && p < RExC_end;
13153                  len++)
13154             {
13155                 oldp = p;
13156
13157                 /* White space has already been ignored */
13158                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
13159                        || ! is_PATWS_safe((p), RExC_end, UTF));
13160
13161                 switch ((U8)*p) {
13162                 case '^':
13163                 case '$':
13164                 case '.':
13165                 case '[':
13166                 case '(':
13167                 case ')':
13168                 case '|':
13169                     goto loopdone;
13170                 case '\\':
13171                     /* Literal Escapes Switch
13172
13173                        This switch is meant to handle escape sequences that
13174                        resolve to a literal character.
13175
13176                        Every escape sequence that represents something
13177                        else, like an assertion or a char class, is handled
13178                        in the switch marked 'Special Escapes' above in this
13179                        routine, but also has an entry here as anything that
13180                        isn't explicitly mentioned here will be treated as
13181                        an unescaped equivalent literal.
13182                     */
13183
13184                     switch ((U8)*++p) {
13185                     /* These are all the special escapes. */
13186                     case 'A':             /* Start assertion */
13187                     case 'b': case 'B':   /* Word-boundary assertion*/
13188                     case 'C':             /* Single char !DANGEROUS! */
13189                     case 'd': case 'D':   /* digit class */
13190                     case 'g': case 'G':   /* generic-backref, pos assertion */
13191                     case 'h': case 'H':   /* HORIZWS */
13192                     case 'k': case 'K':   /* named backref, keep marker */
13193                     case 'p': case 'P':   /* Unicode property */
13194                               case 'R':   /* LNBREAK */
13195                     case 's': case 'S':   /* space class */
13196                     case 'v': case 'V':   /* VERTWS */
13197                     case 'w': case 'W':   /* word class */
13198                     case 'X':             /* eXtended Unicode "combining
13199                                              character sequence" */
13200                     case 'z': case 'Z':   /* End of line/string assertion */
13201                         --p;
13202                         goto loopdone;
13203
13204                     /* Anything after here is an escape that resolves to a
13205                        literal. (Except digits, which may or may not)
13206                      */
13207                     case 'n':
13208                         ender = '\n';
13209                         p++;
13210                         break;
13211                     case 'N': /* Handle a single-code point named character. */
13212                         RExC_parse = p + 1;
13213                         if (! grok_bslash_N(pRExC_state,
13214                                             NULL,   /* Fail if evaluates to
13215                                                        anything other than a
13216                                                        single code point */
13217                                             &ender, /* The returned single code
13218                                                        point */
13219                                             NULL,   /* Don't need a count of
13220                                                        how many code points */
13221                                             flagp,
13222                                             RExC_strict,
13223                                             depth)
13224                         ) {
13225                             if (*flagp & NEED_UTF8)
13226                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
13227                             if (*flagp & RESTART_PASS1)
13228                                 return NULL;
13229
13230                             /* Here, it wasn't a single code point.  Go close
13231                              * up this EXACTish node.  The switch() prior to
13232                              * this switch handles the other cases */
13233                             RExC_parse = p = oldp;
13234                             goto loopdone;
13235                         }
13236                         p = RExC_parse;
13237                         if (ender > 0xff) {
13238                             REQUIRE_UTF8(flagp);
13239                         }
13240                         break;
13241                     case 'r':
13242                         ender = '\r';
13243                         p++;
13244                         break;
13245                     case 't':
13246                         ender = '\t';
13247                         p++;
13248                         break;
13249                     case 'f':
13250                         ender = '\f';
13251                         p++;
13252                         break;
13253                     case 'e':
13254                         ender = ESC_NATIVE;
13255                         p++;
13256                         break;
13257                     case 'a':
13258                         ender = '\a';
13259                         p++;
13260                         break;
13261                     case 'o':
13262                         {
13263                             UV result;
13264                             const char* error_msg;
13265
13266                             bool valid = grok_bslash_o(&p,
13267                                                        &result,
13268                                                        &error_msg,
13269                                                        PASS2, /* out warnings */
13270                                                        (bool) RExC_strict,
13271                                                        TRUE, /* Output warnings
13272                                                                 for non-
13273                                                                 portables */
13274                                                        UTF);
13275                             if (! valid) {
13276                                 RExC_parse = p; /* going to die anyway; point
13277                                                    to exact spot of failure */
13278                                 vFAIL(error_msg);
13279                             }
13280                             ender = result;
13281                             if (ender > 0xff) {
13282                                 REQUIRE_UTF8(flagp);
13283                             }
13284                             break;
13285                         }
13286                     case 'x':
13287                         {
13288                             UV result = UV_MAX; /* initialize to erroneous
13289                                                    value */
13290                             const char* error_msg;
13291
13292                             bool valid = grok_bslash_x(&p,
13293                                                        &result,
13294                                                        &error_msg,
13295                                                        PASS2, /* out warnings */
13296                                                        (bool) RExC_strict,
13297                                                        TRUE, /* Silence warnings
13298                                                                 for non-
13299                                                                 portables */
13300                                                        UTF);
13301                             if (! valid) {
13302                                 RExC_parse = p; /* going to die anyway; point
13303                                                    to exact spot of failure */
13304                                 vFAIL(error_msg);
13305                             }
13306                             ender = result;
13307
13308                             if (ender < 0x100) {
13309 #ifdef EBCDIC
13310                                 if (RExC_recode_x_to_native) {
13311                                     ender = LATIN1_TO_NATIVE(ender);
13312                                 }
13313 #endif
13314                             }
13315                             else {
13316                                 REQUIRE_UTF8(flagp);
13317                             }
13318                             break;
13319                         }
13320                     case 'c':
13321                         p++;
13322                         ender = grok_bslash_c(*p++, PASS2);
13323                         break;
13324                     case '8': case '9': /* must be a backreference */
13325                         --p;
13326                         /* we have an escape like \8 which cannot be an octal escape
13327                          * so we exit the loop, and let the outer loop handle this
13328                          * escape which may or may not be a legitimate backref. */
13329                         goto loopdone;
13330                     case '1': case '2': case '3':case '4':
13331                     case '5': case '6': case '7':
13332                         /* When we parse backslash escapes there is ambiguity
13333                          * between backreferences and octal escapes. Any escape
13334                          * from \1 - \9 is a backreference, any multi-digit
13335                          * escape which does not start with 0 and which when
13336                          * evaluated as decimal could refer to an already
13337                          * parsed capture buffer is a back reference. Anything
13338                          * else is octal.
13339                          *
13340                          * Note this implies that \118 could be interpreted as
13341                          * 118 OR as "\11" . "8" depending on whether there
13342                          * were 118 capture buffers defined already in the
13343                          * pattern.  */
13344
13345                         /* NOTE, RExC_npar is 1 more than the actual number of
13346                          * parens we have seen so far, hence the < RExC_npar below. */
13347
13348                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
13349                         {  /* Not to be treated as an octal constant, go
13350                                    find backref */
13351                             --p;
13352                             goto loopdone;
13353                         }
13354                         /* FALLTHROUGH */
13355                     case '0':
13356                         {
13357                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13358                             STRLEN numlen = 3;
13359                             ender = grok_oct(p, &numlen, &flags, NULL);
13360                             if (ender > 0xff) {
13361                                 REQUIRE_UTF8(flagp);
13362                             }
13363                             p += numlen;
13364                             if (PASS2   /* like \08, \178 */
13365                                 && numlen < 3
13366                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
13367                             {
13368                                 reg_warn_non_literal_string(
13369                                          p + 1,
13370                                          form_short_octal_warning(p, numlen));
13371                             }
13372                         }
13373                         break;
13374                     case '\0':
13375                         if (p >= RExC_end)
13376                             FAIL("Trailing \\");
13377                         /* FALLTHROUGH */
13378                     default:
13379                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
13380                             /* Include any left brace following the alpha to emphasize
13381                              * that it could be part of an escape at some point
13382                              * in the future */
13383                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
13384                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
13385                         }
13386                         goto normal_default;
13387                     } /* End of switch on '\' */
13388                     break;
13389                 case '{':
13390                     /* Currently we don't care if the lbrace is at the start
13391                      * of a construct.  This catches it in the middle of a
13392                      * literal string, or when it's the first thing after
13393                      * something like "\b" */
13394                     if (len || (p > RExC_start && isALPHA_A(*(p -1)))) {
13395                         RExC_parse = p + 1;
13396                         vFAIL("Unescaped left brace in regex is illegal here");
13397                     }
13398                     /*FALLTHROUGH*/
13399                 default:    /* A literal character */
13400                   normal_default:
13401                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
13402                         STRLEN numlen;
13403                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
13404                                                &numlen, UTF8_ALLOW_DEFAULT);
13405                         p += numlen;
13406                     }
13407                     else
13408                         ender = (U8) *p++;
13409                     break;
13410                 } /* End of switch on the literal */
13411
13412                 /* Here, have looked at the literal character and <ender>
13413                  * contains its ordinal, <p> points to the character after it.
13414                  * We need to check if the next non-ignored thing is a
13415                  * quantifier.  Move <p> to after anything that should be
13416                  * ignored, which, as a side effect, positions <p> for the next
13417                  * loop iteration */
13418                 skip_to_be_ignored_text(pRExC_state, &p,
13419                                         FALSE /* Don't force to /x */ );
13420
13421                 /* If the next thing is a quantifier, it applies to this
13422                  * character only, which means that this character has to be in
13423                  * its own node and can't just be appended to the string in an
13424                  * existing node, so if there are already other characters in
13425                  * the node, close the node with just them, and set up to do
13426                  * this character again next time through, when it will be the
13427                  * only thing in its new node */
13428
13429                 if ((next_is_quantifier = (   LIKELY(p < RExC_end)
13430                                            && UNLIKELY(ISMULT2(p))))
13431                     && LIKELY(len))
13432                 {
13433                     p = oldp;
13434                     goto loopdone;
13435                 }
13436
13437                 /* Ready to add 'ender' to the node */
13438
13439                 if (! FOLD) {  /* The simple case, just append the literal */
13440
13441                     /* In the sizing pass, we need only the size of the
13442                      * character we are appending, hence we can delay getting
13443                      * its representation until PASS2. */
13444                     if (SIZE_ONLY) {
13445                         if (UTF) {
13446                             const STRLEN unilen = UVCHR_SKIP(ender);
13447                             s += unilen;
13448
13449                             /* We have to subtract 1 just below (and again in
13450                              * the corresponding PASS2 code) because the loop
13451                              * increments <len> each time, as all but this path
13452                              * (and one other) through it add a single byte to
13453                              * the EXACTish node.  But these paths would change
13454                              * len to be the correct final value, so cancel out
13455                              * the increment that follows */
13456                             len += unilen - 1;
13457                         }
13458                         else {
13459                             s++;
13460                         }
13461                     } else { /* PASS2 */
13462                       not_fold_common:
13463                         if (UTF) {
13464                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
13465                             len += (char *) new_s - s - 1;
13466                             s = (char *) new_s;
13467                         }
13468                         else {
13469                             *(s++) = (char) ender;
13470                         }
13471                     }
13472                 }
13473                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
13474
13475                     /* Here are folding under /l, and the code point is
13476                      * problematic.  First, we know we can't simplify things */
13477                     maybe_exact = FALSE;
13478                     maybe_exactfu = FALSE;
13479
13480                     /* A problematic code point in this context means that its
13481                      * fold isn't known until runtime, so we can't fold it now.
13482                      * (The non-problematic code points are the above-Latin1
13483                      * ones that fold to also all above-Latin1.  Their folds
13484                      * don't vary no matter what the locale is.) But here we
13485                      * have characters whose fold depends on the locale.
13486                      * Unlike the non-folding case above, we have to keep track
13487                      * of these in the sizing pass, so that we can make sure we
13488                      * don't split too-long nodes in the middle of a potential
13489                      * multi-char fold.  And unlike the regular fold case
13490                      * handled in the else clauses below, we don't actually
13491                      * fold and don't have special cases to consider.  What we
13492                      * do for both passes is the PASS2 code for non-folding */
13493                     goto not_fold_common;
13494                 }
13495                 else /* A regular FOLD code point */
13496                     if (! (   UTF
13497 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
13498    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
13499                                       || UNICODE_DOT_DOT_VERSION > 0)
13500                             /* See comments for join_exact() as to why we fold
13501                              * this non-UTF at compile time */
13502                             || (   node_type == EXACTFU
13503                                 && ender == LATIN_SMALL_LETTER_SHARP_S)
13504 #endif
13505                 )) {
13506                     /* Here, are folding and are not UTF-8 encoded; therefore
13507                      * the character must be in the range 0-255, and is not /l
13508                      * (Not /l because we already handled these under /l in
13509                      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
13510                     if (IS_IN_SOME_FOLD_L1(ender)) {
13511                         maybe_exact = FALSE;
13512
13513                         /* See if the character's fold differs between /d and
13514                          * /u.  This includes the multi-char fold SHARP S to
13515                          * 'ss' */
13516                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
13517                             RExC_seen_unfolded_sharp_s = 1;
13518                             maybe_exactfu = FALSE;
13519                         }
13520                         else if (maybe_exactfu
13521                             && (PL_fold[ender] != PL_fold_latin1[ender]
13522 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
13523    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
13524                                       || UNICODE_DOT_DOT_VERSION > 0)
13525                                 || (   len > 0
13526                                     && isALPHA_FOLD_EQ(ender, 's')
13527                                     && isALPHA_FOLD_EQ(*(s-1), 's'))
13528 #endif
13529                         )) {
13530                             maybe_exactfu = FALSE;
13531                         }
13532                     }
13533
13534                     /* Even when folding, we store just the input character, as
13535                      * we have an array that finds its fold quickly */
13536                     *(s++) = (char) ender;
13537                 }
13538                 else {  /* FOLD, and UTF (or sharp s) */
13539                     /* Unlike the non-fold case, we do actually have to
13540                      * calculate the results here in pass 1.  This is for two
13541                      * reasons, the folded length may be longer than the
13542                      * unfolded, and we have to calculate how many EXACTish
13543                      * nodes it will take; and we may run out of room in a node
13544                      * in the middle of a potential multi-char fold, and have
13545                      * to back off accordingly.  */
13546
13547                     UV folded;
13548                     if (isASCII_uni(ender)) {
13549                         folded = toFOLD(ender);
13550                         *(s)++ = (U8) folded;
13551                     }
13552                     else {
13553                         STRLEN foldlen;
13554
13555                         folded = _to_uni_fold_flags(
13556                                      ender,
13557                                      (U8 *) s,
13558                                      &foldlen,
13559                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
13560                                                         ? FOLD_FLAGS_NOMIX_ASCII
13561                                                         : 0));
13562                         s += foldlen;
13563
13564                         /* The loop increments <len> each time, as all but this
13565                          * path (and one other) through it add a single byte to
13566                          * the EXACTish node.  But this one has changed len to
13567                          * be the correct final value, so subtract one to
13568                          * cancel out the increment that follows */
13569                         len += foldlen - 1;
13570                     }
13571                     /* If this node only contains non-folding code points so
13572                      * far, see if this new one is also non-folding */
13573                     if (maybe_exact) {
13574                         if (folded != ender) {
13575                             maybe_exact = FALSE;
13576                         }
13577                         else {
13578                             /* Here the fold is the original; we have to check
13579                              * further to see if anything folds to it */
13580                             if (_invlist_contains_cp(PL_utf8_foldable,
13581                                                         ender))
13582                             {
13583                                 maybe_exact = FALSE;
13584                             }
13585                         }
13586                     }
13587                     ender = folded;
13588                 }
13589
13590                 if (next_is_quantifier) {
13591
13592                     /* Here, the next input is a quantifier, and to get here,
13593                      * the current character is the only one in the node.
13594                      * Also, here <len> doesn't include the final byte for this
13595                      * character */
13596                     len++;
13597                     goto loopdone;
13598                 }
13599
13600             } /* End of loop through literal characters */
13601
13602             /* Here we have either exhausted the input or ran out of room in
13603              * the node.  (If we encountered a character that can't be in the
13604              * node, transfer is made directly to <loopdone>, and so we
13605              * wouldn't have fallen off the end of the loop.)  In the latter
13606              * case, we artificially have to split the node into two, because
13607              * we just don't have enough space to hold everything.  This
13608              * creates a problem if the final character participates in a
13609              * multi-character fold in the non-final position, as a match that
13610              * should have occurred won't, due to the way nodes are matched,
13611              * and our artificial boundary.  So back off until we find a non-
13612              * problematic character -- one that isn't at the beginning or
13613              * middle of such a fold.  (Either it doesn't participate in any
13614              * folds, or appears only in the final position of all the folds it
13615              * does participate in.)  A better solution with far fewer false
13616              * positives, and that would fill the nodes more completely, would
13617              * be to actually have available all the multi-character folds to
13618              * test against, and to back-off only far enough to be sure that
13619              * this node isn't ending with a partial one.  <upper_parse> is set
13620              * further below (if we need to reparse the node) to include just
13621              * up through that final non-problematic character that this code
13622              * identifies, so when it is set to less than the full node, we can
13623              * skip the rest of this */
13624             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
13625
13626                 const STRLEN full_len = len;
13627
13628                 assert(len >= MAX_NODE_STRING_SIZE);
13629
13630                 /* Here, <s> points to the final byte of the final character.
13631                  * Look backwards through the string until find a non-
13632                  * problematic character */
13633
13634                 if (! UTF) {
13635
13636                     /* This has no multi-char folds to non-UTF characters */
13637                     if (ASCII_FOLD_RESTRICTED) {
13638                         goto loopdone;
13639                     }
13640
13641                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
13642                     len = s - s0 + 1;
13643                 }
13644                 else {
13645                     if (!  PL_NonL1NonFinalFold) {
13646                         PL_NonL1NonFinalFold = _new_invlist_C_array(
13647                                         NonL1_Perl_Non_Final_Folds_invlist);
13648                     }
13649
13650                     /* Point to the first byte of the final character */
13651                     s = (char *) utf8_hop((U8 *) s, -1);
13652
13653                     while (s >= s0) {   /* Search backwards until find
13654                                            non-problematic char */
13655                         if (UTF8_IS_INVARIANT(*s)) {
13656
13657                             /* There are no ascii characters that participate
13658                              * in multi-char folds under /aa.  In EBCDIC, the
13659                              * non-ascii invariants are all control characters,
13660                              * so don't ever participate in any folds. */
13661                             if (ASCII_FOLD_RESTRICTED
13662                                 || ! IS_NON_FINAL_FOLD(*s))
13663                             {
13664                                 break;
13665                             }
13666                         }
13667                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
13668                             if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
13669                                                                   *s, *(s+1))))
13670                             {
13671                                 break;
13672                             }
13673                         }
13674                         else if (! _invlist_contains_cp(
13675                                         PL_NonL1NonFinalFold,
13676                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
13677                         {
13678                             break;
13679                         }
13680
13681                         /* Here, the current character is problematic in that
13682                          * it does occur in the non-final position of some
13683                          * fold, so try the character before it, but have to
13684                          * special case the very first byte in the string, so
13685                          * we don't read outside the string */
13686                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
13687                     } /* End of loop backwards through the string */
13688
13689                     /* If there were only problematic characters in the string,
13690                      * <s> will point to before s0, in which case the length
13691                      * should be 0, otherwise include the length of the
13692                      * non-problematic character just found */
13693                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
13694                 }
13695
13696                 /* Here, have found the final character, if any, that is
13697                  * non-problematic as far as ending the node without splitting
13698                  * it across a potential multi-char fold.  <len> contains the
13699                  * number of bytes in the node up-to and including that
13700                  * character, or is 0 if there is no such character, meaning
13701                  * the whole node contains only problematic characters.  In
13702                  * this case, give up and just take the node as-is.  We can't
13703                  * do any better */
13704                 if (len == 0) {
13705                     len = full_len;
13706
13707                     /* If the node ends in an 's' we make sure it stays EXACTF,
13708                      * as if it turns into an EXACTFU, it could later get
13709                      * joined with another 's' that would then wrongly match
13710                      * the sharp s */
13711                     if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
13712                     {
13713                         maybe_exactfu = FALSE;
13714                     }
13715                 } else {
13716
13717                     /* Here, the node does contain some characters that aren't
13718                      * problematic.  If one such is the final character in the
13719                      * node, we are done */
13720                     if (len == full_len) {
13721                         goto loopdone;
13722                     }
13723                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
13724
13725                         /* If the final character is problematic, but the
13726                          * penultimate is not, back-off that last character to
13727                          * later start a new node with it */
13728                         p = oldp;
13729                         goto loopdone;
13730                     }
13731
13732                     /* Here, the final non-problematic character is earlier
13733                      * in the input than the penultimate character.  What we do
13734                      * is reparse from the beginning, going up only as far as
13735                      * this final ok one, thus guaranteeing that the node ends
13736                      * in an acceptable character.  The reason we reparse is
13737                      * that we know how far in the character is, but we don't
13738                      * know how to correlate its position with the input parse.
13739                      * An alternate implementation would be to build that
13740                      * correlation as we go along during the original parse,
13741                      * but that would entail extra work for every node, whereas
13742                      * this code gets executed only when the string is too
13743                      * large for the node, and the final two characters are
13744                      * problematic, an infrequent occurrence.  Yet another
13745                      * possible strategy would be to save the tail of the
13746                      * string, and the next time regatom is called, initialize
13747                      * with that.  The problem with this is that unless you
13748                      * back off one more character, you won't be guaranteed
13749                      * regatom will get called again, unless regbranch,
13750                      * regpiece ... are also changed.  If you do back off that
13751                      * extra character, so that there is input guaranteed to
13752                      * force calling regatom, you can't handle the case where
13753                      * just the first character in the node is acceptable.  I
13754                      * (khw) decided to try this method which doesn't have that
13755                      * pitfall; if performance issues are found, we can do a
13756                      * combination of the current approach plus that one */
13757                     upper_parse = len;
13758                     len = 0;
13759                     s = s0;
13760                     goto reparse;
13761                 }
13762             }   /* End of verifying node ends with an appropriate char */
13763
13764           loopdone:   /* Jumped to when encounters something that shouldn't be
13765                          in the node */
13766
13767             /* I (khw) don't know if you can get here with zero length, but the
13768              * old code handled this situation by creating a zero-length EXACT
13769              * node.  Might as well be NOTHING instead */
13770             if (len == 0) {
13771                 OP(ret) = NOTHING;
13772             }
13773             else {
13774                 if (FOLD) {
13775                     /* If 'maybe_exact' is still set here, means there are no
13776                      * code points in the node that participate in folds;
13777                      * similarly for 'maybe_exactfu' and code points that match
13778                      * differently depending on UTF8ness of the target string
13779                      * (for /u), or depending on locale for /l */
13780                     if (maybe_exact) {
13781                         OP(ret) = (LOC)
13782                                   ? EXACTL
13783                                   : EXACT;
13784                     }
13785                     else if (maybe_exactfu) {
13786                         OP(ret) = (LOC)
13787                                   ? EXACTFLU8
13788                                   : EXACTFU;
13789                     }
13790                 }
13791                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
13792                                            FALSE /* Don't look to see if could
13793                                                     be turned into an EXACT
13794                                                     node, as we have already
13795                                                     computed that */
13796                                           );
13797             }
13798
13799             RExC_parse = p - 1;
13800             Set_Node_Cur_Length(ret, parse_start);
13801             RExC_parse = p;
13802             {
13803                 /* len is STRLEN which is unsigned, need to copy to signed */
13804                 IV iv = len;
13805                 if (iv < 0)
13806                     vFAIL("Internal disaster");
13807             }
13808
13809         } /* End of label 'defchar:' */
13810         break;
13811     } /* End of giant switch on input character */
13812
13813     /* Position parse to next real character */
13814     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13815                                             FALSE /* Don't force to /x */ );
13816     if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) {
13817         ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here, passed through");
13818     }
13819
13820     return(ret);
13821 }
13822
13823
13824 STATIC void
13825 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
13826 {
13827     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
13828      * sets up the bitmap and any flags, removing those code points from the
13829      * inversion list, setting it to NULL should it become completely empty */
13830
13831     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
13832     assert(PL_regkind[OP(node)] == ANYOF);
13833
13834     ANYOF_BITMAP_ZERO(node);
13835     if (*invlist_ptr) {
13836
13837         /* This gets set if we actually need to modify things */
13838         bool change_invlist = FALSE;
13839
13840         UV start, end;
13841
13842         /* Start looking through *invlist_ptr */
13843         invlist_iterinit(*invlist_ptr);
13844         while (invlist_iternext(*invlist_ptr, &start, &end)) {
13845             UV high;
13846             int i;
13847
13848             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13849                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13850             }
13851
13852             /* Quit if are above what we should change */
13853             if (start >= NUM_ANYOF_CODE_POINTS) {
13854                 break;
13855             }
13856
13857             change_invlist = TRUE;
13858
13859             /* Set all the bits in the range, up to the max that we are doing */
13860             high = (end < NUM_ANYOF_CODE_POINTS - 1)
13861                    ? end
13862                    : NUM_ANYOF_CODE_POINTS - 1;
13863             for (i = start; i <= (int) high; i++) {
13864                 if (! ANYOF_BITMAP_TEST(node, i)) {
13865                     ANYOF_BITMAP_SET(node, i);
13866                 }
13867             }
13868         }
13869         invlist_iterfinish(*invlist_ptr);
13870
13871         /* Done with loop; remove any code points that are in the bitmap from
13872          * *invlist_ptr; similarly for code points above the bitmap if we have
13873          * a flag to match all of them anyways */
13874         if (change_invlist) {
13875             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13876         }
13877         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13878             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13879         }
13880
13881         /* If have completely emptied it, remove it completely */
13882         if (_invlist_len(*invlist_ptr) == 0) {
13883             SvREFCNT_dec_NN(*invlist_ptr);
13884             *invlist_ptr = NULL;
13885         }
13886     }
13887 }
13888
13889 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13890    Character classes ([:foo:]) can also be negated ([:^foo:]).
13891    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13892    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13893    but trigger failures because they are currently unimplemented. */
13894
13895 #define POSIXCC_DONE(c)   ((c) == ':')
13896 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13897 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13898 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
13899
13900 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
13901 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
13902 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
13903
13904 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
13905
13906 /* 'posix_warnings' and 'warn_text' are names of variables in the following
13907  * routine. q.v. */
13908 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
13909         if (posix_warnings) {                                               \
13910             if (! RExC_warn_text ) RExC_warn_text = (AV *) sv_2mortal((SV *) newAV()); \
13911             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                          \
13912                                              WARNING_PREFIX                 \
13913                                              text                           \
13914                                              REPORT_LOCATION,               \
13915                                              REPORT_LOCATION_ARGS(p)));     \
13916         }                                                                   \
13917     } STMT_END
13918
13919 STATIC int
13920 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
13921
13922     const char * const s,      /* Where the putative posix class begins.
13923                                   Normally, this is one past the '['.  This
13924                                   parameter exists so it can be somewhere
13925                                   besides RExC_parse. */
13926     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
13927                                   NULL */
13928     AV ** posix_warnings,      /* Where to place any generated warnings, or
13929                                   NULL */
13930     const bool check_only      /* Don't die if error */
13931 )
13932 {
13933     /* This parses what the caller thinks may be one of the three POSIX
13934      * constructs:
13935      *  1) a character class, like [:blank:]
13936      *  2) a collating symbol, like [. .]
13937      *  3) an equivalence class, like [= =]
13938      * In the latter two cases, it croaks if it finds a syntactically legal
13939      * one, as these are not handled by Perl.
13940      *
13941      * The main purpose is to look for a POSIX character class.  It returns:
13942      *  a) the class number
13943      *      if it is a completely syntactically and semantically legal class.
13944      *      'updated_parse_ptr', if not NULL, is set to point to just after the
13945      *      closing ']' of the class
13946      *  b) OOB_NAMEDCLASS
13947      *      if it appears that one of the three POSIX constructs was meant, but
13948      *      its specification was somehow defective.  'updated_parse_ptr', if
13949      *      not NULL, is set to point to the character just after the end
13950      *      character of the class.  See below for handling of warnings.
13951      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
13952      *      if it  doesn't appear that a POSIX construct was intended.
13953      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
13954      *      raised.
13955      *
13956      * In b) there may be errors or warnings generated.  If 'check_only' is
13957      * TRUE, then any errors are discarded.  Warnings are returned to the
13958      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
13959      * instead it is NULL, warnings are suppressed.  This is done in all
13960      * passes.  The reason for this is that the rest of the parsing is heavily
13961      * dependent on whether this routine found a valid posix class or not.  If
13962      * it did, the closing ']' is absorbed as part of the class.  If no class,
13963      * or an invalid one is found, any ']' will be considered the terminator of
13964      * the outer bracketed character class, leading to very different results.
13965      * In particular, a '(?[ ])' construct will likely have a syntax error if
13966      * the class is parsed other than intended, and this will happen in pass1,
13967      * before the warnings would normally be output.  This mechanism allows the
13968      * caller to output those warnings in pass1 just before dieing, giving a
13969      * much better clue as to what is wrong.
13970      *
13971      * The reason for this function, and its complexity is that a bracketed
13972      * character class can contain just about anything.  But it's easy to
13973      * mistype the very specific posix class syntax but yielding a valid
13974      * regular bracketed class, so it silently gets compiled into something
13975      * quite unintended.
13976      *
13977      * The solution adopted here maintains backward compatibility except that
13978      * it adds a warning if it looks like a posix class was intended but
13979      * improperly specified.  The warning is not raised unless what is input
13980      * very closely resembles one of the 14 legal posix classes.  To do this,
13981      * it uses fuzzy parsing.  It calculates how many single-character edits it
13982      * would take to transform what was input into a legal posix class.  Only
13983      * if that number is quite small does it think that the intention was a
13984      * posix class.  Obviously these are heuristics, and there will be cases
13985      * where it errs on one side or another, and they can be tweaked as
13986      * experience informs.
13987      *
13988      * The syntax for a legal posix class is:
13989      *
13990      * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/
13991      *
13992      * What this routine considers syntactically to be an intended posix class
13993      * is this (the comments indicate some restrictions that the pattern
13994      * doesn't show):
13995      *
13996      *  qr/(?x: \[?                         # The left bracket, possibly
13997      *                                      # omitted
13998      *          \h*                         # possibly followed by blanks
13999      *          (?: \^ \h* )?               # possibly a misplaced caret
14000      *          [:;]?                       # The opening class character,
14001      *                                      # possibly omitted.  A typo
14002      *                                      # semi-colon can also be used.
14003      *          \h*
14004      *          \^?                         # possibly a correctly placed
14005      *                                      # caret, but not if there was also
14006      *                                      # a misplaced one
14007      *          \h*
14008      *          .{3,15}                     # The class name.  If there are
14009      *                                      # deviations from the legal syntax,
14010      *                                      # its edit distance must be close
14011      *                                      # to a real class name in order
14012      *                                      # for it to be considered to be
14013      *                                      # an intended posix class.
14014      *          \h*
14015      *          [:punct:]?                  # The closing class character,
14016      *                                      # possibly omitted.  If not a colon
14017      *                                      # nor semi colon, the class name
14018      *                                      # must be even closer to a valid
14019      *                                      # one
14020      *          \h*
14021      *          \]?                         # The right bracket, possibly
14022      *                                      # omitted.
14023      *     )/
14024      *
14025      * In the above, \h must be ASCII-only.
14026      *
14027      * These are heuristics, and can be tweaked as field experience dictates.
14028      * There will be cases when someone didn't intend to specify a posix class
14029      * that this warns as being so.  The goal is to minimize these, while
14030      * maximizing the catching of things intended to be a posix class that
14031      * aren't parsed as such.
14032      */
14033
14034     const char* p             = s;
14035     const char * const e      = RExC_end;
14036     unsigned complement       = 0;      /* If to complement the class */
14037     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
14038     bool has_opening_bracket  = FALSE;
14039     bool has_opening_colon    = FALSE;
14040     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
14041                                                    valid class */
14042     const char * possible_end = NULL;   /* used for a 2nd parse pass */
14043     const char* name_start;             /* ptr to class name first char */
14044
14045     /* If the number of single-character typos the input name is away from a
14046      * legal name is no more than this number, it is considered to have meant
14047      * the legal name */
14048     int max_distance          = 2;
14049
14050     /* to store the name.  The size determines the maximum length before we
14051      * decide that no posix class was intended.  Should be at least
14052      * sizeof("alphanumeric") */
14053     UV input_text[15];
14054
14055     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
14056
14057     if (posix_warnings && RExC_warn_text)
14058         av_clear(RExC_warn_text);
14059
14060     if (p >= e) {
14061         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14062     }
14063
14064     if (*(p - 1) != '[') {
14065         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
14066         found_problem = TRUE;
14067     }
14068     else {
14069         has_opening_bracket = TRUE;
14070     }
14071
14072     /* They could be confused and think you can put spaces between the
14073      * components */
14074     if (isBLANK(*p)) {
14075         found_problem = TRUE;
14076
14077         do {
14078             p++;
14079         } while (p < e && isBLANK(*p));
14080
14081         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14082     }
14083
14084     /* For [. .] and [= =].  These are quite different internally from [: :],
14085      * so they are handled separately.  */
14086     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
14087                                             and 1 for at least one char in it
14088                                           */
14089     {
14090         const char open_char  = *p;
14091         const char * temp_ptr = p + 1;
14092
14093         /* These two constructs are not handled by perl, and if we find a
14094          * syntactically valid one, we croak.  khw, who wrote this code, finds
14095          * this explanation of them very unclear:
14096          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
14097          * And searching the rest of the internet wasn't very helpful either.
14098          * It looks like just about any byte can be in these constructs,
14099          * depending on the locale.  But unless the pattern is being compiled
14100          * under /l, which is very rare, Perl runs under the C or POSIX locale.
14101          * In that case, it looks like [= =] isn't allowed at all, and that
14102          * [. .] could be any single code point, but for longer strings the
14103          * constituent characters would have to be the ASCII alphabetics plus
14104          * the minus-hyphen.  Any sensible locale definition would limit itself
14105          * to these.  And any portable one definitely should.  Trying to parse
14106          * the general case is a nightmare (see [perl #127604]).  So, this code
14107          * looks only for interiors of these constructs that match:
14108          *      qr/.|[-\w]{2,}/
14109          * Using \w relaxes the apparent rules a little, without adding much
14110          * danger of mistaking something else for one of these constructs.
14111          *
14112          * [. .] in some implementations described on the internet is usable to
14113          * escape a character that otherwise is special in bracketed character
14114          * classes.  For example [.].] means a literal right bracket instead of
14115          * the ending of the class
14116          *
14117          * [= =] can legitimately contain a [. .] construct, but we don't
14118          * handle this case, as that [. .] construct will later get parsed
14119          * itself and croak then.  And [= =] is checked for even when not under
14120          * /l, as Perl has long done so.
14121          *
14122          * The code below relies on there being a trailing NUL, so it doesn't
14123          * have to keep checking if the parse ptr < e.
14124          */
14125         if (temp_ptr[1] == open_char) {
14126             temp_ptr++;
14127         }
14128         else while (    temp_ptr < e
14129                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
14130         {
14131             temp_ptr++;
14132         }
14133
14134         if (*temp_ptr == open_char) {
14135             temp_ptr++;
14136             if (*temp_ptr == ']') {
14137                 temp_ptr++;
14138                 if (! found_problem && ! check_only) {
14139                     RExC_parse = (char *) temp_ptr;
14140                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
14141                             "extensions", open_char, open_char);
14142                 }
14143
14144                 /* Here, the syntax wasn't completely valid, or else the call
14145                  * is to check-only */
14146                 if (updated_parse_ptr) {
14147                     *updated_parse_ptr = (char *) temp_ptr;
14148                 }
14149
14150                 return OOB_NAMEDCLASS;
14151             }
14152         }
14153
14154         /* If we find something that started out to look like one of these
14155          * constructs, but isn't, we continue below so that it can be checked
14156          * for being a class name with a typo of '.' or '=' instead of a colon.
14157          * */
14158     }
14159
14160     /* Here, we think there is a possibility that a [: :] class was meant, and
14161      * we have the first real character.  It could be they think the '^' comes
14162      * first */
14163     if (*p == '^') {
14164         found_problem = TRUE;
14165         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
14166         complement = 1;
14167         p++;
14168
14169         if (isBLANK(*p)) {
14170             found_problem = TRUE;
14171
14172             do {
14173                 p++;
14174             } while (p < e && isBLANK(*p));
14175
14176             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14177         }
14178     }
14179
14180     /* But the first character should be a colon, which they could have easily
14181      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
14182      * distinguish from a colon, so treat that as a colon).  */
14183     if (*p == ':') {
14184         p++;
14185         has_opening_colon = TRUE;
14186     }
14187     else if (*p == ';') {
14188         found_problem = TRUE;
14189         p++;
14190         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14191         has_opening_colon = TRUE;
14192     }
14193     else {
14194         found_problem = TRUE;
14195         ADD_POSIX_WARNING(p, "there must be a starting ':'");
14196
14197         /* Consider an initial punctuation (not one of the recognized ones) to
14198          * be a left terminator */
14199         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
14200             p++;
14201         }
14202     }
14203
14204     /* They may think that you can put spaces between the components */
14205     if (isBLANK(*p)) {
14206         found_problem = TRUE;
14207
14208         do {
14209             p++;
14210         } while (p < e && isBLANK(*p));
14211
14212         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14213     }
14214
14215     if (*p == '^') {
14216
14217         /* We consider something like [^:^alnum:]] to not have been intended to
14218          * be a posix class, but XXX maybe we should */
14219         if (complement) {
14220             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14221         }
14222
14223         complement = 1;
14224         p++;
14225     }
14226
14227     /* Again, they may think that you can put spaces between the components */
14228     if (isBLANK(*p)) {
14229         found_problem = TRUE;
14230
14231         do {
14232             p++;
14233         } while (p < e && isBLANK(*p));
14234
14235         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14236     }
14237
14238     if (*p == ']') {
14239
14240         /* XXX This ']' may be a typo, and something else was meant.  But
14241          * treating it as such creates enough complications, that that
14242          * possibility isn't currently considered here.  So we assume that the
14243          * ']' is what is intended, and if we've already found an initial '[',
14244          * this leaves this construct looking like [:] or [:^], which almost
14245          * certainly weren't intended to be posix classes */
14246         if (has_opening_bracket) {
14247             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14248         }
14249
14250         /* But this function can be called when we parse the colon for
14251          * something like qr/[alpha:]]/, so we back up to look for the
14252          * beginning */
14253         p--;
14254
14255         if (*p == ';') {
14256             found_problem = TRUE;
14257             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14258         }
14259         else if (*p != ':') {
14260
14261             /* XXX We are currently very restrictive here, so this code doesn't
14262              * consider the possibility that, say, /[alpha.]]/ was intended to
14263              * be a posix class. */
14264             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14265         }
14266
14267         /* Here we have something like 'foo:]'.  There was no initial colon,
14268          * and we back up over 'foo.  XXX Unlike the going forward case, we
14269          * don't handle typos of non-word chars in the middle */
14270         has_opening_colon = FALSE;
14271         p--;
14272
14273         while (p > RExC_start && isWORDCHAR(*p)) {
14274             p--;
14275         }
14276         p++;
14277
14278         /* Here, we have positioned ourselves to where we think the first
14279          * character in the potential class is */
14280     }
14281
14282     /* Now the interior really starts.  There are certain key characters that
14283      * can end the interior, or these could just be typos.  To catch both
14284      * cases, we may have to do two passes.  In the first pass, we keep on
14285      * going unless we come to a sequence that matches
14286      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
14287      * This means it takes a sequence to end the pass, so two typos in a row if
14288      * that wasn't what was intended.  If the class is perfectly formed, just
14289      * this one pass is needed.  We also stop if there are too many characters
14290      * being accumulated, but this number is deliberately set higher than any
14291      * real class.  It is set high enough so that someone who thinks that
14292      * 'alphanumeric' is a correct name would get warned that it wasn't.
14293      * While doing the pass, we keep track of where the key characters were in
14294      * it.  If we don't find an end to the class, and one of the key characters
14295      * was found, we redo the pass, but stop when we get to that character.
14296      * Thus the key character was considered a typo in the first pass, but a
14297      * terminator in the second.  If two key characters are found, we stop at
14298      * the second one in the first pass.  Again this can miss two typos, but
14299      * catches a single one
14300      *
14301      * In the first pass, 'possible_end' starts as NULL, and then gets set to
14302      * point to the first key character.  For the second pass, it starts as -1.
14303      * */
14304
14305     name_start = p;
14306   parse_name:
14307     {
14308         bool has_blank               = FALSE;
14309         bool has_upper               = FALSE;
14310         bool has_terminating_colon   = FALSE;
14311         bool has_terminating_bracket = FALSE;
14312         bool has_semi_colon          = FALSE;
14313         unsigned int name_len        = 0;
14314         int punct_count              = 0;
14315
14316         while (p < e) {
14317
14318             /* Squeeze out blanks when looking up the class name below */
14319             if (isBLANK(*p) ) {
14320                 has_blank = TRUE;
14321                 found_problem = TRUE;
14322                 p++;
14323                 continue;
14324             }
14325
14326             /* The name will end with a punctuation */
14327             if (isPUNCT(*p)) {
14328                 const char * peek = p + 1;
14329
14330                 /* Treat any non-']' punctuation followed by a ']' (possibly
14331                  * with intervening blanks) as trying to terminate the class.
14332                  * ']]' is very likely to mean a class was intended (but
14333                  * missing the colon), but the warning message that gets
14334                  * generated shows the error position better if we exit the
14335                  * loop at the bottom (eventually), so skip it here. */
14336                 if (*p != ']') {
14337                     if (peek < e && isBLANK(*peek)) {
14338                         has_blank = TRUE;
14339                         found_problem = TRUE;
14340                         do {
14341                             peek++;
14342                         } while (peek < e && isBLANK(*peek));
14343                     }
14344
14345                     if (peek < e && *peek == ']') {
14346                         has_terminating_bracket = TRUE;
14347                         if (*p == ':') {
14348                             has_terminating_colon = TRUE;
14349                         }
14350                         else if (*p == ';') {
14351                             has_semi_colon = TRUE;
14352                             has_terminating_colon = TRUE;
14353                         }
14354                         else {
14355                             found_problem = TRUE;
14356                         }
14357                         p = peek + 1;
14358                         goto try_posix;
14359                     }
14360                 }
14361
14362                 /* Here we have punctuation we thought didn't end the class.
14363                  * Keep track of the position of the key characters that are
14364                  * more likely to have been class-enders */
14365                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
14366
14367                     /* Allow just one such possible class-ender not actually
14368                      * ending the class. */
14369                     if (possible_end) {
14370                         break;
14371                     }
14372                     possible_end = p;
14373                 }
14374
14375                 /* If we have too many punctuation characters, no use in
14376                  * keeping going */
14377                 if (++punct_count > max_distance) {
14378                     break;
14379                 }
14380
14381                 /* Treat the punctuation as a typo. */
14382                 input_text[name_len++] = *p;
14383                 p++;
14384             }
14385             else if (isUPPER(*p)) { /* Use lowercase for lookup */
14386                 input_text[name_len++] = toLOWER(*p);
14387                 has_upper = TRUE;
14388                 found_problem = TRUE;
14389                 p++;
14390             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
14391                 input_text[name_len++] = *p;
14392                 p++;
14393             }
14394             else {
14395                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
14396                 p+= UTF8SKIP(p);
14397             }
14398
14399             /* The declaration of 'input_text' is how long we allow a potential
14400              * class name to be, before saying they didn't mean a class name at
14401              * all */
14402             if (name_len >= C_ARRAY_LENGTH(input_text)) {
14403                 break;
14404             }
14405         }
14406
14407         /* We get to here when the possible class name hasn't been properly
14408          * terminated before:
14409          *   1) we ran off the end of the pattern; or
14410          *   2) found two characters, each of which might have been intended to
14411          *      be the name's terminator
14412          *   3) found so many punctuation characters in the purported name,
14413          *      that the edit distance to a valid one is exceeded
14414          *   4) we decided it was more characters than anyone could have
14415          *      intended to be one. */
14416
14417         found_problem = TRUE;
14418
14419         /* In the final two cases, we know that looking up what we've
14420          * accumulated won't lead to a match, even a fuzzy one. */
14421         if (   name_len >= C_ARRAY_LENGTH(input_text)
14422             || punct_count > max_distance)
14423         {
14424             /* If there was an intermediate key character that could have been
14425              * an intended end, redo the parse, but stop there */
14426             if (possible_end && possible_end != (char *) -1) {
14427                 possible_end = (char *) -1; /* Special signal value to say
14428                                                we've done a first pass */
14429                 p = name_start;
14430                 goto parse_name;
14431             }
14432
14433             /* Otherwise, it can't have meant to have been a class */
14434             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14435         }
14436
14437         /* If we ran off the end, and the final character was a punctuation
14438          * one, back up one, to look at that final one just below.  Later, we
14439          * will restore the parse pointer if appropriate */
14440         if (name_len && p == e && isPUNCT(*(p-1))) {
14441             p--;
14442             name_len--;
14443         }
14444
14445         if (p < e && isPUNCT(*p)) {
14446             if (*p == ']') {
14447                 has_terminating_bracket = TRUE;
14448
14449                 /* If this is a 2nd ']', and the first one is just below this
14450                  * one, consider that to be the real terminator.  This gives a
14451                  * uniform and better positioning for the warning message  */
14452                 if (   possible_end
14453                     && possible_end != (char *) -1
14454                     && *possible_end == ']'
14455                     && name_len && input_text[name_len - 1] == ']')
14456                 {
14457                     name_len--;
14458                     p = possible_end;
14459
14460                     /* And this is actually equivalent to having done the 2nd
14461                      * pass now, so set it to not try again */
14462                     possible_end = (char *) -1;
14463                 }
14464             }
14465             else {
14466                 if (*p == ':') {
14467                     has_terminating_colon = TRUE;
14468                 }
14469                 else if (*p == ';') {
14470                     has_semi_colon = TRUE;
14471                     has_terminating_colon = TRUE;
14472                 }
14473                 p++;
14474             }
14475         }
14476
14477     try_posix:
14478
14479         /* Here, we have a class name to look up.  We can short circuit the
14480          * stuff below for short names that can't possibly be meant to be a
14481          * class name.  (We can do this on the first pass, as any second pass
14482          * will yield an even shorter name) */
14483         if (name_len < 3) {
14484             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14485         }
14486
14487         /* Find which class it is.  Initially switch on the length of the name.
14488          * */
14489         switch (name_len) {
14490             case 4:
14491                 if (memEQ(name_start, "word", 4)) {
14492                     /* this is not POSIX, this is the Perl \w */
14493                     class_number = ANYOF_WORDCHAR;
14494                 }
14495                 break;
14496             case 5:
14497                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
14498                  *                        graph lower print punct space upper
14499                  * Offset 4 gives the best switch position.  */
14500                 switch (name_start[4]) {
14501                     case 'a':
14502                         if (memEQ(name_start, "alph", 4)) /* alpha */
14503                             class_number = ANYOF_ALPHA;
14504                         break;
14505                     case 'e':
14506                         if (memEQ(name_start, "spac", 4)) /* space */
14507                             class_number = ANYOF_SPACE;
14508                         break;
14509                     case 'h':
14510                         if (memEQ(name_start, "grap", 4)) /* graph */
14511                             class_number = ANYOF_GRAPH;
14512                         break;
14513                     case 'i':
14514                         if (memEQ(name_start, "asci", 4)) /* ascii */
14515                             class_number = ANYOF_ASCII;
14516                         break;
14517                     case 'k':
14518                         if (memEQ(name_start, "blan", 4)) /* blank */
14519                             class_number = ANYOF_BLANK;
14520                         break;
14521                     case 'l':
14522                         if (memEQ(name_start, "cntr", 4)) /* cntrl */
14523                             class_number = ANYOF_CNTRL;
14524                         break;
14525                     case 'm':
14526                         if (memEQ(name_start, "alnu", 4)) /* alnum */
14527                             class_number = ANYOF_ALPHANUMERIC;
14528                         break;
14529                     case 'r':
14530                         if (memEQ(name_start, "lowe", 4)) /* lower */
14531                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
14532                         else if (memEQ(name_start, "uppe", 4)) /* upper */
14533                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
14534                         break;
14535                     case 't':
14536                         if (memEQ(name_start, "digi", 4)) /* digit */
14537                             class_number = ANYOF_DIGIT;
14538                         else if (memEQ(name_start, "prin", 4)) /* print */
14539                             class_number = ANYOF_PRINT;
14540                         else if (memEQ(name_start, "punc", 4)) /* punct */
14541                             class_number = ANYOF_PUNCT;
14542                         break;
14543                 }
14544                 break;
14545             case 6:
14546                 if (memEQ(name_start, "xdigit", 6))
14547                     class_number = ANYOF_XDIGIT;
14548                 break;
14549         }
14550
14551         /* If the name exactly matches a posix class name the class number will
14552          * here be set to it, and the input almost certainly was meant to be a
14553          * posix class, so we can skip further checking.  If instead the syntax
14554          * is exactly correct, but the name isn't one of the legal ones, we
14555          * will return that as an error below.  But if neither of these apply,
14556          * it could be that no posix class was intended at all, or that one
14557          * was, but there was a typo.  We tease these apart by doing fuzzy
14558          * matching on the name */
14559         if (class_number == OOB_NAMEDCLASS && found_problem) {
14560             const UV posix_names[][6] = {
14561                                                 { 'a', 'l', 'n', 'u', 'm' },
14562                                                 { 'a', 'l', 'p', 'h', 'a' },
14563                                                 { 'a', 's', 'c', 'i', 'i' },
14564                                                 { 'b', 'l', 'a', 'n', 'k' },
14565                                                 { 'c', 'n', 't', 'r', 'l' },
14566                                                 { 'd', 'i', 'g', 'i', 't' },
14567                                                 { 'g', 'r', 'a', 'p', 'h' },
14568                                                 { 'l', 'o', 'w', 'e', 'r' },
14569                                                 { 'p', 'r', 'i', 'n', 't' },
14570                                                 { 'p', 'u', 'n', 'c', 't' },
14571                                                 { 's', 'p', 'a', 'c', 'e' },
14572                                                 { 'u', 'p', 'p', 'e', 'r' },
14573                                                 { 'w', 'o', 'r', 'd' },
14574                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
14575                                             };
14576             /* The names of the above all have added NULs to make them the same
14577              * size, so we need to also have the real lengths */
14578             const UV posix_name_lengths[] = {
14579                                                 sizeof("alnum") - 1,
14580                                                 sizeof("alpha") - 1,
14581                                                 sizeof("ascii") - 1,
14582                                                 sizeof("blank") - 1,
14583                                                 sizeof("cntrl") - 1,
14584                                                 sizeof("digit") - 1,
14585                                                 sizeof("graph") - 1,
14586                                                 sizeof("lower") - 1,
14587                                                 sizeof("print") - 1,
14588                                                 sizeof("punct") - 1,
14589                                                 sizeof("space") - 1,
14590                                                 sizeof("upper") - 1,
14591                                                 sizeof("word")  - 1,
14592                                                 sizeof("xdigit")- 1
14593                                             };
14594             unsigned int i;
14595             int temp_max = max_distance;    /* Use a temporary, so if we
14596                                                reparse, we haven't changed the
14597                                                outer one */
14598
14599             /* Use a smaller max edit distance if we are missing one of the
14600              * delimiters */
14601             if (   has_opening_bracket + has_opening_colon < 2
14602                 || has_terminating_bracket + has_terminating_colon < 2)
14603             {
14604                 temp_max--;
14605             }
14606
14607             /* See if the input name is close to a legal one */
14608             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
14609
14610                 /* Short circuit call if the lengths are too far apart to be
14611                  * able to match */
14612                 if (abs( (int) (name_len - posix_name_lengths[i]))
14613                     > temp_max)
14614                 {
14615                     continue;
14616                 }
14617
14618                 if (edit_distance(input_text,
14619                                   posix_names[i],
14620                                   name_len,
14621                                   posix_name_lengths[i],
14622                                   temp_max
14623                                  )
14624                     > -1)
14625                 { /* If it is close, it probably was intended to be a class */
14626                     goto probably_meant_to_be;
14627                 }
14628             }
14629
14630             /* Here the input name is not close enough to a valid class name
14631              * for us to consider it to be intended to be a posix class.  If
14632              * we haven't already done so, and the parse found a character that
14633              * could have been terminators for the name, but which we absorbed
14634              * as typos during the first pass, repeat the parse, signalling it
14635              * to stop at that character */
14636             if (possible_end && possible_end != (char *) -1) {
14637                 possible_end = (char *) -1;
14638                 p = name_start;
14639                 goto parse_name;
14640             }
14641
14642             /* Here neither pass found a close-enough class name */
14643             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14644         }
14645
14646     probably_meant_to_be:
14647
14648         /* Here we think that a posix specification was intended.  Update any
14649          * parse pointer */
14650         if (updated_parse_ptr) {
14651             *updated_parse_ptr = (char *) p;
14652         }
14653
14654         /* If a posix class name was intended but incorrectly specified, we
14655          * output or return the warnings */
14656         if (found_problem) {
14657
14658             /* We set flags for these issues in the parse loop above instead of
14659              * adding them to the list of warnings, because we can parse it
14660              * twice, and we only want one warning instance */
14661             if (has_upper) {
14662                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
14663             }
14664             if (has_blank) {
14665                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14666             }
14667             if (has_semi_colon) {
14668                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14669             }
14670             else if (! has_terminating_colon) {
14671                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
14672             }
14673             if (! has_terminating_bracket) {
14674                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
14675             }
14676
14677             if (posix_warnings && RExC_warn_text && av_top_index(RExC_warn_text) > -1) {
14678                 *posix_warnings = RExC_warn_text;
14679             }
14680         }
14681         else if (class_number != OOB_NAMEDCLASS) {
14682             /* If it is a known class, return the class.  The class number
14683              * #defines are structured so each complement is +1 to the normal
14684              * one */
14685             return class_number + complement;
14686         }
14687         else if (! check_only) {
14688
14689             /* Here, it is an unrecognized class.  This is an error (unless the
14690             * call is to check only, which we've already handled above) */
14691             const char * const complement_string = (complement)
14692                                                    ? "^"
14693                                                    : "";
14694             RExC_parse = (char *) p;
14695             vFAIL3utf8f("POSIX class [:%s%"UTF8f":] unknown",
14696                         complement_string,
14697                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
14698         }
14699     }
14700
14701     return OOB_NAMEDCLASS;
14702 }
14703 #undef ADD_POSIX_WARNING
14704
14705 STATIC unsigned  int
14706 S_regex_set_precedence(const U8 my_operator) {
14707
14708     /* Returns the precedence in the (?[...]) construct of the input operator,
14709      * specified by its character representation.  The precedence follows
14710      * general Perl rules, but it extends this so that ')' and ']' have (low)
14711      * precedence even though they aren't really operators */
14712
14713     switch (my_operator) {
14714         case '!':
14715             return 5;
14716         case '&':
14717             return 4;
14718         case '^':
14719         case '|':
14720         case '+':
14721         case '-':
14722             return 3;
14723         case ')':
14724             return 2;
14725         case ']':
14726             return 1;
14727     }
14728
14729     NOT_REACHED; /* NOTREACHED */
14730     return 0;   /* Silence compiler warning */
14731 }
14732
14733 STATIC regnode *
14734 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
14735                     I32 *flagp, U32 depth,
14736                     char * const oregcomp_parse)
14737 {
14738     /* Handle the (?[...]) construct to do set operations */
14739
14740     U8 curchar;                     /* Current character being parsed */
14741     UV start, end;                  /* End points of code point ranges */
14742     SV* final = NULL;               /* The end result inversion list */
14743     SV* result_string;              /* 'final' stringified */
14744     AV* stack;                      /* stack of operators and operands not yet
14745                                        resolved */
14746     AV* fence_stack = NULL;         /* A stack containing the positions in
14747                                        'stack' of where the undealt-with left
14748                                        parens would be if they were actually
14749                                        put there */
14750     /* The 'VOL' (expanding to 'volatile') is a workaround for an optimiser bug
14751      * in Solaris Studio 12.3. See RT #127455 */
14752     VOL IV fence = 0;               /* Position of where most recent undealt-
14753                                        with left paren in stack is; -1 if none.
14754                                      */
14755     STRLEN len;                     /* Temporary */
14756     regnode* node;                  /* Temporary, and final regnode returned by
14757                                        this function */
14758     const bool save_fold = FOLD;    /* Temporary */
14759     char *save_end, *save_parse;    /* Temporaries */
14760     const bool in_locale = LOC;     /* we turn off /l during processing */
14761     AV* posix_warnings = NULL;
14762
14763     GET_RE_DEBUG_FLAGS_DECL;
14764
14765     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
14766
14767     if (in_locale) {
14768         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
14769     }
14770
14771     REQUIRE_UNI_RULES(flagp, NULL);   /* The use of this operator implies /u.
14772                                          This is required so that the compile
14773                                          time values are valid in all runtime
14774                                          cases */
14775
14776     /* This will return only an ANYOF regnode, or (unlikely) something smaller
14777      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
14778      * call regclass to handle '[]' so as to not have to reinvent its parsing
14779      * rules here (throwing away the size it computes each time).  And, we exit
14780      * upon an unescaped ']' that isn't one ending a regclass.  To do both
14781      * these things, we need to realize that something preceded by a backslash
14782      * is escaped, so we have to keep track of backslashes */
14783     if (SIZE_ONLY) {
14784         UV depth = 0; /* how many nested (?[...]) constructs */
14785
14786         while (RExC_parse < RExC_end) {
14787             SV* current = NULL;
14788
14789             skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14790                                     TRUE /* Force /x */ );
14791
14792             switch (*RExC_parse) {
14793                 case '?':
14794                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
14795                     /* FALLTHROUGH */
14796                 default:
14797                     break;
14798                 case '\\':
14799                     /* Skip past this, so the next character gets skipped, after
14800                      * the switch */
14801                     RExC_parse++;
14802                     if (*RExC_parse == 'c') {
14803                             /* Skip the \cX notation for control characters */
14804                             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
14805                     }
14806                     break;
14807
14808                 case '[':
14809                 {
14810                     /* See if this is a [:posix:] class. */
14811                     bool is_posix_class = (OOB_NAMEDCLASS
14812                             < handle_possible_posix(pRExC_state,
14813                                                 RExC_parse + 1,
14814                                                 NULL,
14815                                                 NULL,
14816                                                 TRUE /* checking only */));
14817                     /* If it is a posix class, leave the parse pointer at the
14818                      * '[' to fool regclass() into thinking it is part of a
14819                      * '[[:posix:]]'. */
14820                     if (! is_posix_class) {
14821                         RExC_parse++;
14822                     }
14823
14824                     /* regclass() can only return RESTART_PASS1 and NEED_UTF8
14825                      * if multi-char folds are allowed.  */
14826                     if (!regclass(pRExC_state, flagp,depth+1,
14827                                   is_posix_class, /* parse the whole char
14828                                                      class only if not a
14829                                                      posix class */
14830                                   FALSE, /* don't allow multi-char folds */
14831                                   TRUE, /* silence non-portable warnings. */
14832                                   TRUE, /* strict */
14833                                   FALSE, /* Require return to be an ANYOF */
14834                                   &current,
14835                                   &posix_warnings
14836                                  ))
14837                         FAIL2("panic: regclass returned NULL to handle_sets, "
14838                               "flags=%#"UVxf"", (UV) *flagp);
14839
14840                     /* function call leaves parse pointing to the ']', except
14841                      * if we faked it */
14842                     if (is_posix_class) {
14843                         RExC_parse--;
14844                     }
14845
14846                     SvREFCNT_dec(current);   /* In case it returned something */
14847                     break;
14848                 }
14849
14850                 case ']':
14851                     if (depth--) break;
14852                     RExC_parse++;
14853                     if (*RExC_parse == ')') {
14854                         node = reganode(pRExC_state, ANYOF, 0);
14855                         RExC_size += ANYOF_SKIP;
14856                         nextchar(pRExC_state);
14857                         Set_Node_Length(node,
14858                                 RExC_parse - oregcomp_parse + 1); /* MJD */
14859                         if (in_locale) {
14860                             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
14861                         }
14862
14863                         return node;
14864                     }
14865                     goto no_close;
14866             }
14867
14868             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
14869         }
14870
14871       no_close:
14872         /* We output the messages even if warnings are off, because we'll fail
14873          * the very next thing, and these give a likely diagnosis for that */
14874         if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
14875             output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
14876         }
14877
14878         FAIL("Syntax error in (?[...])");
14879     }
14880
14881     /* Pass 2 only after this. */
14882     Perl_ck_warner_d(aTHX_
14883         packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
14884         "The regex_sets feature is experimental" REPORT_LOCATION,
14885         REPORT_LOCATION_ARGS(RExC_parse));
14886
14887     /* Everything in this construct is a metacharacter.  Operands begin with
14888      * either a '\' (for an escape sequence), or a '[' for a bracketed
14889      * character class.  Any other character should be an operator, or
14890      * parenthesis for grouping.  Both types of operands are handled by calling
14891      * regclass() to parse them.  It is called with a parameter to indicate to
14892      * return the computed inversion list.  The parsing here is implemented via
14893      * a stack.  Each entry on the stack is a single character representing one
14894      * of the operators; or else a pointer to an operand inversion list. */
14895
14896 #define IS_OPERATOR(a) SvIOK(a)
14897 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
14898
14899     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
14900      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
14901      * with pronouncing it called it Reverse Polish instead, but now that YOU
14902      * know how to pronounce it you can use the correct term, thus giving due
14903      * credit to the person who invented it, and impressing your geek friends.
14904      * Wikipedia says that the pronounciation of "Ł" has been changing so that
14905      * it is now more like an English initial W (as in wonk) than an L.)
14906      *
14907      * This means that, for example, 'a | b & c' is stored on the stack as
14908      *
14909      * c  [4]
14910      * b  [3]
14911      * &  [2]
14912      * a  [1]
14913      * |  [0]
14914      *
14915      * where the numbers in brackets give the stack [array] element number.
14916      * In this implementation, parentheses are not stored on the stack.
14917      * Instead a '(' creates a "fence" so that the part of the stack below the
14918      * fence is invisible except to the corresponding ')' (this allows us to
14919      * replace testing for parens, by using instead subtraction of the fence
14920      * position).  As new operands are processed they are pushed onto the stack
14921      * (except as noted in the next paragraph).  New operators of higher
14922      * precedence than the current final one are inserted on the stack before
14923      * the lhs operand (so that when the rhs is pushed next, everything will be
14924      * in the correct positions shown above.  When an operator of equal or
14925      * lower precedence is encountered in parsing, all the stacked operations
14926      * of equal or higher precedence are evaluated, leaving the result as the
14927      * top entry on the stack.  This makes higher precedence operations
14928      * evaluate before lower precedence ones, and causes operations of equal
14929      * precedence to left associate.
14930      *
14931      * The only unary operator '!' is immediately pushed onto the stack when
14932      * encountered.  When an operand is encountered, if the top of the stack is
14933      * a '!", the complement is immediately performed, and the '!' popped.  The
14934      * resulting value is treated as a new operand, and the logic in the
14935      * previous paragraph is executed.  Thus in the expression
14936      *      [a] + ! [b]
14937      * the stack looks like
14938      *
14939      * !
14940      * a
14941      * +
14942      *
14943      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
14944      * becomes
14945      *
14946      * !b
14947      * a
14948      * +
14949      *
14950      * A ')' is treated as an operator with lower precedence than all the
14951      * aforementioned ones, which causes all operations on the stack above the
14952      * corresponding '(' to be evaluated down to a single resultant operand.
14953      * Then the fence for the '(' is removed, and the operand goes through the
14954      * algorithm above, without the fence.
14955      *
14956      * A separate stack is kept of the fence positions, so that the position of
14957      * the latest so-far unbalanced '(' is at the top of it.
14958      *
14959      * The ']' ending the construct is treated as the lowest operator of all,
14960      * so that everything gets evaluated down to a single operand, which is the
14961      * result */
14962
14963     sv_2mortal((SV *)(stack = newAV()));
14964     sv_2mortal((SV *)(fence_stack = newAV()));
14965
14966     while (RExC_parse < RExC_end) {
14967         I32 top_index;              /* Index of top-most element in 'stack' */
14968         SV** top_ptr;               /* Pointer to top 'stack' element */
14969         SV* current = NULL;         /* To contain the current inversion list
14970                                        operand */
14971         SV* only_to_avoid_leaks;
14972
14973         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14974                                 TRUE /* Force /x */ );
14975         if (RExC_parse >= RExC_end) {
14976             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
14977         }
14978
14979         curchar = UCHARAT(RExC_parse);
14980
14981 redo_curchar:
14982
14983         top_index = av_tindex_nomg(stack);
14984
14985         switch (curchar) {
14986             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
14987             char stacked_operator;  /* The topmost operator on the 'stack'. */
14988             SV* lhs;                /* Operand to the left of the operator */
14989             SV* rhs;                /* Operand to the right of the operator */
14990             SV* fence_ptr;          /* Pointer to top element of the fence
14991                                        stack */
14992
14993             case '(':
14994
14995                 if (   RExC_parse < RExC_end - 1
14996                     && (UCHARAT(RExC_parse + 1) == '?'))
14997                 {
14998                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
14999                      * This happens when we have some thing like
15000                      *
15001                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
15002                      *   ...
15003                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
15004                      *
15005                      * Here we would be handling the interpolated
15006                      * '$thai_or_lao'.  We handle this by a recursive call to
15007                      * ourselves which returns the inversion list the
15008                      * interpolated expression evaluates to.  We use the flags
15009                      * from the interpolated pattern. */
15010                     U32 save_flags = RExC_flags;
15011                     const char * save_parse;
15012
15013                     RExC_parse += 2;        /* Skip past the '(?' */
15014                     save_parse = RExC_parse;
15015
15016                     /* Parse any flags for the '(?' */
15017                     parse_lparen_question_flags(pRExC_state);
15018
15019                     if (RExC_parse == save_parse  /* Makes sure there was at
15020                                                      least one flag (or else
15021                                                      this embedding wasn't
15022                                                      compiled) */
15023                         || RExC_parse >= RExC_end - 4
15024                         || UCHARAT(RExC_parse) != ':'
15025                         || UCHARAT(++RExC_parse) != '('
15026                         || UCHARAT(++RExC_parse) != '?'
15027                         || UCHARAT(++RExC_parse) != '[')
15028                     {
15029
15030                         /* In combination with the above, this moves the
15031                          * pointer to the point just after the first erroneous
15032                          * character (or if there are no flags, to where they
15033                          * should have been) */
15034                         if (RExC_parse >= RExC_end - 4) {
15035                             RExC_parse = RExC_end;
15036                         }
15037                         else if (RExC_parse != save_parse) {
15038                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15039                         }
15040                         vFAIL("Expecting '(?flags:(?[...'");
15041                     }
15042
15043                     /* Recurse, with the meat of the embedded expression */
15044                     RExC_parse++;
15045                     (void) handle_regex_sets(pRExC_state, &current, flagp,
15046                                                     depth+1, oregcomp_parse);
15047
15048                     /* Here, 'current' contains the embedded expression's
15049                      * inversion list, and RExC_parse points to the trailing
15050                      * ']'; the next character should be the ')' */
15051                     RExC_parse++;
15052                     assert(UCHARAT(RExC_parse) == ')');
15053
15054                     /* Then the ')' matching the original '(' handled by this
15055                      * case: statement */
15056                     RExC_parse++;
15057                     assert(UCHARAT(RExC_parse) == ')');
15058
15059                     RExC_parse++;
15060                     RExC_flags = save_flags;
15061                     goto handle_operand;
15062                 }
15063
15064                 /* A regular '('.  Look behind for illegal syntax */
15065                 if (top_index - fence >= 0) {
15066                     /* If the top entry on the stack is an operator, it had
15067                      * better be a '!', otherwise the entry below the top
15068                      * operand should be an operator */
15069                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
15070                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
15071                         || (   IS_OPERAND(*top_ptr)
15072                             && (   top_index - fence < 1
15073                                 || ! (stacked_ptr = av_fetch(stack,
15074                                                              top_index - 1,
15075                                                              FALSE))
15076                                 || ! IS_OPERATOR(*stacked_ptr))))
15077                     {
15078                         RExC_parse++;
15079                         vFAIL("Unexpected '(' with no preceding operator");
15080                     }
15081                 }
15082
15083                 /* Stack the position of this undealt-with left paren */
15084                 fence = top_index + 1;
15085                 av_push(fence_stack, newSViv(fence));
15086                 break;
15087
15088             case '\\':
15089                 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
15090                  * multi-char folds are allowed.  */
15091                 if (!regclass(pRExC_state, flagp,depth+1,
15092                               TRUE, /* means parse just the next thing */
15093                               FALSE, /* don't allow multi-char folds */
15094                               FALSE, /* don't silence non-portable warnings.  */
15095                               TRUE,  /* strict */
15096                               FALSE, /* Require return to be an ANYOF */
15097                               &current,
15098                               NULL))
15099                 {
15100                     FAIL2("panic: regclass returned NULL to handle_sets, "
15101                           "flags=%#"UVxf"", (UV) *flagp);
15102                 }
15103
15104                 /* regclass() will return with parsing just the \ sequence,
15105                  * leaving the parse pointer at the next thing to parse */
15106                 RExC_parse--;
15107                 goto handle_operand;
15108
15109             case '[':   /* Is a bracketed character class */
15110             {
15111                 /* See if this is a [:posix:] class. */
15112                 bool is_posix_class = (OOB_NAMEDCLASS
15113                             < handle_possible_posix(pRExC_state,
15114                                                 RExC_parse + 1,
15115                                                 NULL,
15116                                                 NULL,
15117                                                 TRUE /* checking only */));
15118                 /* If it is a posix class, leave the parse pointer at the '['
15119                  * to fool regclass() into thinking it is part of a
15120                  * '[[:posix:]]'. */
15121                 if (! is_posix_class) {
15122                     RExC_parse++;
15123                 }
15124
15125                 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
15126                  * multi-char folds are allowed.  */
15127                 if (!regclass(pRExC_state, flagp,depth+1,
15128                                 is_posix_class, /* parse the whole char
15129                                                     class only if not a
15130                                                     posix class */
15131                                 FALSE, /* don't allow multi-char folds */
15132                                 TRUE, /* silence non-portable warnings. */
15133                                 TRUE, /* strict */
15134                                 FALSE, /* Require return to be an ANYOF */
15135                                 &current,
15136                                 NULL
15137                                 ))
15138                 {
15139                     FAIL2("panic: regclass returned NULL to handle_sets, "
15140                           "flags=%#"UVxf"", (UV) *flagp);
15141                 }
15142
15143                 /* function call leaves parse pointing to the ']', except if we
15144                  * faked it */
15145                 if (is_posix_class) {
15146                     RExC_parse--;
15147                 }
15148
15149                 goto handle_operand;
15150             }
15151
15152             case ']':
15153                 if (top_index >= 1) {
15154                     goto join_operators;
15155                 }
15156
15157                 /* Only a single operand on the stack: are done */
15158                 goto done;
15159
15160             case ')':
15161                 if (av_tindex_nomg(fence_stack) < 0) {
15162                     RExC_parse++;
15163                     vFAIL("Unexpected ')'");
15164                 }
15165
15166                  /* If at least two thing on the stack, treat this as an
15167                   * operator */
15168                 if (top_index - fence >= 1) {
15169                     goto join_operators;
15170                 }
15171
15172                 /* Here only a single thing on the fenced stack, and there is a
15173                  * fence.  Get rid of it */
15174                 fence_ptr = av_pop(fence_stack);
15175                 assert(fence_ptr);
15176                 fence = SvIV(fence_ptr) - 1;
15177                 SvREFCNT_dec_NN(fence_ptr);
15178                 fence_ptr = NULL;
15179
15180                 if (fence < 0) {
15181                     fence = 0;
15182                 }
15183
15184                 /* Having gotten rid of the fence, we pop the operand at the
15185                  * stack top and process it as a newly encountered operand */
15186                 current = av_pop(stack);
15187                 if (IS_OPERAND(current)) {
15188                     goto handle_operand;
15189                 }
15190
15191                 RExC_parse++;
15192                 goto bad_syntax;
15193
15194             case '&':
15195             case '|':
15196             case '+':
15197             case '-':
15198             case '^':
15199
15200                 /* These binary operators should have a left operand already
15201                  * parsed */
15202                 if (   top_index - fence < 0
15203                     || top_index - fence == 1
15204                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
15205                     || ! IS_OPERAND(*top_ptr))
15206                 {
15207                     goto unexpected_binary;
15208                 }
15209
15210                 /* If only the one operand is on the part of the stack visible
15211                  * to us, we just place this operator in the proper position */
15212                 if (top_index - fence < 2) {
15213
15214                     /* Place the operator before the operand */
15215
15216                     SV* lhs = av_pop(stack);
15217                     av_push(stack, newSVuv(curchar));
15218                     av_push(stack, lhs);
15219                     break;
15220                 }
15221
15222                 /* But if there is something else on the stack, we need to
15223                  * process it before this new operator if and only if the
15224                  * stacked operation has equal or higher precedence than the
15225                  * new one */
15226
15227              join_operators:
15228
15229                 /* The operator on the stack is supposed to be below both its
15230                  * operands */
15231                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
15232                     || IS_OPERAND(*stacked_ptr))
15233                 {
15234                     /* But if not, it's legal and indicates we are completely
15235                      * done if and only if we're currently processing a ']',
15236                      * which should be the final thing in the expression */
15237                     if (curchar == ']') {
15238                         goto done;
15239                     }
15240
15241                   unexpected_binary:
15242                     RExC_parse++;
15243                     vFAIL2("Unexpected binary operator '%c' with no "
15244                            "preceding operand", curchar);
15245                 }
15246                 stacked_operator = (char) SvUV(*stacked_ptr);
15247
15248                 if (regex_set_precedence(curchar)
15249                     > regex_set_precedence(stacked_operator))
15250                 {
15251                     /* Here, the new operator has higher precedence than the
15252                      * stacked one.  This means we need to add the new one to
15253                      * the stack to await its rhs operand (and maybe more
15254                      * stuff).  We put it before the lhs operand, leaving
15255                      * untouched the stacked operator and everything below it
15256                      * */
15257                     lhs = av_pop(stack);
15258                     assert(IS_OPERAND(lhs));
15259
15260                     av_push(stack, newSVuv(curchar));
15261                     av_push(stack, lhs);
15262                     break;
15263                 }
15264
15265                 /* Here, the new operator has equal or lower precedence than
15266                  * what's already there.  This means the operation already
15267                  * there should be performed now, before the new one. */
15268
15269                 rhs = av_pop(stack);
15270                 if (! IS_OPERAND(rhs)) {
15271
15272                     /* This can happen when a ! is not followed by an operand,
15273                      * like in /(?[\t &!])/ */
15274                     goto bad_syntax;
15275                 }
15276
15277                 lhs = av_pop(stack);
15278
15279                 if (! IS_OPERAND(lhs)) {
15280
15281                     /* This can happen when there is an empty (), like in
15282                      * /(?[[0]+()+])/ */
15283                     goto bad_syntax;
15284                 }
15285
15286                 switch (stacked_operator) {
15287                     case '&':
15288                         _invlist_intersection(lhs, rhs, &rhs);
15289                         break;
15290
15291                     case '|':
15292                     case '+':
15293                         _invlist_union(lhs, rhs, &rhs);
15294                         break;
15295
15296                     case '-':
15297                         _invlist_subtract(lhs, rhs, &rhs);
15298                         break;
15299
15300                     case '^':   /* The union minus the intersection */
15301                     {
15302                         SV* i = NULL;
15303                         SV* u = NULL;
15304                         SV* element;
15305
15306                         _invlist_union(lhs, rhs, &u);
15307                         _invlist_intersection(lhs, rhs, &i);
15308                         /* _invlist_subtract will overwrite rhs
15309                             without freeing what it already contains */
15310                         element = rhs;
15311                         _invlist_subtract(u, i, &rhs);
15312                         SvREFCNT_dec_NN(i);
15313                         SvREFCNT_dec_NN(u);
15314                         SvREFCNT_dec_NN(element);
15315                         break;
15316                     }
15317                 }
15318                 SvREFCNT_dec(lhs);
15319
15320                 /* Here, the higher precedence operation has been done, and the
15321                  * result is in 'rhs'.  We overwrite the stacked operator with
15322                  * the result.  Then we redo this code to either push the new
15323                  * operator onto the stack or perform any higher precedence
15324                  * stacked operation */
15325                 only_to_avoid_leaks = av_pop(stack);
15326                 SvREFCNT_dec(only_to_avoid_leaks);
15327                 av_push(stack, rhs);
15328                 goto redo_curchar;
15329
15330             case '!':   /* Highest priority, right associative */
15331
15332                 /* If what's already at the top of the stack is another '!",
15333                  * they just cancel each other out */
15334                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
15335                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
15336                 {
15337                     only_to_avoid_leaks = av_pop(stack);
15338                     SvREFCNT_dec(only_to_avoid_leaks);
15339                 }
15340                 else { /* Otherwise, since it's right associative, just push
15341                           onto the stack */
15342                     av_push(stack, newSVuv(curchar));
15343                 }
15344                 break;
15345
15346             default:
15347                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15348                 vFAIL("Unexpected character");
15349
15350           handle_operand:
15351
15352             /* Here 'current' is the operand.  If something is already on the
15353              * stack, we have to check if it is a !.  But first, the code above
15354              * may have altered the stack in the time since we earlier set
15355              * 'top_index'.  */
15356
15357             top_index = av_tindex_nomg(stack);
15358             if (top_index - fence >= 0) {
15359                 /* If the top entry on the stack is an operator, it had better
15360                  * be a '!', otherwise the entry below the top operand should
15361                  * be an operator */
15362                 top_ptr = av_fetch(stack, top_index, FALSE);
15363                 assert(top_ptr);
15364                 if (IS_OPERATOR(*top_ptr)) {
15365
15366                     /* The only permissible operator at the top of the stack is
15367                      * '!', which is applied immediately to this operand. */
15368                     curchar = (char) SvUV(*top_ptr);
15369                     if (curchar != '!') {
15370                         SvREFCNT_dec(current);
15371                         vFAIL2("Unexpected binary operator '%c' with no "
15372                                 "preceding operand", curchar);
15373                     }
15374
15375                     _invlist_invert(current);
15376
15377                     only_to_avoid_leaks = av_pop(stack);
15378                     SvREFCNT_dec(only_to_avoid_leaks);
15379
15380                     /* And we redo with the inverted operand.  This allows
15381                      * handling multiple ! in a row */
15382                     goto handle_operand;
15383                 }
15384                           /* Single operand is ok only for the non-binary ')'
15385                            * operator */
15386                 else if ((top_index - fence == 0 && curchar != ')')
15387                          || (top_index - fence > 0
15388                              && (! (stacked_ptr = av_fetch(stack,
15389                                                            top_index - 1,
15390                                                            FALSE))
15391                                  || IS_OPERAND(*stacked_ptr))))
15392                 {
15393                     SvREFCNT_dec(current);
15394                     vFAIL("Operand with no preceding operator");
15395                 }
15396             }
15397
15398             /* Here there was nothing on the stack or the top element was
15399              * another operand.  Just add this new one */
15400             av_push(stack, current);
15401
15402         } /* End of switch on next parse token */
15403
15404         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15405     } /* End of loop parsing through the construct */
15406
15407   done:
15408     if (av_tindex_nomg(fence_stack) >= 0) {
15409         vFAIL("Unmatched (");
15410     }
15411
15412     if (av_tindex_nomg(stack) < 0   /* Was empty */
15413         || ((final = av_pop(stack)) == NULL)
15414         || ! IS_OPERAND(final)
15415         || SvTYPE(final) != SVt_INVLIST
15416         || av_tindex_nomg(stack) >= 0)  /* More left on stack */
15417     {
15418       bad_syntax:
15419         SvREFCNT_dec(final);
15420         vFAIL("Incomplete expression within '(?[ ])'");
15421     }
15422
15423     /* Here, 'final' is the resultant inversion list from evaluating the
15424      * expression.  Return it if so requested */
15425     if (return_invlist) {
15426         *return_invlist = final;
15427         return END;
15428     }
15429
15430     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
15431      * expecting a string of ranges and individual code points */
15432     invlist_iterinit(final);
15433     result_string = newSVpvs("");
15434     while (invlist_iternext(final, &start, &end)) {
15435         if (start == end) {
15436             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
15437         }
15438         else {
15439             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
15440                                                      start,          end);
15441         }
15442     }
15443
15444     /* About to generate an ANYOF (or similar) node from the inversion list we
15445      * have calculated */
15446     save_parse = RExC_parse;
15447     RExC_parse = SvPV(result_string, len);
15448     save_end = RExC_end;
15449     RExC_end = RExC_parse + len;
15450
15451     /* We turn off folding around the call, as the class we have constructed
15452      * already has all folding taken into consideration, and we don't want
15453      * regclass() to add to that */
15454     RExC_flags &= ~RXf_PMf_FOLD;
15455     /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
15456      * folds are allowed.  */
15457     node = regclass(pRExC_state, flagp,depth+1,
15458                     FALSE, /* means parse the whole char class */
15459                     FALSE, /* don't allow multi-char folds */
15460                     TRUE, /* silence non-portable warnings.  The above may very
15461                              well have generated non-portable code points, but
15462                              they're valid on this machine */
15463                     FALSE, /* similarly, no need for strict */
15464                     FALSE, /* Require return to be an ANYOF */
15465                     NULL,
15466                     NULL
15467                 );
15468     if (!node)
15469         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
15470                     PTR2UV(flagp));
15471
15472     /* Fix up the node type if we are in locale.  (We have pretended we are
15473      * under /u for the purposes of regclass(), as this construct will only
15474      * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
15475      * as to cause any warnings about bad locales to be output in regexec.c),
15476      * and add the flag that indicates to check if not in a UTF-8 locale.  The
15477      * reason we above forbid optimization into something other than an ANYOF
15478      * node is simply to minimize the number of code changes in regexec.c.
15479      * Otherwise we would have to create new EXACTish node types and deal with
15480      * them.  This decision could be revisited should this construct become
15481      * popular.
15482      *
15483      * (One might think we could look at the resulting ANYOF node and suppress
15484      * the flag if everything is above 255, as those would be UTF-8 only,
15485      * but this isn't true, as the components that led to that result could
15486      * have been locale-affected, and just happen to cancel each other out
15487      * under UTF-8 locales.) */
15488     if (in_locale) {
15489         set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
15490
15491         assert(OP(node) == ANYOF);
15492
15493         OP(node) = ANYOFL;
15494         ANYOF_FLAGS(node)
15495                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
15496     }
15497
15498     if (save_fold) {
15499         RExC_flags |= RXf_PMf_FOLD;
15500     }
15501
15502     RExC_parse = save_parse + 1;
15503     RExC_end = save_end;
15504     SvREFCNT_dec_NN(final);
15505     SvREFCNT_dec_NN(result_string);
15506
15507     nextchar(pRExC_state);
15508     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
15509     return node;
15510 }
15511 #undef IS_OPERATOR
15512 #undef IS_OPERAND
15513
15514 STATIC void
15515 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
15516 {
15517     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
15518      * innocent-looking character class, like /[ks]/i won't have to go out to
15519      * disk to find the possible matches.
15520      *
15521      * This should be called only for a Latin1-range code points, cp, which is
15522      * known to be involved in a simple fold with other code points above
15523      * Latin1.  It would give false results if /aa has been specified.
15524      * Multi-char folds are outside the scope of this, and must be handled
15525      * specially.
15526      *
15527      * XXX It would be better to generate these via regen, in case a new
15528      * version of the Unicode standard adds new mappings, though that is not
15529      * really likely, and may be caught by the default: case of the switch
15530      * below. */
15531
15532     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
15533
15534     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
15535
15536     switch (cp) {
15537         case 'k':
15538         case 'K':
15539           *invlist =
15540              add_cp_to_invlist(*invlist, KELVIN_SIGN);
15541             break;
15542         case 's':
15543         case 'S':
15544           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
15545             break;
15546         case MICRO_SIGN:
15547           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
15548           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
15549             break;
15550         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
15551         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
15552           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
15553             break;
15554         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
15555           *invlist = add_cp_to_invlist(*invlist,
15556                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
15557             break;
15558
15559 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
15560
15561         case LATIN_SMALL_LETTER_SHARP_S:
15562           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
15563             break;
15564
15565 #endif
15566
15567 #if    UNICODE_MAJOR_VERSION < 3                                        \
15568    || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
15569
15570         /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
15571          * U+0131.  */
15572         case 'i':
15573         case 'I':
15574           *invlist =
15575              add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
15576 #   if UNICODE_DOT_DOT_VERSION == 1
15577           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
15578 #   endif
15579             break;
15580 #endif
15581
15582         default:
15583             /* Use deprecated warning to increase the chances of this being
15584              * output */
15585             if (PASS2) {
15586                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
15587             }
15588             break;
15589     }
15590 }
15591
15592 STATIC void
15593 S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
15594 {
15595     /* If the final parameter is NULL, output the elements of the array given
15596      * by '*posix_warnings' as REGEXP warnings.  Otherwise, the elements are
15597      * pushed onto it, (creating if necessary) */
15598
15599     SV * msg;
15600     const bool first_is_fatal =  ! return_posix_warnings
15601                                 && ckDEAD(packWARN(WARN_REGEXP));
15602
15603     PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
15604
15605     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
15606         if (return_posix_warnings) {
15607             if (! *return_posix_warnings) { /* mortalize to not leak if
15608                                                warnings are fatal */
15609                 *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
15610             }
15611             av_push(*return_posix_warnings, msg);
15612         }
15613         else {
15614             if (first_is_fatal) {           /* Avoid leaking this */
15615                 av_undef(posix_warnings);   /* This isn't necessary if the
15616                                                array is mortal, but is a
15617                                                fail-safe */
15618                 (void) sv_2mortal(msg);
15619                 if (PASS2) {
15620                     SAVEFREESV(RExC_rx_sv);
15621                 }
15622             }
15623             Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
15624             SvREFCNT_dec_NN(msg);
15625         }
15626     }
15627 }
15628
15629 STATIC AV *
15630 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
15631 {
15632     /* This adds the string scalar <multi_string> to the array
15633      * <multi_char_matches>.  <multi_string> is known to have exactly
15634      * <cp_count> code points in it.  This is used when constructing a
15635      * bracketed character class and we find something that needs to match more
15636      * than a single character.
15637      *
15638      * <multi_char_matches> is actually an array of arrays.  Each top-level
15639      * element is an array that contains all the strings known so far that are
15640      * the same length.  And that length (in number of code points) is the same
15641      * as the index of the top-level array.  Hence, the [2] element is an
15642      * array, each element thereof is a string containing TWO code points;
15643      * while element [3] is for strings of THREE characters, and so on.  Since
15644      * this is for multi-char strings there can never be a [0] nor [1] element.
15645      *
15646      * When we rewrite the character class below, we will do so such that the
15647      * longest strings are written first, so that it prefers the longest
15648      * matching strings first.  This is done even if it turns out that any
15649      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
15650      * Christiansen has agreed that this is ok.  This makes the test for the
15651      * ligature 'ffi' come before the test for 'ff', for example */
15652
15653     AV* this_array;
15654     AV** this_array_ptr;
15655
15656     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
15657
15658     if (! multi_char_matches) {
15659         multi_char_matches = newAV();
15660     }
15661
15662     if (av_exists(multi_char_matches, cp_count)) {
15663         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
15664         this_array = *this_array_ptr;
15665     }
15666     else {
15667         this_array = newAV();
15668         av_store(multi_char_matches, cp_count,
15669                  (SV*) this_array);
15670     }
15671     av_push(this_array, multi_string);
15672
15673     return multi_char_matches;
15674 }
15675
15676 /* The names of properties whose definitions are not known at compile time are
15677  * stored in this SV, after a constant heading.  So if the length has been
15678  * changed since initialization, then there is a run-time definition. */
15679 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
15680                                         (SvCUR(listsv) != initial_listsv_len)
15681
15682 /* There is a restricted set of white space characters that are legal when
15683  * ignoring white space in a bracketed character class.  This generates the
15684  * code to skip them.
15685  *
15686  * There is a line below that uses the same white space criteria but is outside
15687  * this macro.  Both here and there must use the same definition */
15688 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
15689     STMT_START {                                                        \
15690         if (do_skip) {                                                  \
15691             while (isBLANK_A(UCHARAT(p)))                               \
15692             {                                                           \
15693                 p++;                                                    \
15694             }                                                           \
15695         }                                                               \
15696     } STMT_END
15697
15698 STATIC regnode *
15699 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
15700                  const bool stop_at_1,  /* Just parse the next thing, don't
15701                                            look for a full character class */
15702                  bool allow_multi_folds,
15703                  const bool silence_non_portable,   /* Don't output warnings
15704                                                        about too large
15705                                                        characters */
15706                  const bool strict,
15707                  bool optimizable,                  /* ? Allow a non-ANYOF return
15708                                                        node */
15709                  SV** ret_invlist, /* Return an inversion list, not a node */
15710                  AV** return_posix_warnings
15711           )
15712 {
15713     /* parse a bracketed class specification.  Most of these will produce an
15714      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
15715      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
15716      * under /i with multi-character folds: it will be rewritten following the
15717      * paradigm of this example, where the <multi-fold>s are characters which
15718      * fold to multiple character sequences:
15719      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
15720      * gets effectively rewritten as:
15721      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
15722      * reg() gets called (recursively) on the rewritten version, and this
15723      * function will return what it constructs.  (Actually the <multi-fold>s
15724      * aren't physically removed from the [abcdefghi], it's just that they are
15725      * ignored in the recursion by means of a flag:
15726      * <RExC_in_multi_char_class>.)
15727      *
15728      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
15729      * characters, with the corresponding bit set if that character is in the
15730      * list.  For characters above this, a range list or swash is used.  There
15731      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
15732      * determinable at compile time
15733      *
15734      * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
15735      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
15736      * to UTF-8.  This can only happen if ret_invlist is non-NULL.
15737      */
15738
15739     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
15740     IV range = 0;
15741     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
15742     regnode *ret;
15743     STRLEN numlen;
15744     int namedclass = OOB_NAMEDCLASS;
15745     char *rangebegin = NULL;
15746     bool need_class = 0;
15747     SV *listsv = NULL;
15748     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
15749                                       than just initialized.  */
15750     SV* properties = NULL;    /* Code points that match \p{} \P{} */
15751     SV* posixes = NULL;     /* Code points that match classes like [:word:],
15752                                extended beyond the Latin1 range.  These have to
15753                                be kept separate from other code points for much
15754                                of this function because their handling  is
15755                                different under /i, and for most classes under
15756                                /d as well */
15757     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
15758                                separate for a while from the non-complemented
15759                                versions because of complications with /d
15760                                matching */
15761     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
15762                                   treated more simply than the general case,
15763                                   leading to less compilation and execution
15764                                   work */
15765     UV element_count = 0;   /* Number of distinct elements in the class.
15766                                Optimizations may be possible if this is tiny */
15767     AV * multi_char_matches = NULL; /* Code points that fold to more than one
15768                                        character; used under /i */
15769     UV n;
15770     char * stop_ptr = RExC_end;    /* where to stop parsing */
15771     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
15772                                                    space? */
15773
15774     /* Unicode properties are stored in a swash; this holds the current one
15775      * being parsed.  If this swash is the only above-latin1 component of the
15776      * character class, an optimization is to pass it directly on to the
15777      * execution engine.  Otherwise, it is set to NULL to indicate that there
15778      * are other things in the class that have to be dealt with at execution
15779      * time */
15780     SV* swash = NULL;           /* Code points that match \p{} \P{} */
15781
15782     /* Set if a component of this character class is user-defined; just passed
15783      * on to the engine */
15784     bool has_user_defined_property = FALSE;
15785
15786     /* inversion list of code points this node matches only when the target
15787      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
15788      * /d) */
15789     SV* has_upper_latin1_only_utf8_matches = NULL;
15790
15791     /* Inversion list of code points this node matches regardless of things
15792      * like locale, folding, utf8ness of the target string */
15793     SV* cp_list = NULL;
15794
15795     /* Like cp_list, but code points on this list need to be checked for things
15796      * that fold to/from them under /i */
15797     SV* cp_foldable_list = NULL;
15798
15799     /* Like cp_list, but code points on this list are valid only when the
15800      * runtime locale is UTF-8 */
15801     SV* only_utf8_locale_list = NULL;
15802
15803     /* In a range, if one of the endpoints is non-character-set portable,
15804      * meaning that it hard-codes a code point that may mean a different
15805      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
15806      * mnemonic '\t' which each mean the same character no matter which
15807      * character set the platform is on. */
15808     unsigned int non_portable_endpoint = 0;
15809
15810     /* Is the range unicode? which means on a platform that isn't 1-1 native
15811      * to Unicode (i.e. non-ASCII), each code point in it should be considered
15812      * to be a Unicode value.  */
15813     bool unicode_range = FALSE;
15814     bool invert = FALSE;    /* Is this class to be complemented */
15815
15816     bool warn_super = ALWAYS_WARN_SUPER;
15817
15818     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
15819         case we need to change the emitted regop to an EXACT. */
15820     const char * orig_parse = RExC_parse;
15821     const SSize_t orig_size = RExC_size;
15822     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
15823
15824     /* This variable is used to mark where the end in the input is of something
15825      * that looks like a POSIX construct but isn't.  During the parse, when
15826      * something looks like it could be such a construct is encountered, it is
15827      * checked for being one, but not if we've already checked this area of the
15828      * input.  Only after this position is reached do we check again */
15829     char *not_posix_region_end = RExC_parse - 1;
15830
15831     AV* posix_warnings = NULL;
15832     const bool do_posix_warnings =     return_posix_warnings
15833                                    || (PASS2 && ckWARN(WARN_REGEXP));
15834
15835     GET_RE_DEBUG_FLAGS_DECL;
15836
15837     PERL_ARGS_ASSERT_REGCLASS;
15838 #ifndef DEBUGGING
15839     PERL_UNUSED_ARG(depth);
15840 #endif
15841
15842     DEBUG_PARSE("clas");
15843
15844 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
15845     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
15846                                    && UNICODE_DOT_DOT_VERSION == 0)
15847     allow_multi_folds = FALSE;
15848 #endif
15849
15850     /* Assume we are going to generate an ANYOF node. */
15851     ret = reganode(pRExC_state,
15852                    (LOC)
15853                     ? ANYOFL
15854                     : ANYOF,
15855                    0);
15856
15857     if (SIZE_ONLY) {
15858         RExC_size += ANYOF_SKIP;
15859         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
15860     }
15861     else {
15862         ANYOF_FLAGS(ret) = 0;
15863
15864         RExC_emit += ANYOF_SKIP;
15865         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
15866         initial_listsv_len = SvCUR(listsv);
15867         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
15868     }
15869
15870     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15871
15872     assert(RExC_parse <= RExC_end);
15873
15874     if (UCHARAT(RExC_parse) == '^') {   /* Complement the class */
15875         RExC_parse++;
15876         invert = TRUE;
15877         allow_multi_folds = FALSE;
15878         MARK_NAUGHTY(1);
15879         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15880     }
15881
15882     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
15883     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
15884         int maybe_class = handle_possible_posix(pRExC_state,
15885                                                 RExC_parse,
15886                                                 &not_posix_region_end,
15887                                                 NULL,
15888                                                 TRUE /* checking only */);
15889         if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
15890             SAVEFREESV(RExC_rx_sv);
15891             ckWARN4reg(not_posix_region_end,
15892                     "POSIX syntax [%c %c] belongs inside character classes%s",
15893                     *RExC_parse, *RExC_parse,
15894                     (maybe_class == OOB_NAMEDCLASS)
15895                     ? ((POSIXCC_NOTYET(*RExC_parse))
15896                         ? " (but this one isn't implemented)"
15897                         : " (but this one isn't fully valid)")
15898                     : ""
15899                     );
15900             (void)ReREFCNT_inc(RExC_rx_sv);
15901         }
15902     }
15903
15904     /* If the caller wants us to just parse a single element, accomplish this
15905      * by faking the loop ending condition */
15906     if (stop_at_1 && RExC_end > RExC_parse) {
15907         stop_ptr = RExC_parse + 1;
15908     }
15909
15910     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
15911     if (UCHARAT(RExC_parse) == ']')
15912         goto charclassloop;
15913
15914     while (1) {
15915
15916         if (   posix_warnings
15917             && av_tindex_nomg(posix_warnings) >= 0
15918             && RExC_parse > not_posix_region_end)
15919         {
15920             /* Warnings about posix class issues are considered tentative until
15921              * we are far enough along in the parse that we can no longer
15922              * change our mind, at which point we either output them or add
15923              * them, if it has so specified, to what gets returned to the
15924              * caller.  This is done each time through the loop so that a later
15925              * class won't zap them before they have been dealt with. */
15926             output_or_return_posix_warnings(pRExC_state, posix_warnings,
15927                                             return_posix_warnings);
15928         }
15929
15930         if  (RExC_parse >= stop_ptr) {
15931             break;
15932         }
15933
15934         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15935
15936         if  (UCHARAT(RExC_parse) == ']') {
15937             break;
15938         }
15939
15940       charclassloop:
15941
15942         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
15943         save_value = value;
15944         save_prevvalue = prevvalue;
15945
15946         if (!range) {
15947             rangebegin = RExC_parse;
15948             element_count++;
15949             non_portable_endpoint = 0;
15950         }
15951         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
15952             value = utf8n_to_uvchr((U8*)RExC_parse,
15953                                    RExC_end - RExC_parse,
15954                                    &numlen, UTF8_ALLOW_DEFAULT);
15955             RExC_parse += numlen;
15956         }
15957         else
15958             value = UCHARAT(RExC_parse++);
15959
15960         if (value == '[') {
15961             char * posix_class_end;
15962             namedclass = handle_possible_posix(pRExC_state,
15963                                                RExC_parse,
15964                                                &posix_class_end,
15965                                                do_posix_warnings ? &posix_warnings : NULL,
15966                                                FALSE    /* die if error */);
15967             if (namedclass > OOB_NAMEDCLASS) {
15968
15969                 /* If there was an earlier attempt to parse this particular
15970                  * posix class, and it failed, it was a false alarm, as this
15971                  * successful one proves */
15972                 if (   posix_warnings
15973                     && av_tindex_nomg(posix_warnings) >= 0
15974                     && not_posix_region_end >= RExC_parse
15975                     && not_posix_region_end <= posix_class_end)
15976                 {
15977                     av_undef(posix_warnings);
15978                 }
15979
15980                 RExC_parse = posix_class_end;
15981             }
15982             else if (namedclass == OOB_NAMEDCLASS) {
15983                 not_posix_region_end = posix_class_end;
15984             }
15985             else {
15986                 namedclass = OOB_NAMEDCLASS;
15987             }
15988         }
15989         else if (   RExC_parse - 1 > not_posix_region_end
15990                  && MAYBE_POSIXCC(value))
15991         {
15992             (void) handle_possible_posix(
15993                         pRExC_state,
15994                         RExC_parse - 1,  /* -1 because parse has already been
15995                                             advanced */
15996                         &not_posix_region_end,
15997                         do_posix_warnings ? &posix_warnings : NULL,
15998                         TRUE /* checking only */);
15999         }
16000         else if (value == '\\') {
16001             /* Is a backslash; get the code point of the char after it */
16002
16003             if (RExC_parse >= RExC_end) {
16004                 vFAIL("Unmatched [");
16005             }
16006
16007             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
16008                 value = utf8n_to_uvchr((U8*)RExC_parse,
16009                                    RExC_end - RExC_parse,
16010                                    &numlen, UTF8_ALLOW_DEFAULT);
16011                 RExC_parse += numlen;
16012             }
16013             else
16014                 value = UCHARAT(RExC_parse++);
16015
16016             /* Some compilers cannot handle switching on 64-bit integer
16017              * values, therefore value cannot be an UV.  Yes, this will
16018              * be a problem later if we want switch on Unicode.
16019              * A similar issue a little bit later when switching on
16020              * namedclass. --jhi */
16021
16022             /* If the \ is escaping white space when white space is being
16023              * skipped, it means that that white space is wanted literally, and
16024              * is already in 'value'.  Otherwise, need to translate the escape
16025              * into what it signifies. */
16026             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
16027
16028             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
16029             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
16030             case 's':   namedclass = ANYOF_SPACE;       break;
16031             case 'S':   namedclass = ANYOF_NSPACE;      break;
16032             case 'd':   namedclass = ANYOF_DIGIT;       break;
16033             case 'D':   namedclass = ANYOF_NDIGIT;      break;
16034             case 'v':   namedclass = ANYOF_VERTWS;      break;
16035             case 'V':   namedclass = ANYOF_NVERTWS;     break;
16036             case 'h':   namedclass = ANYOF_HORIZWS;     break;
16037             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
16038             case 'N':  /* Handle \N{NAME} in class */
16039                 {
16040                     const char * const backslash_N_beg = RExC_parse - 2;
16041                     int cp_count;
16042
16043                     if (! grok_bslash_N(pRExC_state,
16044                                         NULL,      /* No regnode */
16045                                         &value,    /* Yes single value */
16046                                         &cp_count, /* Multiple code pt count */
16047                                         flagp,
16048                                         strict,
16049                                         depth)
16050                     ) {
16051
16052                         if (*flagp & NEED_UTF8)
16053                             FAIL("panic: grok_bslash_N set NEED_UTF8");
16054                         if (*flagp & RESTART_PASS1)
16055                             return NULL;
16056
16057                         if (cp_count < 0) {
16058                             vFAIL("\\N in a character class must be a named character: \\N{...}");
16059                         }
16060                         else if (cp_count == 0) {
16061                             if (PASS2) {
16062                                 ckWARNreg(RExC_parse,
16063                                         "Ignoring zero length \\N{} in character class");
16064                             }
16065                         }
16066                         else { /* cp_count > 1 */
16067                             if (! RExC_in_multi_char_class) {
16068                                 if (invert || range || *RExC_parse == '-') {
16069                                     if (strict) {
16070                                         RExC_parse--;
16071                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
16072                                     }
16073                                     else if (PASS2) {
16074                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
16075                                     }
16076                                     break; /* <value> contains the first code
16077                                               point. Drop out of the switch to
16078                                               process it */
16079                                 }
16080                                 else {
16081                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
16082                                                  RExC_parse - backslash_N_beg);
16083                                     multi_char_matches
16084                                         = add_multi_match(multi_char_matches,
16085                                                           multi_char_N,
16086                                                           cp_count);
16087                                 }
16088                             }
16089                         } /* End of cp_count != 1 */
16090
16091                         /* This element should not be processed further in this
16092                          * class */
16093                         element_count--;
16094                         value = save_value;
16095                         prevvalue = save_prevvalue;
16096                         continue;   /* Back to top of loop to get next char */
16097                     }
16098
16099                     /* Here, is a single code point, and <value> contains it */
16100                     unicode_range = TRUE;   /* \N{} are Unicode */
16101                 }
16102                 break;
16103             case 'p':
16104             case 'P':
16105                 {
16106                 char *e;
16107
16108                 /* We will handle any undefined properties ourselves */
16109                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
16110                                        /* And we actually would prefer to get
16111                                         * the straight inversion list of the
16112                                         * swash, since we will be accessing it
16113                                         * anyway, to save a little time */
16114                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
16115
16116                 if (RExC_parse >= RExC_end)
16117                     vFAIL2("Empty \\%c", (U8)value);
16118                 if (*RExC_parse == '{') {
16119                     const U8 c = (U8)value;
16120                     e = strchr(RExC_parse, '}');
16121                     if (!e) {
16122                         RExC_parse++;
16123                         vFAIL2("Missing right brace on \\%c{}", c);
16124                     }
16125
16126                     RExC_parse++;
16127                     while (isSPACE(*RExC_parse)) {
16128                          RExC_parse++;
16129                     }
16130
16131                     if (UCHARAT(RExC_parse) == '^') {
16132
16133                         /* toggle.  (The rhs xor gets the single bit that
16134                          * differs between P and p; the other xor inverts just
16135                          * that bit) */
16136                         value ^= 'P' ^ 'p';
16137
16138                         RExC_parse++;
16139                         while (isSPACE(*RExC_parse)) {
16140                             RExC_parse++;
16141                         }
16142                     }
16143
16144                     if (e == RExC_parse)
16145                         vFAIL2("Empty \\%c{}", c);
16146
16147                     n = e - RExC_parse;
16148                     while (isSPACE(*(RExC_parse + n - 1)))
16149                         n--;
16150                 }   /* The \p isn't immediately followed by a '{' */
16151                 else if (! isALPHA(*RExC_parse)) {
16152                     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16153                     vFAIL2("Character following \\%c must be '{' or a "
16154                            "single-character Unicode property name",
16155                            (U8) value);
16156                 }
16157                 else {
16158                     e = RExC_parse;
16159                     n = 1;
16160                 }
16161                 if (!SIZE_ONLY) {
16162                     SV* invlist;
16163                     char* name;
16164                     char* base_name;    /* name after any packages are stripped */
16165                     char* lookup_name = NULL;
16166                     const char * const colon_colon = "::";
16167
16168                     /* Try to get the definition of the property into
16169                      * <invlist>.  If /i is in effect, the effective property
16170                      * will have its name be <__NAME_i>.  The design is
16171                      * discussed in commit
16172                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
16173                     name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
16174                     SAVEFREEPV(name);
16175                     if (FOLD) {
16176                         lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
16177
16178                         /* The function call just below that uses this can fail
16179                          * to return, leaking memory if we don't do this */
16180                         SAVEFREEPV(lookup_name);
16181                     }
16182
16183                     /* Look up the property name, and get its swash and
16184                      * inversion list, if the property is found  */
16185                     SvREFCNT_dec(swash); /* Free any left-overs */
16186                     swash = _core_swash_init("utf8",
16187                                              (lookup_name)
16188                                               ? lookup_name
16189                                               : name,
16190                                              &PL_sv_undef,
16191                                              1, /* binary */
16192                                              0, /* not tr/// */
16193                                              NULL, /* No inversion list */
16194                                              &swash_init_flags
16195                                             );
16196                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
16197                         HV* curpkg = (IN_PERL_COMPILETIME)
16198                                       ? PL_curstash
16199                                       : CopSTASH(PL_curcop);
16200                         UV final_n = n;
16201                         bool has_pkg;
16202
16203                         if (swash) {    /* Got a swash but no inversion list.
16204                                            Something is likely wrong that will
16205                                            be sorted-out later */
16206                             SvREFCNT_dec_NN(swash);
16207                             swash = NULL;
16208                         }
16209
16210                         /* Here didn't find it.  It could be a an error (like a
16211                          * typo) in specifying a Unicode property, or it could
16212                          * be a user-defined property that will be available at
16213                          * run-time.  The names of these must begin with 'In'
16214                          * or 'Is' (after any packages are stripped off).  So
16215                          * if not one of those, or if we accept only
16216                          * compile-time properties, is an error; otherwise add
16217                          * it to the list for run-time look up. */
16218                         if ((base_name = rninstr(name, name + n,
16219                                                  colon_colon, colon_colon + 2)))
16220                         { /* Has ::.  We know this must be a user-defined
16221                              property */
16222                             base_name += 2;
16223                             final_n -= base_name - name;
16224                             has_pkg = TRUE;
16225                         }
16226                         else {
16227                             base_name = name;
16228                             has_pkg = FALSE;
16229                         }
16230
16231                         if (   final_n < 3
16232                             || base_name[0] != 'I'
16233                             || (base_name[1] != 's' && base_name[1] != 'n')
16234                             || ret_invlist)
16235                         {
16236                             const char * const msg
16237                                 = (has_pkg)
16238                                   ? "Illegal user-defined property name"
16239                                   : "Can't find Unicode property definition";
16240                             RExC_parse = e + 1;
16241
16242                             /* diag_listed_as: Can't find Unicode property definition "%s" */
16243                             vFAIL3utf8f("%s \"%"UTF8f"\"",
16244                                 msg, UTF8fARG(UTF, n, name));
16245                         }
16246
16247                         /* If the property name doesn't already have a package
16248                          * name, add the current one to it so that it can be
16249                          * referred to outside it. [perl #121777] */
16250                         if (! has_pkg && curpkg) {
16251                             char* pkgname = HvNAME(curpkg);
16252                             if (strNE(pkgname, "main")) {
16253                                 char* full_name = Perl_form(aTHX_
16254                                                             "%s::%s",
16255                                                             pkgname,
16256                                                             name);
16257                                 n = strlen(full_name);
16258                                 name = savepvn(full_name, n);
16259                                 SAVEFREEPV(name);
16260                             }
16261                         }
16262                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n",
16263                                         (value == 'p' ? '+' : '!'),
16264                                         (FOLD) ? "__" : "",
16265                                         UTF8fARG(UTF, n, name),
16266                                         (FOLD) ? "_i" : "");
16267                         has_user_defined_property = TRUE;
16268                         optimizable = FALSE;    /* Will have to leave this an
16269                                                    ANYOF node */
16270
16271                         /* We don't know yet what this matches, so have to flag
16272                          * it */
16273                         ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
16274                     }
16275                     else {
16276
16277                         /* Here, did get the swash and its inversion list.  If
16278                          * the swash is from a user-defined property, then this
16279                          * whole character class should be regarded as such */
16280                         if (swash_init_flags
16281                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
16282                         {
16283                             has_user_defined_property = TRUE;
16284                         }
16285                         else if
16286                             /* We warn on matching an above-Unicode code point
16287                              * if the match would return true, except don't
16288                              * warn for \p{All}, which has exactly one element
16289                              * = 0 */
16290                             (_invlist_contains_cp(invlist, 0x110000)
16291                                 && (! (_invlist_len(invlist) == 1
16292                                        && *invlist_array(invlist) == 0)))
16293                         {
16294                             warn_super = TRUE;
16295                         }
16296
16297
16298                         /* Invert if asking for the complement */
16299                         if (value == 'P') {
16300                             _invlist_union_complement_2nd(properties,
16301                                                           invlist,
16302                                                           &properties);
16303
16304                             /* The swash can't be used as-is, because we've
16305                              * inverted things; delay removing it to here after
16306                              * have copied its invlist above */
16307                             SvREFCNT_dec_NN(swash);
16308                             swash = NULL;
16309                         }
16310                         else {
16311                             _invlist_union(properties, invlist, &properties);
16312                         }
16313                     }
16314                 }
16315                 RExC_parse = e + 1;
16316                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
16317                                                 named */
16318
16319                 /* \p means they want Unicode semantics */
16320                 REQUIRE_UNI_RULES(flagp, NULL);
16321                 }
16322                 break;
16323             case 'n':   value = '\n';                   break;
16324             case 'r':   value = '\r';                   break;
16325             case 't':   value = '\t';                   break;
16326             case 'f':   value = '\f';                   break;
16327             case 'b':   value = '\b';                   break;
16328             case 'e':   value = ESC_NATIVE;             break;
16329             case 'a':   value = '\a';                   break;
16330             case 'o':
16331                 RExC_parse--;   /* function expects to be pointed at the 'o' */
16332                 {
16333                     const char* error_msg;
16334                     bool valid = grok_bslash_o(&RExC_parse,
16335                                                &value,
16336                                                &error_msg,
16337                                                PASS2,   /* warnings only in
16338                                                            pass 2 */
16339                                                strict,
16340                                                silence_non_portable,
16341                                                UTF);
16342                     if (! valid) {
16343                         vFAIL(error_msg);
16344                     }
16345                 }
16346                 non_portable_endpoint++;
16347                 break;
16348             case 'x':
16349                 RExC_parse--;   /* function expects to be pointed at the 'x' */
16350                 {
16351                     const char* error_msg;
16352                     bool valid = grok_bslash_x(&RExC_parse,
16353                                                &value,
16354                                                &error_msg,
16355                                                PASS2, /* Output warnings */
16356                                                strict,
16357                                                silence_non_portable,
16358                                                UTF);
16359                     if (! valid) {
16360                         vFAIL(error_msg);
16361                     }
16362                 }
16363                 non_portable_endpoint++;
16364                 break;
16365             case 'c':
16366                 value = grok_bslash_c(*RExC_parse++, PASS2);
16367                 non_portable_endpoint++;
16368                 break;
16369             case '0': case '1': case '2': case '3': case '4':
16370             case '5': case '6': case '7':
16371                 {
16372                     /* Take 1-3 octal digits */
16373                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
16374                     numlen = (strict) ? 4 : 3;
16375                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
16376                     RExC_parse += numlen;
16377                     if (numlen != 3) {
16378                         if (strict) {
16379                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16380                             vFAIL("Need exactly 3 octal digits");
16381                         }
16382                         else if (! SIZE_ONLY /* like \08, \178 */
16383                                  && numlen < 3
16384                                  && RExC_parse < RExC_end
16385                                  && isDIGIT(*RExC_parse)
16386                                  && ckWARN(WARN_REGEXP))
16387                         {
16388                             SAVEFREESV(RExC_rx_sv);
16389                             reg_warn_non_literal_string(
16390                                  RExC_parse + 1,
16391                                  form_short_octal_warning(RExC_parse, numlen));
16392                             (void)ReREFCNT_inc(RExC_rx_sv);
16393                         }
16394                     }
16395                     non_portable_endpoint++;
16396                     break;
16397                 }
16398             default:
16399                 /* Allow \_ to not give an error */
16400                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
16401                     if (strict) {
16402                         vFAIL2("Unrecognized escape \\%c in character class",
16403                                (int)value);
16404                     }
16405                     else {
16406                         SAVEFREESV(RExC_rx_sv);
16407                         ckWARN2reg(RExC_parse,
16408                             "Unrecognized escape \\%c in character class passed through",
16409                             (int)value);
16410                         (void)ReREFCNT_inc(RExC_rx_sv);
16411                     }
16412                 }
16413                 break;
16414             }   /* End of switch on char following backslash */
16415         } /* end of handling backslash escape sequences */
16416
16417         /* Here, we have the current token in 'value' */
16418
16419         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
16420             U8 classnum;
16421
16422             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
16423              * literal, as is the character that began the false range, i.e.
16424              * the 'a' in the examples */
16425             if (range) {
16426                 if (!SIZE_ONLY) {
16427                     const int w = (RExC_parse >= rangebegin)
16428                                   ? RExC_parse - rangebegin
16429                                   : 0;
16430                     if (strict) {
16431                         vFAIL2utf8f(
16432                             "False [] range \"%"UTF8f"\"",
16433                             UTF8fARG(UTF, w, rangebegin));
16434                     }
16435                     else {
16436                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
16437                         ckWARN2reg(RExC_parse,
16438                             "False [] range \"%"UTF8f"\"",
16439                             UTF8fARG(UTF, w, rangebegin));
16440                         (void)ReREFCNT_inc(RExC_rx_sv);
16441                         cp_list = add_cp_to_invlist(cp_list, '-');
16442                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
16443                                                              prevvalue);
16444                     }
16445                 }
16446
16447                 range = 0; /* this was not a true range */
16448                 element_count += 2; /* So counts for three values */
16449             }
16450
16451             classnum = namedclass_to_classnum(namedclass);
16452
16453             if (LOC && namedclass < ANYOF_POSIXL_MAX
16454 #ifndef HAS_ISASCII
16455                 && classnum != _CC_ASCII
16456 #endif
16457             ) {
16458                 /* What the Posix classes (like \w, [:space:]) match in locale
16459                  * isn't knowable under locale until actual match time.  Room
16460                  * must be reserved (one time per outer bracketed class) to
16461                  * store such classes.  The space will contain a bit for each
16462                  * named class that is to be matched against.  This isn't
16463                  * needed for \p{} and pseudo-classes, as they are not affected
16464                  * by locale, and hence are dealt with separately */
16465                 if (! need_class) {
16466                     need_class = 1;
16467                     if (SIZE_ONLY) {
16468                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16469                     }
16470                     else {
16471                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16472                     }
16473                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
16474                     ANYOF_POSIXL_ZERO(ret);
16475
16476                     /* We can't change this into some other type of node
16477                      * (unless this is the only element, in which case there
16478                      * are nodes that mean exactly this) as has runtime
16479                      * dependencies */
16480                     optimizable = FALSE;
16481                 }
16482
16483                 /* Coverity thinks it is possible for this to be negative; both
16484                  * jhi and khw think it's not, but be safer */
16485                 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16486                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
16487
16488                 /* See if it already matches the complement of this POSIX
16489                  * class */
16490                 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16491                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
16492                                                             ? -1
16493                                                             : 1)))
16494                 {
16495                     posixl_matches_all = TRUE;
16496                     break;  /* No need to continue.  Since it matches both
16497                                e.g., \w and \W, it matches everything, and the
16498                                bracketed class can be optimized into qr/./s */
16499                 }
16500
16501                 /* Add this class to those that should be checked at runtime */
16502                 ANYOF_POSIXL_SET(ret, namedclass);
16503
16504                 /* The above-Latin1 characters are not subject to locale rules.
16505                  * Just add them, in the second pass, to the
16506                  * unconditionally-matched list */
16507                 if (! SIZE_ONLY) {
16508                     SV* scratch_list = NULL;
16509
16510                     /* Get the list of the above-Latin1 code points this
16511                      * matches */
16512                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
16513                                           PL_XPosix_ptrs[classnum],
16514
16515                                           /* Odd numbers are complements, like
16516                                            * NDIGIT, NASCII, ... */
16517                                           namedclass % 2 != 0,
16518                                           &scratch_list);
16519                     /* Checking if 'cp_list' is NULL first saves an extra
16520                      * clone.  Its reference count will be decremented at the
16521                      * next union, etc, or if this is the only instance, at the
16522                      * end of the routine */
16523                     if (! cp_list) {
16524                         cp_list = scratch_list;
16525                     }
16526                     else {
16527                         _invlist_union(cp_list, scratch_list, &cp_list);
16528                         SvREFCNT_dec_NN(scratch_list);
16529                     }
16530                     continue;   /* Go get next character */
16531                 }
16532             }
16533             else if (! SIZE_ONLY) {
16534
16535                 /* Here, not in pass1 (in that pass we skip calculating the
16536                  * contents of this class), and is not /l, or is a POSIX class
16537                  * for which /l doesn't matter (or is a Unicode property, which
16538                  * is skipped here). */
16539                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
16540                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
16541
16542                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
16543                          * nor /l make a difference in what these match,
16544                          * therefore we just add what they match to cp_list. */
16545                         if (classnum != _CC_VERTSPACE) {
16546                             assert(   namedclass == ANYOF_HORIZWS
16547                                    || namedclass == ANYOF_NHORIZWS);
16548
16549                             /* It turns out that \h is just a synonym for
16550                              * XPosixBlank */
16551                             classnum = _CC_BLANK;
16552                         }
16553
16554                         _invlist_union_maybe_complement_2nd(
16555                                 cp_list,
16556                                 PL_XPosix_ptrs[classnum],
16557                                 namedclass % 2 != 0,    /* Complement if odd
16558                                                           (NHORIZWS, NVERTWS)
16559                                                         */
16560                                 &cp_list);
16561                     }
16562                 }
16563                 else if (  UNI_SEMANTICS
16564                         || classnum == _CC_ASCII
16565                         || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
16566                                                   || classnum == _CC_XDIGIT)))
16567                 {
16568                     /* We usually have to worry about /d and /a affecting what
16569                      * POSIX classes match, with special code needed for /d
16570                      * because we won't know until runtime what all matches.
16571                      * But there is no extra work needed under /u, and
16572                      * [:ascii:] is unaffected by /a and /d; and :digit: and
16573                      * :xdigit: don't have runtime differences under /d.  So we
16574                      * can special case these, and avoid some extra work below,
16575                      * and at runtime. */
16576                     _invlist_union_maybe_complement_2nd(
16577                                                      simple_posixes,
16578                                                      PL_XPosix_ptrs[classnum],
16579                                                      namedclass % 2 != 0,
16580                                                      &simple_posixes);
16581                 }
16582                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
16583                            complement and use nposixes */
16584                     SV** posixes_ptr = namedclass % 2 == 0
16585                                        ? &posixes
16586                                        : &nposixes;
16587                     _invlist_union_maybe_complement_2nd(
16588                                                      *posixes_ptr,
16589                                                      PL_XPosix_ptrs[classnum],
16590                                                      namedclass % 2 != 0,
16591                                                      posixes_ptr);
16592                 }
16593             }
16594         } /* end of namedclass \blah */
16595
16596         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16597
16598         /* If 'range' is set, 'value' is the ending of a range--check its
16599          * validity.  (If value isn't a single code point in the case of a
16600          * range, we should have figured that out above in the code that
16601          * catches false ranges).  Later, we will handle each individual code
16602          * point in the range.  If 'range' isn't set, this could be the
16603          * beginning of a range, so check for that by looking ahead to see if
16604          * the next real character to be processed is the range indicator--the
16605          * minus sign */
16606
16607         if (range) {
16608 #ifdef EBCDIC
16609             /* For unicode ranges, we have to test that the Unicode as opposed
16610              * to the native values are not decreasing.  (Above 255, there is
16611              * no difference between native and Unicode) */
16612             if (unicode_range && prevvalue < 255 && value < 255) {
16613                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
16614                     goto backwards_range;
16615                 }
16616             }
16617             else
16618 #endif
16619             if (prevvalue > value) /* b-a */ {
16620                 int w;
16621 #ifdef EBCDIC
16622               backwards_range:
16623 #endif
16624                 w = RExC_parse - rangebegin;
16625                 vFAIL2utf8f(
16626                     "Invalid [] range \"%"UTF8f"\"",
16627                     UTF8fARG(UTF, w, rangebegin));
16628                 NOT_REACHED; /* NOTREACHED */
16629             }
16630         }
16631         else {
16632             prevvalue = value; /* save the beginning of the potential range */
16633             if (! stop_at_1     /* Can't be a range if parsing just one thing */
16634                 && *RExC_parse == '-')
16635             {
16636                 char* next_char_ptr = RExC_parse + 1;
16637
16638                 /* Get the next real char after the '-' */
16639                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
16640
16641                 /* If the '-' is at the end of the class (just before the ']',
16642                  * it is a literal minus; otherwise it is a range */
16643                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
16644                     RExC_parse = next_char_ptr;
16645
16646                     /* a bad range like \w-, [:word:]- ? */
16647                     if (namedclass > OOB_NAMEDCLASS) {
16648                         if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
16649                             const int w = RExC_parse >= rangebegin
16650                                           ?  RExC_parse - rangebegin
16651                                           : 0;
16652                             if (strict) {
16653                                 vFAIL4("False [] range \"%*.*s\"",
16654                                     w, w, rangebegin);
16655                             }
16656                             else if (PASS2) {
16657                                 vWARN4(RExC_parse,
16658                                     "False [] range \"%*.*s\"",
16659                                     w, w, rangebegin);
16660                             }
16661                         }
16662                         if (!SIZE_ONLY) {
16663                             cp_list = add_cp_to_invlist(cp_list, '-');
16664                         }
16665                         element_count++;
16666                     } else
16667                         range = 1;      /* yeah, it's a range! */
16668                     continue;   /* but do it the next time */
16669                 }
16670             }
16671         }
16672
16673         if (namedclass > OOB_NAMEDCLASS) {
16674             continue;
16675         }
16676
16677         /* Here, we have a single value this time through the loop, and
16678          * <prevvalue> is the beginning of the range, if any; or <value> if
16679          * not. */
16680
16681         /* non-Latin1 code point implies unicode semantics.  Must be set in
16682          * pass1 so is there for the whole of pass 2 */
16683         if (value > 255) {
16684             REQUIRE_UNI_RULES(flagp, NULL);
16685         }
16686
16687         /* Ready to process either the single value, or the completed range.
16688          * For single-valued non-inverted ranges, we consider the possibility
16689          * of multi-char folds.  (We made a conscious decision to not do this
16690          * for the other cases because it can often lead to non-intuitive
16691          * results.  For example, you have the peculiar case that:
16692          *  "s s" =~ /^[^\xDF]+$/i => Y
16693          *  "ss"  =~ /^[^\xDF]+$/i => N
16694          *
16695          * See [perl #89750] */
16696         if (FOLD && allow_multi_folds && value == prevvalue) {
16697             if (value == LATIN_SMALL_LETTER_SHARP_S
16698                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
16699                                                         value)))
16700             {
16701                 /* Here <value> is indeed a multi-char fold.  Get what it is */
16702
16703                 U8 foldbuf[UTF8_MAXBYTES_CASE];
16704                 STRLEN foldlen;
16705
16706                 UV folded = _to_uni_fold_flags(
16707                                 value,
16708                                 foldbuf,
16709                                 &foldlen,
16710                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
16711                                                    ? FOLD_FLAGS_NOMIX_ASCII
16712                                                    : 0)
16713                                 );
16714
16715                 /* Here, <folded> should be the first character of the
16716                  * multi-char fold of <value>, with <foldbuf> containing the
16717                  * whole thing.  But, if this fold is not allowed (because of
16718                  * the flags), <fold> will be the same as <value>, and should
16719                  * be processed like any other character, so skip the special
16720                  * handling */
16721                 if (folded != value) {
16722
16723                     /* Skip if we are recursed, currently parsing the class
16724                      * again.  Otherwise add this character to the list of
16725                      * multi-char folds. */
16726                     if (! RExC_in_multi_char_class) {
16727                         STRLEN cp_count = utf8_length(foldbuf,
16728                                                       foldbuf + foldlen);
16729                         SV* multi_fold = sv_2mortal(newSVpvs(""));
16730
16731                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
16732
16733                         multi_char_matches
16734                                         = add_multi_match(multi_char_matches,
16735                                                           multi_fold,
16736                                                           cp_count);
16737
16738                     }
16739
16740                     /* This element should not be processed further in this
16741                      * class */
16742                     element_count--;
16743                     value = save_value;
16744                     prevvalue = save_prevvalue;
16745                     continue;
16746                 }
16747             }
16748         }
16749
16750         if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
16751             if (range) {
16752
16753                 /* If the range starts above 255, everything is portable and
16754                  * likely to be so for any forseeable character set, so don't
16755                  * warn. */
16756                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
16757                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
16758                 }
16759                 else if (prevvalue != value) {
16760
16761                     /* Under strict, ranges that stop and/or end in an ASCII
16762                      * printable should have each end point be a portable value
16763                      * for it (preferably like 'A', but we don't warn if it is
16764                      * a (portable) Unicode name or code point), and the range
16765                      * must be be all digits or all letters of the same case.
16766                      * Otherwise, the range is non-portable and unclear as to
16767                      * what it contains */
16768                     if ((isPRINT_A(prevvalue) || isPRINT_A(value))
16769                         && (non_portable_endpoint
16770                             || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
16771                                    || (isLOWER_A(prevvalue) && isLOWER_A(value))
16772                                    || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
16773                     {
16774                         vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
16775                     }
16776                     else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
16777
16778                         /* But the nature of Unicode and languages mean we
16779                          * can't do the same checks for above-ASCII ranges,
16780                          * except in the case of digit ones.  These should
16781                          * contain only digits from the same group of 10.  The
16782                          * ASCII case is handled just above.  0x660 is the
16783                          * first digit character beyond ASCII.  Hence here, the
16784                          * range could be a range of digits.  Find out.  */
16785                         IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
16786                                                          prevvalue);
16787                         IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
16788                                                          value);
16789
16790                         /* If the range start and final points are in the same
16791                          * inversion list element, it means that either both
16792                          * are not digits, or both are digits in a consecutive
16793                          * sequence of digits.  (So far, Unicode has kept all
16794                          * such sequences as distinct groups of 10, but assert
16795                          * to make sure).  If the end points are not in the
16796                          * same element, neither should be a digit. */
16797                         if (index_start == index_final) {
16798                             assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
16799                             || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
16800                                - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
16801                                == 10)
16802                                /* But actually Unicode did have one group of 11
16803                                 * 'digits' in 5.2, so in case we are operating
16804                                 * on that version, let that pass */
16805                             || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
16806                                - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
16807                                 == 11
16808                                && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
16809                                 == 0x19D0)
16810                             );
16811                         }
16812                         else if ((index_start >= 0
16813                                   && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
16814                                  || (index_final >= 0
16815                                      && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
16816                         {
16817                             vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
16818                         }
16819                     }
16820                 }
16821             }
16822             if ((! range || prevvalue == value) && non_portable_endpoint) {
16823                 if (isPRINT_A(value)) {
16824                     char literal[3];
16825                     unsigned d = 0;
16826                     if (isBACKSLASHED_PUNCT(value)) {
16827                         literal[d++] = '\\';
16828                     }
16829                     literal[d++] = (char) value;
16830                     literal[d++] = '\0';
16831
16832                     vWARN4(RExC_parse,
16833                            "\"%.*s\" is more clearly written simply as \"%s\"",
16834                            (int) (RExC_parse - rangebegin),
16835                            rangebegin,
16836                            literal
16837                         );
16838                 }
16839                 else if isMNEMONIC_CNTRL(value) {
16840                     vWARN4(RExC_parse,
16841                            "\"%.*s\" is more clearly written simply as \"%s\"",
16842                            (int) (RExC_parse - rangebegin),
16843                            rangebegin,
16844                            cntrl_to_mnemonic((U8) value)
16845                         );
16846                 }
16847             }
16848         }
16849
16850         /* Deal with this element of the class */
16851         if (! SIZE_ONLY) {
16852
16853 #ifndef EBCDIC
16854             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16855                                                      prevvalue, value);
16856 #else
16857             /* On non-ASCII platforms, for ranges that span all of 0..255, and
16858              * ones that don't require special handling, we can just add the
16859              * range like we do for ASCII platforms */
16860             if ((UNLIKELY(prevvalue == 0) && value >= 255)
16861                 || ! (prevvalue < 256
16862                       && (unicode_range
16863                           || (! non_portable_endpoint
16864                               && ((isLOWER_A(prevvalue) && isLOWER_A(value))
16865                                   || (isUPPER_A(prevvalue)
16866                                       && isUPPER_A(value)))))))
16867             {
16868                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16869                                                          prevvalue, value);
16870             }
16871             else {
16872                 /* Here, requires special handling.  This can be because it is
16873                  * a range whose code points are considered to be Unicode, and
16874                  * so must be individually translated into native, or because
16875                  * its a subrange of 'A-Z' or 'a-z' which each aren't
16876                  * contiguous in EBCDIC, but we have defined them to include
16877                  * only the "expected" upper or lower case ASCII alphabetics.
16878                  * Subranges above 255 are the same in native and Unicode, so
16879                  * can be added as a range */
16880                 U8 start = NATIVE_TO_LATIN1(prevvalue);
16881                 unsigned j;
16882                 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
16883                 for (j = start; j <= end; j++) {
16884                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
16885                 }
16886                 if (value > 255) {
16887                     cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16888                                                              256, value);
16889                 }
16890             }
16891 #endif
16892         }
16893
16894         range = 0; /* this range (if it was one) is done now */
16895     } /* End of loop through all the text within the brackets */
16896
16897
16898     if (   posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
16899         output_or_return_posix_warnings(pRExC_state, posix_warnings,
16900                                         return_posix_warnings);
16901     }
16902
16903     /* If anything in the class expands to more than one character, we have to
16904      * deal with them by building up a substitute parse string, and recursively
16905      * calling reg() on it, instead of proceeding */
16906     if (multi_char_matches) {
16907         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
16908         I32 cp_count;
16909         STRLEN len;
16910         char *save_end = RExC_end;
16911         char *save_parse = RExC_parse;
16912         char *save_start = RExC_start;
16913         STRLEN prefix_end = 0;      /* We copy the character class after a
16914                                        prefix supplied here.  This is the size
16915                                        + 1 of that prefix */
16916         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
16917                                        a "|" */
16918         I32 reg_flags;
16919
16920         assert(! invert);
16921         assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
16922
16923 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
16924            because too confusing */
16925         if (invert) {
16926             sv_catpv(substitute_parse, "(?:");
16927         }
16928 #endif
16929
16930         /* Look at the longest folds first */
16931         for (cp_count = av_tindex_nomg(multi_char_matches);
16932                         cp_count > 0;
16933                         cp_count--)
16934         {
16935
16936             if (av_exists(multi_char_matches, cp_count)) {
16937                 AV** this_array_ptr;
16938                 SV* this_sequence;
16939
16940                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
16941                                                  cp_count, FALSE);
16942                 while ((this_sequence = av_pop(*this_array_ptr)) !=
16943                                                                 &PL_sv_undef)
16944                 {
16945                     if (! first_time) {
16946                         sv_catpv(substitute_parse, "|");
16947                     }
16948                     first_time = FALSE;
16949
16950                     sv_catpv(substitute_parse, SvPVX(this_sequence));
16951                 }
16952             }
16953         }
16954
16955         /* If the character class contains anything else besides these
16956          * multi-character folds, have to include it in recursive parsing */
16957         if (element_count) {
16958             sv_catpv(substitute_parse, "|[");
16959             prefix_end = SvCUR(substitute_parse);
16960             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
16961
16962             /* Put in a closing ']' only if not going off the end, as otherwise
16963              * we are adding something that really isn't there */
16964             if (RExC_parse < RExC_end) {
16965                 sv_catpv(substitute_parse, "]");
16966             }
16967         }
16968
16969         sv_catpv(substitute_parse, ")");
16970 #if 0
16971         if (invert) {
16972             /* This is a way to get the parse to skip forward a whole named
16973              * sequence instead of matching the 2nd character when it fails the
16974              * first */
16975             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
16976         }
16977 #endif
16978
16979         /* Set up the data structure so that any errors will be properly
16980          * reported.  See the comments at the definition of
16981          * REPORT_LOCATION_ARGS for details */
16982         RExC_precomp_adj = orig_parse - RExC_precomp;
16983         RExC_start =  RExC_parse = SvPV(substitute_parse, len);
16984         RExC_adjusted_start = RExC_start + prefix_end;
16985         RExC_end = RExC_parse + len;
16986         RExC_in_multi_char_class = 1;
16987         RExC_override_recoding = 1;
16988         RExC_emit = (regnode *)orig_emit;
16989
16990         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
16991
16992         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
16993
16994         /* And restore so can parse the rest of the pattern */
16995         RExC_parse = save_parse;
16996         RExC_start = RExC_adjusted_start = save_start;
16997         RExC_precomp_adj = 0;
16998         RExC_end = save_end;
16999         RExC_in_multi_char_class = 0;
17000         RExC_override_recoding = 0;
17001         SvREFCNT_dec_NN(multi_char_matches);
17002         return ret;
17003     }
17004
17005     /* Here, we've gone through the entire class and dealt with multi-char
17006      * folds.  We are now in a position that we can do some checks to see if we
17007      * can optimize this ANYOF node into a simpler one, even in Pass 1.
17008      * Currently we only do two checks:
17009      * 1) is in the unlikely event that the user has specified both, eg. \w and
17010      *    \W under /l, then the class matches everything.  (This optimization
17011      *    is done only to make the optimizer code run later work.)
17012      * 2) if the character class contains only a single element (including a
17013      *    single range), we see if there is an equivalent node for it.
17014      * Other checks are possible */
17015     if (   optimizable
17016         && ! ret_invlist   /* Can't optimize if returning the constructed
17017                               inversion list */
17018         && (UNLIKELY(posixl_matches_all) || element_count == 1))
17019     {
17020         U8 op = END;
17021         U8 arg = 0;
17022
17023         if (UNLIKELY(posixl_matches_all)) {
17024             op = SANY;
17025         }
17026         else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named
17027                                                    class, like \w or [:digit:]
17028                                                    or \p{foo} */
17029
17030             /* All named classes are mapped into POSIXish nodes, with its FLAG
17031              * argument giving which class it is */
17032             switch ((I32)namedclass) {
17033                 case ANYOF_UNIPROP:
17034                     break;
17035
17036                 /* These don't depend on the charset modifiers.  They always
17037                  * match under /u rules */
17038                 case ANYOF_NHORIZWS:
17039                 case ANYOF_HORIZWS:
17040                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
17041                     /* FALLTHROUGH */
17042
17043                 case ANYOF_NVERTWS:
17044                 case ANYOF_VERTWS:
17045                     op = POSIXU;
17046                     goto join_posix;
17047
17048                 /* The actual POSIXish node for all the rest depends on the
17049                  * charset modifier.  The ones in the first set depend only on
17050                  * ASCII or, if available on this platform, also locale */
17051                 case ANYOF_ASCII:
17052                 case ANYOF_NASCII:
17053 #ifdef HAS_ISASCII
17054                     op = (LOC) ? POSIXL : POSIXA;
17055 #else
17056                     op = POSIXA;
17057 #endif
17058                     goto join_posix;
17059
17060                 /* The following don't have any matches in the upper Latin1
17061                  * range, hence /d is equivalent to /u for them.  Making it /u
17062                  * saves some branches at runtime */
17063                 case ANYOF_DIGIT:
17064                 case ANYOF_NDIGIT:
17065                 case ANYOF_XDIGIT:
17066                 case ANYOF_NXDIGIT:
17067                     if (! DEPENDS_SEMANTICS) {
17068                         goto treat_as_default;
17069                     }
17070
17071                     op = POSIXU;
17072                     goto join_posix;
17073
17074                 /* The following change to CASED under /i */
17075                 case ANYOF_LOWER:
17076                 case ANYOF_NLOWER:
17077                 case ANYOF_UPPER:
17078                 case ANYOF_NUPPER:
17079                     if (FOLD) {
17080                         namedclass = ANYOF_CASED + (namedclass % 2);
17081                     }
17082                     /* FALLTHROUGH */
17083
17084                 /* The rest have more possibilities depending on the charset.
17085                  * We take advantage of the enum ordering of the charset
17086                  * modifiers to get the exact node type, */
17087                 default:
17088                   treat_as_default:
17089                     op = POSIXD + get_regex_charset(RExC_flags);
17090                     if (op > POSIXA) { /* /aa is same as /a */
17091                         op = POSIXA;
17092                     }
17093
17094                   join_posix:
17095                     /* The odd numbered ones are the complements of the
17096                      * next-lower even number one */
17097                     if (namedclass % 2 == 1) {
17098                         invert = ! invert;
17099                         namedclass--;
17100                     }
17101                     arg = namedclass_to_classnum(namedclass);
17102                     break;
17103             }
17104         }
17105         else if (value == prevvalue) {
17106
17107             /* Here, the class consists of just a single code point */
17108
17109             if (invert) {
17110                 if (! LOC && value == '\n') {
17111                     op = REG_ANY; /* Optimize [^\n] */
17112                     *flagp |= HASWIDTH|SIMPLE;
17113                     MARK_NAUGHTY(1);
17114                 }
17115             }
17116             else if (value < 256 || UTF) {
17117
17118                 /* Optimize a single value into an EXACTish node, but not if it
17119                  * would require converting the pattern to UTF-8. */
17120                 op = compute_EXACTish(pRExC_state);
17121             }
17122         } /* Otherwise is a range */
17123         else if (! LOC) {   /* locale could vary these */
17124             if (prevvalue == '0') {
17125                 if (value == '9') {
17126                     arg = _CC_DIGIT;
17127                     op = POSIXA;
17128                 }
17129             }
17130             else if (! FOLD || ASCII_FOLD_RESTRICTED) {
17131                 /* We can optimize A-Z or a-z, but not if they could match
17132                  * something like the KELVIN SIGN under /i. */
17133                 if (prevvalue == 'A') {
17134                     if (value == 'Z'
17135 #ifdef EBCDIC
17136                         && ! non_portable_endpoint
17137 #endif
17138                     ) {
17139                         arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
17140                         op = POSIXA;
17141                     }
17142                 }
17143                 else if (prevvalue == 'a') {
17144                     if (value == 'z'
17145 #ifdef EBCDIC
17146                         && ! non_portable_endpoint
17147 #endif
17148                     ) {
17149                         arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
17150                         op = POSIXA;
17151                     }
17152                 }
17153             }
17154         }
17155
17156         /* Here, we have changed <op> away from its initial value iff we found
17157          * an optimization */
17158         if (op != END) {
17159
17160             /* Throw away this ANYOF regnode, and emit the calculated one,
17161              * which should correspond to the beginning, not current, state of
17162              * the parse */
17163             const char * cur_parse = RExC_parse;
17164             RExC_parse = (char *)orig_parse;
17165             if ( SIZE_ONLY) {
17166                 if (! LOC) {
17167
17168                     /* To get locale nodes to not use the full ANYOF size would
17169                      * require moving the code above that writes the portions
17170                      * of it that aren't in other nodes to after this point.
17171                      * e.g.  ANYOF_POSIXL_SET */
17172                     RExC_size = orig_size;
17173                 }
17174             }
17175             else {
17176                 RExC_emit = (regnode *)orig_emit;
17177                 if (PL_regkind[op] == POSIXD) {
17178                     if (op == POSIXL) {
17179                         RExC_contains_locale = 1;
17180                     }
17181                     if (invert) {
17182                         op += NPOSIXD - POSIXD;
17183                     }
17184                 }
17185             }
17186
17187             ret = reg_node(pRExC_state, op);
17188
17189             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
17190                 if (! SIZE_ONLY) {
17191                     FLAGS(ret) = arg;
17192                 }
17193                 *flagp |= HASWIDTH|SIMPLE;
17194             }
17195             else if (PL_regkind[op] == EXACT) {
17196                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
17197                                            TRUE /* downgradable to EXACT */
17198                                            );
17199             }
17200
17201             RExC_parse = (char *) cur_parse;
17202
17203             SvREFCNT_dec(posixes);
17204             SvREFCNT_dec(nposixes);
17205             SvREFCNT_dec(simple_posixes);
17206             SvREFCNT_dec(cp_list);
17207             SvREFCNT_dec(cp_foldable_list);
17208             return ret;
17209         }
17210     }
17211
17212     if (SIZE_ONLY)
17213         return ret;
17214     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
17215
17216     /* If folding, we calculate all characters that could fold to or from the
17217      * ones already on the list */
17218     if (cp_foldable_list) {
17219         if (FOLD) {
17220             UV start, end;      /* End points of code point ranges */
17221
17222             SV* fold_intersection = NULL;
17223             SV** use_list;
17224
17225             /* Our calculated list will be for Unicode rules.  For locale
17226              * matching, we have to keep a separate list that is consulted at
17227              * runtime only when the locale indicates Unicode rules.  For
17228              * non-locale, we just use the general list */
17229             if (LOC) {
17230                 use_list = &only_utf8_locale_list;
17231             }
17232             else {
17233                 use_list = &cp_list;
17234             }
17235
17236             /* Only the characters in this class that participate in folds need
17237              * be checked.  Get the intersection of this class and all the
17238              * possible characters that are foldable.  This can quickly narrow
17239              * down a large class */
17240             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
17241                                   &fold_intersection);
17242
17243             /* The folds for all the Latin1 characters are hard-coded into this
17244              * program, but we have to go out to disk to get the others. */
17245             if (invlist_highest(cp_foldable_list) >= 256) {
17246
17247                 /* This is a hash that for a particular fold gives all
17248                  * characters that are involved in it */
17249                 if (! PL_utf8_foldclosures) {
17250                     _load_PL_utf8_foldclosures();
17251                 }
17252             }
17253
17254             /* Now look at the foldable characters in this class individually */
17255             invlist_iterinit(fold_intersection);
17256             while (invlist_iternext(fold_intersection, &start, &end)) {
17257                 UV j;
17258
17259                 /* Look at every character in the range */
17260                 for (j = start; j <= end; j++) {
17261                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17262                     STRLEN foldlen;
17263                     SV** listp;
17264
17265                     if (j < 256) {
17266
17267                         if (IS_IN_SOME_FOLD_L1(j)) {
17268
17269                             /* ASCII is always matched; non-ASCII is matched
17270                              * only under Unicode rules (which could happen
17271                              * under /l if the locale is a UTF-8 one */
17272                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
17273                                 *use_list = add_cp_to_invlist(*use_list,
17274                                                             PL_fold_latin1[j]);
17275                             }
17276                             else {
17277                                 has_upper_latin1_only_utf8_matches
17278                                     = add_cp_to_invlist(
17279                                             has_upper_latin1_only_utf8_matches,
17280                                             PL_fold_latin1[j]);
17281                             }
17282                         }
17283
17284                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
17285                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
17286                         {
17287                             add_above_Latin1_folds(pRExC_state,
17288                                                    (U8) j,
17289                                                    use_list);
17290                         }
17291                         continue;
17292                     }
17293
17294                     /* Here is an above Latin1 character.  We don't have the
17295                      * rules hard-coded for it.  First, get its fold.  This is
17296                      * the simple fold, as the multi-character folds have been
17297                      * handled earlier and separated out */
17298                     _to_uni_fold_flags(j, foldbuf, &foldlen,
17299                                                         (ASCII_FOLD_RESTRICTED)
17300                                                         ? FOLD_FLAGS_NOMIX_ASCII
17301                                                         : 0);
17302
17303                     /* Single character fold of above Latin1.  Add everything in
17304                     * its fold closure to the list that this node should match.
17305                     * The fold closures data structure is a hash with the keys
17306                     * being the UTF-8 of every character that is folded to, like
17307                     * 'k', and the values each an array of all code points that
17308                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
17309                     * Multi-character folds are not included */
17310                     if ((listp = hv_fetch(PL_utf8_foldclosures,
17311                                         (char *) foldbuf, foldlen, FALSE)))
17312                     {
17313                         AV* list = (AV*) *listp;
17314                         IV k;
17315                         for (k = 0; k <= av_tindex_nomg(list); k++) {
17316                             SV** c_p = av_fetch(list, k, FALSE);
17317                             UV c;
17318                             assert(c_p);
17319
17320                             c = SvUV(*c_p);
17321
17322                             /* /aa doesn't allow folds between ASCII and non- */
17323                             if ((ASCII_FOLD_RESTRICTED
17324                                 && (isASCII(c) != isASCII(j))))
17325                             {
17326                                 continue;
17327                             }
17328
17329                             /* Folds under /l which cross the 255/256 boundary
17330                              * are added to a separate list.  (These are valid
17331                              * only when the locale is UTF-8.) */
17332                             if (c < 256 && LOC) {
17333                                 *use_list = add_cp_to_invlist(*use_list, c);
17334                                 continue;
17335                             }
17336
17337                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
17338                             {
17339                                 cp_list = add_cp_to_invlist(cp_list, c);
17340                             }
17341                             else {
17342                                 /* Similarly folds involving non-ascii Latin1
17343                                 * characters under /d are added to their list */
17344                                 has_upper_latin1_only_utf8_matches
17345                                         = add_cp_to_invlist(
17346                                            has_upper_latin1_only_utf8_matches,
17347                                            c);
17348                             }
17349                         }
17350                     }
17351                 }
17352             }
17353             SvREFCNT_dec_NN(fold_intersection);
17354         }
17355
17356         /* Now that we have finished adding all the folds, there is no reason
17357          * to keep the foldable list separate */
17358         _invlist_union(cp_list, cp_foldable_list, &cp_list);
17359         SvREFCNT_dec_NN(cp_foldable_list);
17360     }
17361
17362     /* And combine the result (if any) with any inversion lists from posix
17363      * classes.  The lists are kept separate up to now because we don't want to
17364      * fold the classes (folding of those is automatically handled by the swash
17365      * fetching code) */
17366     if (simple_posixes) {   /* These are the classes known to be unaffected by
17367                                /a, /aa, and /d */
17368         if (cp_list) {
17369             _invlist_union(cp_list, simple_posixes, &cp_list);
17370             SvREFCNT_dec_NN(simple_posixes);
17371         }
17372         else {
17373             cp_list = simple_posixes;
17374         }
17375     }
17376     if (posixes || nposixes) {
17377
17378         /* We have to adjust /a and /aa */
17379         if (AT_LEAST_ASCII_RESTRICTED) {
17380
17381             /* Under /a and /aa, nothing above ASCII matches these */
17382             if (posixes) {
17383                 _invlist_intersection(posixes,
17384                                     PL_XPosix_ptrs[_CC_ASCII],
17385                                     &posixes);
17386             }
17387
17388             /* Under /a and /aa, everything above ASCII matches these
17389              * complements */
17390             if (nposixes) {
17391                 _invlist_union_complement_2nd(nposixes,
17392                                               PL_XPosix_ptrs[_CC_ASCII],
17393                                               &nposixes);
17394             }
17395         }
17396
17397         if (! DEPENDS_SEMANTICS) {
17398
17399             /* For everything but /d, we can just add the current 'posixes' and
17400              * 'nposixes' to the main list */
17401             if (posixes) {
17402                 if (cp_list) {
17403                     _invlist_union(cp_list, posixes, &cp_list);
17404                     SvREFCNT_dec_NN(posixes);
17405                 }
17406                 else {
17407                     cp_list = posixes;
17408                 }
17409             }
17410             if (nposixes) {
17411                 if (cp_list) {
17412                     _invlist_union(cp_list, nposixes, &cp_list);
17413                     SvREFCNT_dec_NN(nposixes);
17414                 }
17415                 else {
17416                     cp_list = nposixes;
17417                 }
17418             }
17419         }
17420         else {
17421             /* Under /d, things like \w match upper Latin1 characters only if
17422              * the target string is in UTF-8.  But things like \W match all the
17423              * upper Latin1 characters if the target string is not in UTF-8.
17424              *
17425              * Handle the case where there something like \W separately */
17426             if (nposixes) {
17427                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
17428
17429                 /* A complemented posix class matches all upper Latin1
17430                  * characters if not in UTF-8.  And it matches just certain
17431                  * ones when in UTF-8.  That means those certain ones are
17432                  * matched regardless, so can just be added to the
17433                  * unconditional list */
17434                 if (cp_list) {
17435                     _invlist_union(cp_list, nposixes, &cp_list);
17436                     SvREFCNT_dec_NN(nposixes);
17437                     nposixes = NULL;
17438                 }
17439                 else {
17440                     cp_list = nposixes;
17441                 }
17442
17443                 /* Likewise for 'posixes' */
17444                 _invlist_union(posixes, cp_list, &cp_list);
17445
17446                 /* Likewise for anything else in the range that matched only
17447                  * under UTF-8 */
17448                 if (has_upper_latin1_only_utf8_matches) {
17449                     _invlist_union(cp_list,
17450                                    has_upper_latin1_only_utf8_matches,
17451                                    &cp_list);
17452                     SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17453                     has_upper_latin1_only_utf8_matches = NULL;
17454                 }
17455
17456                 /* If we don't match all the upper Latin1 characters regardless
17457                  * of UTF-8ness, we have to set a flag to match the rest when
17458                  * not in UTF-8 */
17459                 _invlist_subtract(only_non_utf8_list, cp_list,
17460                                   &only_non_utf8_list);
17461                 if (_invlist_len(only_non_utf8_list) != 0) {
17462                     ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17463                 }
17464             }
17465             else {
17466                 /* Here there were no complemented posix classes.  That means
17467                  * the upper Latin1 characters in 'posixes' match only when the
17468                  * target string is in UTF-8.  So we have to add them to the
17469                  * list of those types of code points, while adding the
17470                  * remainder to the unconditional list.
17471                  *
17472                  * First calculate what they are */
17473                 SV* nonascii_but_latin1_properties = NULL;
17474                 _invlist_intersection(posixes, PL_UpperLatin1,
17475                                       &nonascii_but_latin1_properties);
17476
17477                 /* And add them to the final list of such characters. */
17478                 _invlist_union(has_upper_latin1_only_utf8_matches,
17479                                nonascii_but_latin1_properties,
17480                                &has_upper_latin1_only_utf8_matches);
17481
17482                 /* Remove them from what now becomes the unconditional list */
17483                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
17484                                   &posixes);
17485
17486                 /* And add those unconditional ones to the final list */
17487                 if (cp_list) {
17488                     _invlist_union(cp_list, posixes, &cp_list);
17489                     SvREFCNT_dec_NN(posixes);
17490                     posixes = NULL;
17491                 }
17492                 else {
17493                     cp_list = posixes;
17494                 }
17495
17496                 SvREFCNT_dec(nonascii_but_latin1_properties);
17497
17498                 /* Get rid of any characters that we now know are matched
17499                  * unconditionally from the conditional list, which may make
17500                  * that list empty */
17501                 _invlist_subtract(has_upper_latin1_only_utf8_matches,
17502                                   cp_list,
17503                                   &has_upper_latin1_only_utf8_matches);
17504                 if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
17505                     SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17506                     has_upper_latin1_only_utf8_matches = NULL;
17507                 }
17508             }
17509         }
17510     }
17511
17512     /* And combine the result (if any) with any inversion list from properties.
17513      * The lists are kept separate up to now so that we can distinguish the two
17514      * in regards to matching above-Unicode.  A run-time warning is generated
17515      * if a Unicode property is matched against a non-Unicode code point. But,
17516      * we allow user-defined properties to match anything, without any warning,
17517      * and we also suppress the warning if there is a portion of the character
17518      * class that isn't a Unicode property, and which matches above Unicode, \W
17519      * or [\x{110000}] for example.
17520      * (Note that in this case, unlike the Posix one above, there is no
17521      * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
17522      * forces Unicode semantics */
17523     if (properties) {
17524         if (cp_list) {
17525
17526             /* If it matters to the final outcome, see if a non-property
17527              * component of the class matches above Unicode.  If so, the
17528              * warning gets suppressed.  This is true even if just a single
17529              * such code point is specified, as, though not strictly correct if
17530              * another such code point is matched against, the fact that they
17531              * are using above-Unicode code points indicates they should know
17532              * the issues involved */
17533             if (warn_super) {
17534                 warn_super = ! (invert
17535                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
17536             }
17537
17538             _invlist_union(properties, cp_list, &cp_list);
17539             SvREFCNT_dec_NN(properties);
17540         }
17541         else {
17542             cp_list = properties;
17543         }
17544
17545         if (warn_super) {
17546             ANYOF_FLAGS(ret)
17547              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17548
17549             /* Because an ANYOF node is the only one that warns, this node
17550              * can't be optimized into something else */
17551             optimizable = FALSE;
17552         }
17553     }
17554
17555     /* Here, we have calculated what code points should be in the character
17556      * class.
17557      *
17558      * Now we can see about various optimizations.  Fold calculation (which we
17559      * did above) needs to take place before inversion.  Otherwise /[^k]/i
17560      * would invert to include K, which under /i would match k, which it
17561      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
17562      * folded until runtime */
17563
17564     /* If we didn't do folding, it's because some information isn't available
17565      * until runtime; set the run-time fold flag for these.  (We don't have to
17566      * worry about properties folding, as that is taken care of by the swash
17567      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
17568      * locales, or the class matches at least one 0-255 range code point */
17569     if (LOC && FOLD) {
17570
17571         /* Some things on the list might be unconditionally included because of
17572          * other components.  Remove them, and clean up the list if it goes to
17573          * 0 elements */
17574         if (only_utf8_locale_list && cp_list) {
17575             _invlist_subtract(only_utf8_locale_list, cp_list,
17576                               &only_utf8_locale_list);
17577
17578             if (_invlist_len(only_utf8_locale_list) == 0) {
17579                 SvREFCNT_dec_NN(only_utf8_locale_list);
17580                 only_utf8_locale_list = NULL;
17581             }
17582         }
17583         if (only_utf8_locale_list) {
17584             ANYOF_FLAGS(ret)
17585                  |=  ANYOFL_FOLD
17586                     |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
17587         }
17588         else if (cp_list) { /* Look to see if a 0-255 code point is in list */
17589             UV start, end;
17590             invlist_iterinit(cp_list);
17591             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
17592                 ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
17593             }
17594             invlist_iterfinish(cp_list);
17595         }
17596     }
17597     else if (   DEPENDS_SEMANTICS
17598              && (    has_upper_latin1_only_utf8_matches
17599                  || (ANYOF_FLAGS(ret) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
17600     {
17601         OP(ret) = ANYOFD;
17602         optimizable = FALSE;
17603     }
17604
17605
17606     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
17607      * at compile time.  Besides not inverting folded locale now, we can't
17608      * invert if there are things such as \w, which aren't known until runtime
17609      * */
17610     if (cp_list
17611         && invert
17612         && OP(ret) != ANYOFD
17613         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
17614         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
17615     {
17616         _invlist_invert(cp_list);
17617
17618         /* Any swash can't be used as-is, because we've inverted things */
17619         if (swash) {
17620             SvREFCNT_dec_NN(swash);
17621             swash = NULL;
17622         }
17623
17624         /* Clear the invert flag since have just done it here */
17625         invert = FALSE;
17626     }
17627
17628     if (ret_invlist) {
17629         assert(cp_list);
17630
17631         *ret_invlist = cp_list;
17632         SvREFCNT_dec(swash);
17633
17634         /* Discard the generated node */
17635         if (SIZE_ONLY) {
17636             RExC_size = orig_size;
17637         }
17638         else {
17639             RExC_emit = orig_emit;
17640         }
17641         return orig_emit;
17642     }
17643
17644     /* Some character classes are equivalent to other nodes.  Such nodes take
17645      * up less room and generally fewer operations to execute than ANYOF nodes.
17646      * Above, we checked for and optimized into some such equivalents for
17647      * certain common classes that are easy to test.  Getting to this point in
17648      * the code means that the class didn't get optimized there.  Since this
17649      * code is only executed in Pass 2, it is too late to save space--it has
17650      * been allocated in Pass 1, and currently isn't given back.  But turning
17651      * things into an EXACTish node can allow the optimizer to join it to any
17652      * adjacent such nodes.  And if the class is equivalent to things like /./,
17653      * expensive run-time swashes can be avoided.  Now that we have more
17654      * complete information, we can find things necessarily missed by the
17655      * earlier code.  Another possible "optimization" that isn't done is that
17656      * something like [Ee] could be changed into an EXACTFU.  khw tried this
17657      * and found that the ANYOF is faster, including for code points not in the
17658      * bitmap.  This still might make sense to do, provided it got joined with
17659      * an adjacent node(s) to create a longer EXACTFU one.  This could be
17660      * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
17661      * routine would know is joinable.  If that didn't happen, the node type
17662      * could then be made a straight ANYOF */
17663
17664     if (optimizable && cp_list && ! invert) {
17665         UV start, end;
17666         U8 op = END;  /* The optimzation node-type */
17667         int posix_class = -1;   /* Illegal value */
17668         const char * cur_parse= RExC_parse;
17669
17670         invlist_iterinit(cp_list);
17671         if (! invlist_iternext(cp_list, &start, &end)) {
17672
17673             /* Here, the list is empty.  This happens, for example, when a
17674              * Unicode property that doesn't match anything is the only element
17675              * in the character class (perluniprops.pod notes such properties).
17676              * */
17677             op = OPFAIL;
17678             *flagp |= HASWIDTH|SIMPLE;
17679         }
17680         else if (start == end) {    /* The range is a single code point */
17681             if (! invlist_iternext(cp_list, &start, &end)
17682
17683                     /* Don't do this optimization if it would require changing
17684                      * the pattern to UTF-8 */
17685                 && (start < 256 || UTF))
17686             {
17687                 /* Here, the list contains a single code point.  Can optimize
17688                  * into an EXACTish node */
17689
17690                 value = start;
17691
17692                 if (! FOLD) {
17693                     op = (LOC)
17694                          ? EXACTL
17695                          : EXACT;
17696                 }
17697                 else if (LOC) {
17698
17699                     /* A locale node under folding with one code point can be
17700                      * an EXACTFL, as its fold won't be calculated until
17701                      * runtime */
17702                     op = EXACTFL;
17703                 }
17704                 else {
17705
17706                     /* Here, we are generally folding, but there is only one
17707                      * code point to match.  If we have to, we use an EXACT
17708                      * node, but it would be better for joining with adjacent
17709                      * nodes in the optimization pass if we used the same
17710                      * EXACTFish node that any such are likely to be.  We can
17711                      * do this iff the code point doesn't participate in any
17712                      * folds.  For example, an EXACTF of a colon is the same as
17713                      * an EXACT one, since nothing folds to or from a colon. */
17714                     if (value < 256) {
17715                         if (IS_IN_SOME_FOLD_L1(value)) {
17716                             op = EXACT;
17717                         }
17718                     }
17719                     else {
17720                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
17721                             op = EXACT;
17722                         }
17723                     }
17724
17725                     /* If we haven't found the node type, above, it means we
17726                      * can use the prevailing one */
17727                     if (op == END) {
17728                         op = compute_EXACTish(pRExC_state);
17729                     }
17730                 }
17731             }
17732         }   /* End of first range contains just a single code point */
17733         else if (start == 0) {
17734             if (end == UV_MAX) {
17735                 op = SANY;
17736                 *flagp |= HASWIDTH|SIMPLE;
17737                 MARK_NAUGHTY(1);
17738             }
17739             else if (end == '\n' - 1
17740                     && invlist_iternext(cp_list, &start, &end)
17741                     && start == '\n' + 1 && end == UV_MAX)
17742             {
17743                 op = REG_ANY;
17744                 *flagp |= HASWIDTH|SIMPLE;
17745                 MARK_NAUGHTY(1);
17746             }
17747         }
17748         invlist_iterfinish(cp_list);
17749
17750         if (op == END) {
17751             const UV cp_list_len = _invlist_len(cp_list);
17752             const UV* cp_list_array = invlist_array(cp_list);
17753
17754             /* Here, didn't find an optimization.  See if this matches any of
17755              * the POSIX classes.  These run slightly faster for above-Unicode
17756              * code points, so don't bother with POSIXA ones nor the 2 that
17757              * have no above-Unicode matches.  We can avoid these checks unless
17758              * the ANYOF matches at least as high as the lowest POSIX one
17759              * (which was manually found to be \v.  The actual code point may
17760              * increase in later Unicode releases, if a higher code point is
17761              * assigned to be \v, but this code will never break.  It would
17762              * just mean we could execute the checks for posix optimizations
17763              * unnecessarily) */
17764
17765             if (cp_list_array[cp_list_len-1] > 0x2029) {
17766                 for (posix_class = 0;
17767                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
17768                      posix_class++)
17769                 {
17770                     int try_inverted;
17771                     if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
17772                         continue;
17773                     }
17774                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
17775
17776                         /* Check if matches normal or inverted */
17777                         if (_invlistEQ(cp_list,
17778                                        PL_XPosix_ptrs[posix_class],
17779                                        try_inverted))
17780                         {
17781                             op = (try_inverted)
17782                                  ? NPOSIXU
17783                                  : POSIXU;
17784                             *flagp |= HASWIDTH|SIMPLE;
17785                             goto found_posix;
17786                         }
17787                     }
17788                 }
17789               found_posix: ;
17790             }
17791         }
17792
17793         if (op != END) {
17794             RExC_parse = (char *)orig_parse;
17795             RExC_emit = (regnode *)orig_emit;
17796
17797             if (regarglen[op]) {
17798                 ret = reganode(pRExC_state, op, 0);
17799             } else {
17800                 ret = reg_node(pRExC_state, op);
17801             }
17802
17803             RExC_parse = (char *)cur_parse;
17804
17805             if (PL_regkind[op] == EXACT) {
17806                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
17807                                            TRUE /* downgradable to EXACT */
17808                                           );
17809             }
17810             else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
17811                 FLAGS(ret) = posix_class;
17812             }
17813
17814             SvREFCNT_dec_NN(cp_list);
17815             return ret;
17816         }
17817     }
17818
17819     /* Here, <cp_list> contains all the code points we can determine at
17820      * compile time that match under all conditions.  Go through it, and
17821      * for things that belong in the bitmap, put them there, and delete from
17822      * <cp_list>.  While we are at it, see if everything above 255 is in the
17823      * list, and if so, set a flag to speed up execution */
17824
17825     populate_ANYOF_from_invlist(ret, &cp_list);
17826
17827     if (invert) {
17828         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
17829     }
17830
17831     /* Here, the bitmap has been populated with all the Latin1 code points that
17832      * always match.  Can now add to the overall list those that match only
17833      * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
17834      * */
17835     if (has_upper_latin1_only_utf8_matches) {
17836         if (cp_list) {
17837             _invlist_union(cp_list,
17838                            has_upper_latin1_only_utf8_matches,
17839                            &cp_list);
17840             SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17841         }
17842         else {
17843             cp_list = has_upper_latin1_only_utf8_matches;
17844         }
17845         ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17846     }
17847
17848     /* If there is a swash and more than one element, we can't use the swash in
17849      * the optimization below. */
17850     if (swash && element_count > 1) {
17851         SvREFCNT_dec_NN(swash);
17852         swash = NULL;
17853     }
17854
17855     /* Note that the optimization of using 'swash' if it is the only thing in
17856      * the class doesn't have us change swash at all, so it can include things
17857      * that are also in the bitmap; otherwise we have purposely deleted that
17858      * duplicate information */
17859     set_ANYOF_arg(pRExC_state, ret, cp_list,
17860                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
17861                    ? listsv : NULL,
17862                   only_utf8_locale_list,
17863                   swash, has_user_defined_property);
17864
17865     *flagp |= HASWIDTH|SIMPLE;
17866
17867     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
17868         RExC_contains_locale = 1;
17869     }
17870
17871     return ret;
17872 }
17873
17874 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
17875
17876 STATIC void
17877 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
17878                 regnode* const node,
17879                 SV* const cp_list,
17880                 SV* const runtime_defns,
17881                 SV* const only_utf8_locale_list,
17882                 SV* const swash,
17883                 const bool has_user_defined_property)
17884 {
17885     /* Sets the arg field of an ANYOF-type node 'node', using information about
17886      * the node passed-in.  If there is nothing outside the node's bitmap, the
17887      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
17888      * the count returned by add_data(), having allocated and stored an array,
17889      * av, that that count references, as follows:
17890      *  av[0] stores the character class description in its textual form.
17891      *        This is used later (regexec.c:Perl_regclass_swash()) to
17892      *        initialize the appropriate swash, and is also useful for dumping
17893      *        the regnode.  This is set to &PL_sv_undef if the textual
17894      *        description is not needed at run-time (as happens if the other
17895      *        elements completely define the class)
17896      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
17897      *        computed from av[0].  But if no further computation need be done,
17898      *        the swash is stored here now (and av[0] is &PL_sv_undef).
17899      *  av[2] stores the inversion list of code points that match only if the
17900      *        current locale is UTF-8
17901      *  av[3] stores the cp_list inversion list for use in addition or instead
17902      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
17903      *        (Otherwise everything needed is already in av[0] and av[1])
17904      *  av[4] is set if any component of the class is from a user-defined
17905      *        property; used only if av[3] exists */
17906
17907     UV n;
17908
17909     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
17910
17911     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
17912         assert(! (ANYOF_FLAGS(node)
17913                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
17914         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
17915     }
17916     else {
17917         AV * const av = newAV();
17918         SV *rv;
17919
17920         av_store(av, 0, (runtime_defns)
17921                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
17922         if (swash) {
17923             assert(cp_list);
17924             av_store(av, 1, swash);
17925             SvREFCNT_dec_NN(cp_list);
17926         }
17927         else {
17928             av_store(av, 1, &PL_sv_undef);
17929             if (cp_list) {
17930                 av_store(av, 3, cp_list);
17931                 av_store(av, 4, newSVuv(has_user_defined_property));
17932             }
17933         }
17934
17935         if (only_utf8_locale_list) {
17936             av_store(av, 2, only_utf8_locale_list);
17937         }
17938         else {
17939             av_store(av, 2, &PL_sv_undef);
17940         }
17941
17942         rv = newRV_noinc(MUTABLE_SV(av));
17943         n = add_data(pRExC_state, STR_WITH_LEN("s"));
17944         RExC_rxi->data->data[n] = (void*)rv;
17945         ARG_SET(node, n);
17946     }
17947 }
17948
17949 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
17950 SV *
17951 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
17952                                         const regnode* node,
17953                                         bool doinit,
17954                                         SV** listsvp,
17955                                         SV** only_utf8_locale_ptr,
17956                                         SV** output_invlist)
17957
17958 {
17959     /* For internal core use only.
17960      * Returns the swash for the input 'node' in the regex 'prog'.
17961      * If <doinit> is 'true', will attempt to create the swash if not already
17962      *    done.
17963      * If <listsvp> is non-null, will return the printable contents of the
17964      *    swash.  This can be used to get debugging information even before the
17965      *    swash exists, by calling this function with 'doinit' set to false, in
17966      *    which case the components that will be used to eventually create the
17967      *    swash are returned  (in a printable form).
17968      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
17969      *    store an inversion list of code points that should match only if the
17970      *    execution-time locale is a UTF-8 one.
17971      * If <output_invlist> is not NULL, it is where this routine is to store an
17972      *    inversion list of the code points that would be instead returned in
17973      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
17974      *    when this parameter is used, is just the non-code point data that
17975      *    will go into creating the swash.  This currently should be just
17976      *    user-defined properties whose definitions were not known at compile
17977      *    time.  Using this parameter allows for easier manipulation of the
17978      *    swash's data by the caller.  It is illegal to call this function with
17979      *    this parameter set, but not <listsvp>
17980      *
17981      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
17982      * that, in spite of this function's name, the swash it returns may include
17983      * the bitmap data as well */
17984
17985     SV *sw  = NULL;
17986     SV *si  = NULL;         /* Input swash initialization string */
17987     SV* invlist = NULL;
17988
17989     RXi_GET_DECL(prog,progi);
17990     const struct reg_data * const data = prog ? progi->data : NULL;
17991
17992     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
17993     assert(! output_invlist || listsvp);
17994
17995     if (data && data->count) {
17996         const U32 n = ARG(node);
17997
17998         if (data->what[n] == 's') {
17999             SV * const rv = MUTABLE_SV(data->data[n]);
18000             AV * const av = MUTABLE_AV(SvRV(rv));
18001             SV **const ary = AvARRAY(av);
18002             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
18003
18004             si = *ary;  /* ary[0] = the string to initialize the swash with */
18005
18006             if (av_tindex_nomg(av) >= 2) {
18007                 if (only_utf8_locale_ptr
18008                     && ary[2]
18009                     && ary[2] != &PL_sv_undef)
18010                 {
18011                     *only_utf8_locale_ptr = ary[2];
18012                 }
18013                 else {
18014                     assert(only_utf8_locale_ptr);
18015                     *only_utf8_locale_ptr = NULL;
18016                 }
18017
18018                 /* Elements 3 and 4 are either both present or both absent. [3]
18019                  * is any inversion list generated at compile time; [4]
18020                  * indicates if that inversion list has any user-defined
18021                  * properties in it. */
18022                 if (av_tindex_nomg(av) >= 3) {
18023                     invlist = ary[3];
18024                     if (SvUV(ary[4])) {
18025                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
18026                     }
18027                 }
18028                 else {
18029                     invlist = NULL;
18030                 }
18031             }
18032
18033             /* Element [1] is reserved for the set-up swash.  If already there,
18034              * return it; if not, create it and store it there */
18035             if (ary[1] && SvROK(ary[1])) {
18036                 sw = ary[1];
18037             }
18038             else if (doinit && ((si && si != &PL_sv_undef)
18039                                  || (invlist && invlist != &PL_sv_undef))) {
18040                 assert(si);
18041                 sw = _core_swash_init("utf8", /* the utf8 package */
18042                                       "", /* nameless */
18043                                       si,
18044                                       1, /* binary */
18045                                       0, /* not from tr/// */
18046                                       invlist,
18047                                       &swash_init_flags);
18048                 (void)av_store(av, 1, sw);
18049             }
18050         }
18051     }
18052
18053     /* If requested, return a printable version of what this swash matches */
18054     if (listsvp) {
18055         SV* matches_string = NULL;
18056
18057         /* The swash should be used, if possible, to get the data, as it
18058          * contains the resolved data.  But this function can be called at
18059          * compile-time, before everything gets resolved, in which case we
18060          * return the currently best available information, which is the string
18061          * that will eventually be used to do that resolving, 'si' */
18062         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
18063             && (si && si != &PL_sv_undef))
18064         {
18065             /* Here, we only have 'si' (and possibly some passed-in data in
18066              * 'invlist', which is handled below)  If the caller only wants
18067              * 'si', use that.  */
18068             if (! output_invlist) {
18069                 matches_string = newSVsv(si);
18070             }
18071             else {
18072                 /* But if the caller wants an inversion list of the node, we
18073                  * need to parse 'si' and place as much as possible in the
18074                  * desired output inversion list, making 'matches_string' only
18075                  * contain the currently unresolvable things */
18076                 const char *si_string = SvPVX(si);
18077                 STRLEN remaining = SvCUR(si);
18078                 UV prev_cp = 0;
18079                 U8 count = 0;
18080
18081                 /* Ignore everything before the first new-line */
18082                 while (*si_string != '\n' && remaining > 0) {
18083                     si_string++;
18084                     remaining--;
18085                 }
18086                 assert(remaining > 0);
18087
18088                 si_string++;
18089                 remaining--;
18090
18091                 while (remaining > 0) {
18092
18093                     /* The data consists of just strings defining user-defined
18094                      * property names, but in prior incarnations, and perhaps
18095                      * somehow from pluggable regex engines, it could still
18096                      * hold hex code point definitions.  Each component of a
18097                      * range would be separated by a tab, and each range by a
18098                      * new-line.  If these are found, instead add them to the
18099                      * inversion list */
18100                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
18101                                      |PERL_SCAN_SILENT_NON_PORTABLE;
18102                     STRLEN len = remaining;
18103                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
18104
18105                     /* If the hex decode routine found something, it should go
18106                      * up to the next \n */
18107                     if (   *(si_string + len) == '\n') {
18108                         if (count) {    /* 2nd code point on line */
18109                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
18110                         }
18111                         else {
18112                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
18113                         }
18114                         count = 0;
18115                         goto prepare_for_next_iteration;
18116                     }
18117
18118                     /* If the hex decode was instead for the lower range limit,
18119                      * save it, and go parse the upper range limit */
18120                     if (*(si_string + len) == '\t') {
18121                         assert(count == 0);
18122
18123                         prev_cp = cp;
18124                         count = 1;
18125                       prepare_for_next_iteration:
18126                         si_string += len + 1;
18127                         remaining -= len + 1;
18128                         continue;
18129                     }
18130
18131                     /* Here, didn't find a legal hex number.  Just add it from
18132                      * here to the next \n */
18133
18134                     remaining -= len;
18135                     while (*(si_string + len) != '\n' && remaining > 0) {
18136                         remaining--;
18137                         len++;
18138                     }
18139                     if (*(si_string + len) == '\n') {
18140                         len++;
18141                         remaining--;
18142                     }
18143                     if (matches_string) {
18144                         sv_catpvn(matches_string, si_string, len - 1);
18145                     }
18146                     else {
18147                         matches_string = newSVpvn(si_string, len - 1);
18148                     }
18149                     si_string += len;
18150                     sv_catpvs(matches_string, " ");
18151                 } /* end of loop through the text */
18152
18153                 assert(matches_string);
18154                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
18155                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
18156                 }
18157             } /* end of has an 'si' but no swash */
18158         }
18159
18160         /* If we have a swash in place, its equivalent inversion list was above
18161          * placed into 'invlist'.  If not, this variable may contain a stored
18162          * inversion list which is information beyond what is in 'si' */
18163         if (invlist) {
18164
18165             /* Again, if the caller doesn't want the output inversion list, put
18166              * everything in 'matches-string' */
18167             if (! output_invlist) {
18168                 if ( ! matches_string) {
18169                     matches_string = newSVpvs("\n");
18170                 }
18171                 sv_catsv(matches_string, invlist_contents(invlist,
18172                                                   TRUE /* traditional style */
18173                                                   ));
18174             }
18175             else if (! *output_invlist) {
18176                 *output_invlist = invlist_clone(invlist);
18177             }
18178             else {
18179                 _invlist_union(*output_invlist, invlist, output_invlist);
18180             }
18181         }
18182
18183         *listsvp = matches_string;
18184     }
18185
18186     return sw;
18187 }
18188 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
18189
18190 /* reg_skipcomment()
18191
18192    Absorbs an /x style # comment from the input stream,
18193    returning a pointer to the first character beyond the comment, or if the
18194    comment terminates the pattern without anything following it, this returns
18195    one past the final character of the pattern (in other words, RExC_end) and
18196    sets the REG_RUN_ON_COMMENT_SEEN flag.
18197
18198    Note it's the callers responsibility to ensure that we are
18199    actually in /x mode
18200
18201 */
18202
18203 PERL_STATIC_INLINE char*
18204 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
18205 {
18206     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
18207
18208     assert(*p == '#');
18209
18210     while (p < RExC_end) {
18211         if (*(++p) == '\n') {
18212             return p+1;
18213         }
18214     }
18215
18216     /* we ran off the end of the pattern without ending the comment, so we have
18217      * to add an \n when wrapping */
18218     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
18219     return p;
18220 }
18221
18222 STATIC void
18223 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
18224                                 char ** p,
18225                                 const bool force_to_xmod
18226                          )
18227 {
18228     /* If the text at the current parse position '*p' is a '(?#...)' comment,
18229      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
18230      * is /x whitespace, advance '*p' so that on exit it points to the first
18231      * byte past all such white space and comments */
18232
18233     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
18234
18235     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
18236
18237     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
18238
18239     for (;;) {
18240         if (RExC_end - (*p) >= 3
18241             && *(*p)     == '('
18242             && *(*p + 1) == '?'
18243             && *(*p + 2) == '#')
18244         {
18245             while (*(*p) != ')') {
18246                 if ((*p) == RExC_end)
18247                     FAIL("Sequence (?#... not terminated");
18248                 (*p)++;
18249             }
18250             (*p)++;
18251             continue;
18252         }
18253
18254         if (use_xmod) {
18255             const char * save_p = *p;
18256             while ((*p) < RExC_end) {
18257                 STRLEN len;
18258                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
18259                     (*p) += len;
18260                 }
18261                 else if (*(*p) == '#') {
18262                     (*p) = reg_skipcomment(pRExC_state, (*p));
18263                 }
18264                 else {
18265                     break;
18266                 }
18267             }
18268             if (*p != save_p) {
18269                 continue;
18270             }
18271         }
18272
18273         break;
18274     }
18275
18276     return;
18277 }
18278
18279 /* nextchar()
18280
18281    Advances the parse position by one byte, unless that byte is the beginning
18282    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
18283    those two cases, the parse position is advanced beyond all such comments and
18284    white space.
18285
18286    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
18287 */
18288
18289 STATIC void
18290 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
18291 {
18292     PERL_ARGS_ASSERT_NEXTCHAR;
18293
18294     if (RExC_parse < RExC_end) {
18295         assert(   ! UTF
18296                || UTF8_IS_INVARIANT(*RExC_parse)
18297                || UTF8_IS_START(*RExC_parse));
18298
18299         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
18300
18301         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
18302                                 FALSE /* Don't force /x */ );
18303     }
18304 }
18305
18306 STATIC regnode *
18307 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
18308 {
18309     /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
18310      * space.  In pass1, it aligns and increments RExC_size; in pass2,
18311      * RExC_emit */
18312
18313     regnode * const ret = RExC_emit;
18314     GET_RE_DEBUG_FLAGS_DECL;
18315
18316     PERL_ARGS_ASSERT_REGNODE_GUTS;
18317
18318     assert(extra_size >= regarglen[op]);
18319
18320     if (SIZE_ONLY) {
18321         SIZE_ALIGN(RExC_size);
18322         RExC_size += 1 + extra_size;
18323         return(ret);
18324     }
18325     if (RExC_emit >= RExC_emit_bound)
18326         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
18327                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
18328
18329     NODE_ALIGN_FILL(ret);
18330 #ifndef RE_TRACK_PATTERN_OFFSETS
18331     PERL_UNUSED_ARG(name);
18332 #else
18333     if (RExC_offsets) {         /* MJD */
18334         MJD_OFFSET_DEBUG(
18335               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
18336               name, __LINE__,
18337               PL_reg_name[op],
18338               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
18339                 ? "Overwriting end of array!\n" : "OK",
18340               (UV)(RExC_emit - RExC_emit_start),
18341               (UV)(RExC_parse - RExC_start),
18342               (UV)RExC_offsets[0]));
18343         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
18344     }
18345 #endif
18346     return(ret);
18347 }
18348
18349 /*
18350 - reg_node - emit a node
18351 */
18352 STATIC regnode *                        /* Location. */
18353 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
18354 {
18355     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
18356
18357     PERL_ARGS_ASSERT_REG_NODE;
18358
18359     assert(regarglen[op] == 0);
18360
18361     if (PASS2) {
18362         regnode *ptr = ret;
18363         FILL_ADVANCE_NODE(ptr, op);
18364         RExC_emit = ptr;
18365     }
18366     return(ret);
18367 }
18368
18369 /*
18370 - reganode - emit a node with an argument
18371 */
18372 STATIC regnode *                        /* Location. */
18373 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
18374 {
18375     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
18376
18377     PERL_ARGS_ASSERT_REGANODE;
18378
18379     assert(regarglen[op] == 1);
18380
18381     if (PASS2) {
18382         regnode *ptr = ret;
18383         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
18384         RExC_emit = ptr;
18385     }
18386     return(ret);
18387 }
18388
18389 STATIC regnode *
18390 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
18391 {
18392     /* emit a node with U32 and I32 arguments */
18393
18394     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
18395
18396     PERL_ARGS_ASSERT_REG2LANODE;
18397
18398     assert(regarglen[op] == 2);
18399
18400     if (PASS2) {
18401         regnode *ptr = ret;
18402         FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
18403         RExC_emit = ptr;
18404     }
18405     return(ret);
18406 }
18407
18408 /*
18409 - reginsert - insert an operator in front of already-emitted operand
18410 *
18411 * Means relocating the operand.
18412 */
18413 STATIC void
18414 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
18415 {
18416     regnode *src;
18417     regnode *dst;
18418     regnode *place;
18419     const int offset = regarglen[(U8)op];
18420     const int size = NODE_STEP_REGNODE + offset;
18421     GET_RE_DEBUG_FLAGS_DECL;
18422
18423     PERL_ARGS_ASSERT_REGINSERT;
18424     PERL_UNUSED_CONTEXT;
18425     PERL_UNUSED_ARG(depth);
18426 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
18427     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
18428     if (SIZE_ONLY) {
18429         RExC_size += size;
18430         return;
18431     }
18432     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
18433                                     studying. If this is wrong then we need to adjust RExC_recurse
18434                                     below like we do with RExC_open_parens/RExC_close_parens. */
18435     src = RExC_emit;
18436     RExC_emit += size;
18437     dst = RExC_emit;
18438     if (RExC_open_parens) {
18439         int paren;
18440         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
18441         /* remember that RExC_npar is rex->nparens + 1,
18442          * iow it is 1 more than the number of parens seen in
18443          * the pattern so far. */
18444         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
18445             /* note, RExC_open_parens[0] is the start of the
18446              * regex, it can't move. RExC_close_parens[0] is the end
18447              * of the regex, it *can* move. */
18448             if ( paren && RExC_open_parens[paren] >= opnd ) {
18449                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
18450                 RExC_open_parens[paren] += size;
18451             } else {
18452                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
18453             }
18454             if ( RExC_close_parens[paren] >= opnd ) {
18455                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
18456                 RExC_close_parens[paren] += size;
18457             } else {
18458                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
18459             }
18460         }
18461     }
18462     if (RExC_end_op)
18463         RExC_end_op += size;
18464
18465     while (src > opnd) {
18466         StructCopy(--src, --dst, regnode);
18467 #ifdef RE_TRACK_PATTERN_OFFSETS
18468         if (RExC_offsets) {     /* MJD 20010112 */
18469             MJD_OFFSET_DEBUG(
18470                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
18471                   "reg_insert",
18472                   __LINE__,
18473                   PL_reg_name[op],
18474                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
18475                     ? "Overwriting end of array!\n" : "OK",
18476                   (UV)(src - RExC_emit_start),
18477                   (UV)(dst - RExC_emit_start),
18478                   (UV)RExC_offsets[0]));
18479             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
18480             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
18481         }
18482 #endif
18483     }
18484
18485
18486     place = opnd;               /* Op node, where operand used to be. */
18487 #ifdef RE_TRACK_PATTERN_OFFSETS
18488     if (RExC_offsets) {         /* MJD */
18489         MJD_OFFSET_DEBUG(
18490               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
18491               "reginsert",
18492               __LINE__,
18493               PL_reg_name[op],
18494               (UV)(place - RExC_emit_start) > RExC_offsets[0]
18495               ? "Overwriting end of array!\n" : "OK",
18496               (UV)(place - RExC_emit_start),
18497               (UV)(RExC_parse - RExC_start),
18498               (UV)RExC_offsets[0]));
18499         Set_Node_Offset(place, RExC_parse);
18500         Set_Node_Length(place, 1);
18501     }
18502 #endif
18503     src = NEXTOPER(place);
18504     FILL_ADVANCE_NODE(place, op);
18505     Zero(src, offset, regnode);
18506 }
18507
18508 /*
18509 - regtail - set the next-pointer at the end of a node chain of p to val.
18510 - SEE ALSO: regtail_study
18511 */
18512 STATIC void
18513 S_regtail(pTHX_ RExC_state_t * pRExC_state,
18514                 const regnode * const p,
18515                 const regnode * const val,
18516                 const U32 depth)
18517 {
18518     regnode *scan;
18519     GET_RE_DEBUG_FLAGS_DECL;
18520
18521     PERL_ARGS_ASSERT_REGTAIL;
18522 #ifndef DEBUGGING
18523     PERL_UNUSED_ARG(depth);
18524 #endif
18525
18526     if (SIZE_ONLY)
18527         return;
18528
18529     /* Find last node. */
18530     scan = (regnode *) p;
18531     for (;;) {
18532         regnode * const temp = regnext(scan);
18533         DEBUG_PARSE_r({
18534             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
18535             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18536             Perl_re_printf( aTHX_  "~ %s (%d) %s %s\n",
18537                 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
18538                     (temp == NULL ? "->" : ""),
18539                     (temp == NULL ? PL_reg_name[OP(val)] : "")
18540             );
18541         });
18542         if (temp == NULL)
18543             break;
18544         scan = temp;
18545     }
18546
18547     if (reg_off_by_arg[OP(scan)]) {
18548         ARG_SET(scan, val - scan);
18549     }
18550     else {
18551         NEXT_OFF(scan) = val - scan;
18552     }
18553 }
18554
18555 #ifdef DEBUGGING
18556 /*
18557 - regtail_study - set the next-pointer at the end of a node chain of p to val.
18558 - Look for optimizable sequences at the same time.
18559 - currently only looks for EXACT chains.
18560
18561 This is experimental code. The idea is to use this routine to perform
18562 in place optimizations on branches and groups as they are constructed,
18563 with the long term intention of removing optimization from study_chunk so
18564 that it is purely analytical.
18565
18566 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
18567 to control which is which.
18568
18569 */
18570 /* TODO: All four parms should be const */
18571
18572 STATIC U8
18573 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
18574                       const regnode *val,U32 depth)
18575 {
18576     regnode *scan;
18577     U8 exact = PSEUDO;
18578 #ifdef EXPERIMENTAL_INPLACESCAN
18579     I32 min = 0;
18580 #endif
18581     GET_RE_DEBUG_FLAGS_DECL;
18582
18583     PERL_ARGS_ASSERT_REGTAIL_STUDY;
18584
18585
18586     if (SIZE_ONLY)
18587         return exact;
18588
18589     /* Find last node. */
18590
18591     scan = p;
18592     for (;;) {
18593         regnode * const temp = regnext(scan);
18594 #ifdef EXPERIMENTAL_INPLACESCAN
18595         if (PL_regkind[OP(scan)] == EXACT) {
18596             bool unfolded_multi_char;   /* Unexamined in this routine */
18597             if (join_exact(pRExC_state, scan, &min,
18598                            &unfolded_multi_char, 1, val, depth+1))
18599                 return EXACT;
18600         }
18601 #endif
18602         if ( exact ) {
18603             switch (OP(scan)) {
18604                 case EXACT:
18605                 case EXACTL:
18606                 case EXACTF:
18607                 case EXACTFA_NO_TRIE:
18608                 case EXACTFA:
18609                 case EXACTFU:
18610                 case EXACTFLU8:
18611                 case EXACTFU_SS:
18612                 case EXACTFL:
18613                         if( exact == PSEUDO )
18614                             exact= OP(scan);
18615                         else if ( exact != OP(scan) )
18616                             exact= 0;
18617                 case NOTHING:
18618                     break;
18619                 default:
18620                     exact= 0;
18621             }
18622         }
18623         DEBUG_PARSE_r({
18624             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
18625             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18626             Perl_re_printf( aTHX_  "~ %s (%d) -> %s\n",
18627                 SvPV_nolen_const(RExC_mysv),
18628                 REG_NODE_NUM(scan),
18629                 PL_reg_name[exact]);
18630         });
18631         if (temp == NULL)
18632             break;
18633         scan = temp;
18634     }
18635     DEBUG_PARSE_r({
18636         DEBUG_PARSE_MSG("");
18637         regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
18638         Perl_re_printf( aTHX_
18639                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
18640                       SvPV_nolen_const(RExC_mysv),
18641                       (IV)REG_NODE_NUM(val),
18642                       (IV)(val - scan)
18643         );
18644     });
18645     if (reg_off_by_arg[OP(scan)]) {
18646         ARG_SET(scan, val - scan);
18647     }
18648     else {
18649         NEXT_OFF(scan) = val - scan;
18650     }
18651
18652     return exact;
18653 }
18654 #endif
18655
18656 /*
18657  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
18658  */
18659 #ifdef DEBUGGING
18660
18661 static void
18662 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
18663 {
18664     int bit;
18665     int set=0;
18666
18667     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
18668
18669     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
18670         if (flags & (1<<bit)) {
18671             if (!set++ && lead)
18672                 Perl_re_printf( aTHX_  "%s",lead);
18673             Perl_re_printf( aTHX_  "%s ",PL_reg_intflags_name[bit]);
18674         }
18675     }
18676     if (lead)  {
18677         if (set)
18678             Perl_re_printf( aTHX_  "\n");
18679         else
18680             Perl_re_printf( aTHX_  "%s[none-set]\n",lead);
18681     }
18682 }
18683
18684 static void
18685 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
18686 {
18687     int bit;
18688     int set=0;
18689     regex_charset cs;
18690
18691     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
18692
18693     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
18694         if (flags & (1<<bit)) {
18695             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
18696                 continue;
18697             }
18698             if (!set++ && lead)
18699                 Perl_re_printf( aTHX_  "%s",lead);
18700             Perl_re_printf( aTHX_  "%s ",PL_reg_extflags_name[bit]);
18701         }
18702     }
18703     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
18704             if (!set++ && lead) {
18705                 Perl_re_printf( aTHX_  "%s",lead);
18706             }
18707             switch (cs) {
18708                 case REGEX_UNICODE_CHARSET:
18709                     Perl_re_printf( aTHX_  "UNICODE");
18710                     break;
18711                 case REGEX_LOCALE_CHARSET:
18712                     Perl_re_printf( aTHX_  "LOCALE");
18713                     break;
18714                 case REGEX_ASCII_RESTRICTED_CHARSET:
18715                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
18716                     break;
18717                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
18718                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
18719                     break;
18720                 default:
18721                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
18722                     break;
18723             }
18724     }
18725     if (lead)  {
18726         if (set)
18727             Perl_re_printf( aTHX_  "\n");
18728         else
18729             Perl_re_printf( aTHX_  "%s[none-set]\n",lead);
18730     }
18731 }
18732 #endif
18733
18734 void
18735 Perl_regdump(pTHX_ const regexp *r)
18736 {
18737 #ifdef DEBUGGING
18738     SV * const sv = sv_newmortal();
18739     SV *dsv= sv_newmortal();
18740     RXi_GET_DECL(r,ri);
18741     GET_RE_DEBUG_FLAGS_DECL;
18742
18743     PERL_ARGS_ASSERT_REGDUMP;
18744
18745     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
18746
18747     /* Header fields of interest. */
18748     if (r->anchored_substr) {
18749         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
18750             RE_SV_DUMPLEN(r->anchored_substr), 30);
18751         Perl_re_printf( aTHX_
18752                       "anchored %s%s at %"IVdf" ",
18753                       s, RE_SV_TAIL(r->anchored_substr),
18754                       (IV)r->anchored_offset);
18755     } else if (r->anchored_utf8) {
18756         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
18757             RE_SV_DUMPLEN(r->anchored_utf8), 30);
18758         Perl_re_printf( aTHX_
18759                       "anchored utf8 %s%s at %"IVdf" ",
18760                       s, RE_SV_TAIL(r->anchored_utf8),
18761                       (IV)r->anchored_offset);
18762     }
18763     if (r->float_substr) {
18764         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
18765             RE_SV_DUMPLEN(r->float_substr), 30);
18766         Perl_re_printf( aTHX_
18767                       "floating %s%s at %"IVdf"..%"UVuf" ",
18768                       s, RE_SV_TAIL(r->float_substr),
18769                       (IV)r->float_min_offset, (UV)r->float_max_offset);
18770     } else if (r->float_utf8) {
18771         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
18772             RE_SV_DUMPLEN(r->float_utf8), 30);
18773         Perl_re_printf( aTHX_
18774                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
18775                       s, RE_SV_TAIL(r->float_utf8),
18776                       (IV)r->float_min_offset, (UV)r->float_max_offset);
18777     }
18778     if (r->check_substr || r->check_utf8)
18779         Perl_re_printf( aTHX_
18780                       (const char *)
18781                       (r->check_substr == r->float_substr
18782                        && r->check_utf8 == r->float_utf8
18783                        ? "(checking floating" : "(checking anchored"));
18784     if (r->intflags & PREGf_NOSCAN)
18785         Perl_re_printf( aTHX_  " noscan");
18786     if (r->extflags & RXf_CHECK_ALL)
18787         Perl_re_printf( aTHX_  " isall");
18788     if (r->check_substr || r->check_utf8)
18789         Perl_re_printf( aTHX_  ") ");
18790
18791     if (ri->regstclass) {
18792         regprop(r, sv, ri->regstclass, NULL, NULL);
18793         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
18794     }
18795     if (r->intflags & PREGf_ANCH) {
18796         Perl_re_printf( aTHX_  "anchored");
18797         if (r->intflags & PREGf_ANCH_MBOL)
18798             Perl_re_printf( aTHX_  "(MBOL)");
18799         if (r->intflags & PREGf_ANCH_SBOL)
18800             Perl_re_printf( aTHX_  "(SBOL)");
18801         if (r->intflags & PREGf_ANCH_GPOS)
18802             Perl_re_printf( aTHX_  "(GPOS)");
18803         Perl_re_printf( aTHX_ " ");
18804     }
18805     if (r->intflags & PREGf_GPOS_SEEN)
18806         Perl_re_printf( aTHX_  "GPOS:%"UVuf" ", (UV)r->gofs);
18807     if (r->intflags & PREGf_SKIP)
18808         Perl_re_printf( aTHX_  "plus ");
18809     if (r->intflags & PREGf_IMPLICIT)
18810         Perl_re_printf( aTHX_  "implicit ");
18811     Perl_re_printf( aTHX_  "minlen %"IVdf" ", (IV)r->minlen);
18812     if (r->extflags & RXf_EVAL_SEEN)
18813         Perl_re_printf( aTHX_  "with eval ");
18814     Perl_re_printf( aTHX_  "\n");
18815     DEBUG_FLAGS_r({
18816         regdump_extflags("r->extflags: ",r->extflags);
18817         regdump_intflags("r->intflags: ",r->intflags);
18818     });
18819 #else
18820     PERL_ARGS_ASSERT_REGDUMP;
18821     PERL_UNUSED_CONTEXT;
18822     PERL_UNUSED_ARG(r);
18823 #endif  /* DEBUGGING */
18824 }
18825
18826 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
18827 #ifdef DEBUGGING
18828
18829 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
18830      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
18831      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
18832      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
18833      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
18834      || _CC_VERTSPACE != 15
18835 #   error Need to adjust order of anyofs[]
18836 #  endif
18837 static const char * const anyofs[] = {
18838     "\\w",
18839     "\\W",
18840     "\\d",
18841     "\\D",
18842     "[:alpha:]",
18843     "[:^alpha:]",
18844     "[:lower:]",
18845     "[:^lower:]",
18846     "[:upper:]",
18847     "[:^upper:]",
18848     "[:punct:]",
18849     "[:^punct:]",
18850     "[:print:]",
18851     "[:^print:]",
18852     "[:alnum:]",
18853     "[:^alnum:]",
18854     "[:graph:]",
18855     "[:^graph:]",
18856     "[:cased:]",
18857     "[:^cased:]",
18858     "\\s",
18859     "\\S",
18860     "[:blank:]",
18861     "[:^blank:]",
18862     "[:xdigit:]",
18863     "[:^xdigit:]",
18864     "[:cntrl:]",
18865     "[:^cntrl:]",
18866     "[:ascii:]",
18867     "[:^ascii:]",
18868     "\\v",
18869     "\\V"
18870 };
18871 #endif
18872
18873 /*
18874 - regprop - printable representation of opcode, with run time support
18875 */
18876
18877 void
18878 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
18879 {
18880 #ifdef DEBUGGING
18881     int k;
18882     RXi_GET_DECL(prog,progi);
18883     GET_RE_DEBUG_FLAGS_DECL;
18884
18885     PERL_ARGS_ASSERT_REGPROP;
18886
18887     sv_setpvn(sv, "", 0);
18888
18889     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
18890         /* It would be nice to FAIL() here, but this may be called from
18891            regexec.c, and it would be hard to supply pRExC_state. */
18892         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
18893                                               (int)OP(o), (int)REGNODE_MAX);
18894     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
18895
18896     k = PL_regkind[OP(o)];
18897
18898     if (k == EXACT) {
18899         sv_catpvs(sv, " ");
18900         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
18901          * is a crude hack but it may be the best for now since
18902          * we have no flag "this EXACTish node was UTF-8"
18903          * --jhi */
18904         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
18905                   PERL_PV_ESCAPE_UNI_DETECT |
18906                   PERL_PV_ESCAPE_NONASCII   |
18907                   PERL_PV_PRETTY_ELLIPSES   |
18908                   PERL_PV_PRETTY_LTGT       |
18909                   PERL_PV_PRETTY_NOCLEAR
18910                   );
18911     } else if (k == TRIE) {
18912         /* print the details of the trie in dumpuntil instead, as
18913          * progi->data isn't available here */
18914         const char op = OP(o);
18915         const U32 n = ARG(o);
18916         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
18917                (reg_ac_data *)progi->data->data[n] :
18918                NULL;
18919         const reg_trie_data * const trie
18920             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
18921
18922         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
18923         DEBUG_TRIE_COMPILE_r(
18924           Perl_sv_catpvf(aTHX_ sv,
18925             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
18926             (UV)trie->startstate,
18927             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
18928             (UV)trie->wordcount,
18929             (UV)trie->minlen,
18930             (UV)trie->maxlen,
18931             (UV)TRIE_CHARCOUNT(trie),
18932             (UV)trie->uniquecharcount
18933           );
18934         );
18935         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
18936             sv_catpvs(sv, "[");
18937             (void) put_charclass_bitmap_innards(sv,
18938                                                 ((IS_ANYOF_TRIE(op))
18939                                                  ? ANYOF_BITMAP(o)
18940                                                  : TRIE_BITMAP(trie)),
18941                                                 NULL,
18942                                                 NULL,
18943                                                 NULL,
18944                                                 FALSE
18945                                                );
18946             sv_catpvs(sv, "]");
18947         }
18948
18949     } else if (k == CURLY) {
18950         U32 lo = ARG1(o), hi = ARG2(o);
18951         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
18952             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
18953         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
18954         if (hi == REG_INFTY)
18955             sv_catpvs(sv, "INFTY");
18956         else
18957             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
18958         sv_catpvs(sv, "}");
18959     }
18960     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
18961         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
18962     else if (k == REF || k == OPEN || k == CLOSE
18963              || k == GROUPP || OP(o)==ACCEPT)
18964     {
18965         AV *name_list= NULL;
18966         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
18967         Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno);        /* Parenth number */
18968         if ( RXp_PAREN_NAMES(prog) ) {
18969             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
18970         } else if ( pRExC_state ) {
18971             name_list= RExC_paren_name_list;
18972         }
18973         if (name_list) {
18974             if ( k != REF || (OP(o) < NREF)) {
18975                 SV **name= av_fetch(name_list, parno, 0 );
18976                 if (name)
18977                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
18978             }
18979             else {
18980                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
18981                 I32 *nums=(I32*)SvPVX(sv_dat);
18982                 SV **name= av_fetch(name_list, nums[0], 0 );
18983                 I32 n;
18984                 if (name) {
18985                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
18986                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
18987                                     (n ? "," : ""), (IV)nums[n]);
18988                     }
18989                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
18990                 }
18991             }
18992         }
18993         if ( k == REF && reginfo) {
18994             U32 n = ARG(o);  /* which paren pair */
18995             I32 ln = prog->offs[n].start;
18996             if (prog->lastparen < n || ln == -1)
18997                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
18998             else if (ln == prog->offs[n].end)
18999                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
19000             else {
19001                 const char *s = reginfo->strbeg + ln;
19002                 Perl_sv_catpvf(aTHX_ sv, ": ");
19003                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
19004                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
19005             }
19006         }
19007     } else if (k == GOSUB) {
19008         AV *name_list= NULL;
19009         if ( RXp_PAREN_NAMES(prog) ) {
19010             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
19011         } else if ( pRExC_state ) {
19012             name_list= RExC_paren_name_list;
19013         }
19014
19015         /* Paren and offset */
19016         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
19017                 (int)((o + (int)ARG2L(o)) - progi->program) );
19018         if (name_list) {
19019             SV **name= av_fetch(name_list, ARG(o), 0 );
19020             if (name)
19021                 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
19022         }
19023     }
19024     else if (k == LOGICAL)
19025         /* 2: embedded, otherwise 1 */
19026         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
19027     else if (k == ANYOF) {
19028         const U8 flags = ANYOF_FLAGS(o);
19029         bool do_sep = FALSE;    /* Do we need to separate various components of
19030                                    the output? */
19031         /* Set if there is still an unresolved user-defined property */
19032         SV *unresolved                = NULL;
19033
19034         /* Things that are ignored except when the runtime locale is UTF-8 */
19035         SV *only_utf8_locale_invlist = NULL;
19036
19037         /* Code points that don't fit in the bitmap */
19038         SV *nonbitmap_invlist = NULL;
19039
19040         /* And things that aren't in the bitmap, but are small enough to be */
19041         SV* bitmap_range_not_in_bitmap = NULL;
19042
19043         const bool inverted = flags & ANYOF_INVERT;
19044
19045         if (OP(o) == ANYOFL) {
19046             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
19047                 sv_catpvs(sv, "{utf8-locale-reqd}");
19048             }
19049             if (flags & ANYOFL_FOLD) {
19050                 sv_catpvs(sv, "{i}");
19051             }
19052         }
19053
19054         /* If there is stuff outside the bitmap, get it */
19055         if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
19056             (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
19057                                                 &unresolved,
19058                                                 &only_utf8_locale_invlist,
19059                                                 &nonbitmap_invlist);
19060             /* The non-bitmap data may contain stuff that could fit in the
19061              * bitmap.  This could come from a user-defined property being
19062              * finally resolved when this call was done; or much more likely
19063              * because there are matches that require UTF-8 to be valid, and so
19064              * aren't in the bitmap.  This is teased apart later */
19065             _invlist_intersection(nonbitmap_invlist,
19066                                   PL_InBitmap,
19067                                   &bitmap_range_not_in_bitmap);
19068             /* Leave just the things that don't fit into the bitmap */
19069             _invlist_subtract(nonbitmap_invlist,
19070                               PL_InBitmap,
19071                               &nonbitmap_invlist);
19072         }
19073
19074         /* Obey this flag to add all above-the-bitmap code points */
19075         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
19076             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
19077                                                       NUM_ANYOF_CODE_POINTS,
19078                                                       UV_MAX);
19079         }
19080
19081         /* Ready to start outputting.  First, the initial left bracket */
19082         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19083
19084         /* Then all the things that could fit in the bitmap */
19085         do_sep = put_charclass_bitmap_innards(sv,
19086                                               ANYOF_BITMAP(o),
19087                                               bitmap_range_not_in_bitmap,
19088                                               only_utf8_locale_invlist,
19089                                               o,
19090
19091                                               /* Can't try inverting for a
19092                                                * better display if there are
19093                                                * things that haven't been
19094                                                * resolved */
19095                                               unresolved != NULL);
19096         SvREFCNT_dec(bitmap_range_not_in_bitmap);
19097
19098         /* If there are user-defined properties which haven't been defined yet,
19099          * output them.  If the result is not to be inverted, it is clearest to
19100          * output them in a separate [] from the bitmap range stuff.  If the
19101          * result is to be complemented, we have to show everything in one [],
19102          * as the inversion applies to the whole thing.  Use {braces} to
19103          * separate them from anything in the bitmap and anything above the
19104          * bitmap. */
19105         if (unresolved) {
19106             if (inverted) {
19107                 if (! do_sep) { /* If didn't output anything in the bitmap */
19108                     sv_catpvs(sv, "^");
19109                 }
19110                 sv_catpvs(sv, "{");
19111             }
19112             else if (do_sep) {
19113                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
19114             }
19115             sv_catsv(sv, unresolved);
19116             if (inverted) {
19117                 sv_catpvs(sv, "}");
19118             }
19119             do_sep = ! inverted;
19120         }
19121
19122         /* And, finally, add the above-the-bitmap stuff */
19123         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
19124             SV* contents;
19125
19126             /* See if truncation size is overridden */
19127             const STRLEN dump_len = (PL_dump_re_max_len)
19128                                     ? PL_dump_re_max_len
19129                                     : 256;
19130
19131             /* This is output in a separate [] */
19132             if (do_sep) {
19133                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
19134             }
19135
19136             /* And, for easy of understanding, it is shown in the
19137              * uncomplemented form if possible.  The one exception being if
19138              * there are unresolved items, where the inversion has to be
19139              * delayed until runtime */
19140             if (inverted && ! unresolved) {
19141                 _invlist_invert(nonbitmap_invlist);
19142                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
19143             }
19144
19145             contents = invlist_contents(nonbitmap_invlist,
19146                                         FALSE /* output suitable for catsv */
19147                                        );
19148
19149             /* If the output is shorter than the permissible maximum, just do it. */
19150             if (SvCUR(contents) <= dump_len) {
19151                 sv_catsv(sv, contents);
19152             }
19153             else {
19154                 const char * contents_string = SvPVX(contents);
19155                 STRLEN i = dump_len;
19156
19157                 /* Otherwise, start at the permissible max and work back to the
19158                  * first break possibility */
19159                 while (i > 0 && contents_string[i] != ' ') {
19160                     i--;
19161                 }
19162                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
19163                                        find a legal break */
19164                     i = dump_len;
19165                 }
19166
19167                 sv_catpvn(sv, contents_string, i);
19168                 sv_catpvs(sv, "...");
19169             }
19170
19171             SvREFCNT_dec_NN(contents);
19172             SvREFCNT_dec_NN(nonbitmap_invlist);
19173         }
19174
19175         /* And finally the matching, closing ']' */
19176         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
19177
19178         SvREFCNT_dec(unresolved);
19179     }
19180     else if (k == POSIXD || k == NPOSIXD) {
19181         U8 index = FLAGS(o) * 2;
19182         if (index < C_ARRAY_LENGTH(anyofs)) {
19183             if (*anyofs[index] != '[')  {
19184                 sv_catpv(sv, "[");
19185             }
19186             sv_catpv(sv, anyofs[index]);
19187             if (*anyofs[index] != '[')  {
19188                 sv_catpv(sv, "]");
19189             }
19190         }
19191         else {
19192             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
19193         }
19194     }
19195     else if (k == BOUND || k == NBOUND) {
19196         /* Must be synced with order of 'bound_type' in regcomp.h */
19197         const char * const bounds[] = {
19198             "",      /* Traditional */
19199             "{gcb}",
19200             "{lb}",
19201             "{sb}",
19202             "{wb}"
19203         };
19204         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
19205         sv_catpv(sv, bounds[FLAGS(o)]);
19206     }
19207     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
19208         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
19209     else if (OP(o) == SBOL)
19210         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
19211
19212     /* add on the verb argument if there is one */
19213     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
19214         Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
19215                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
19216     }
19217 #else
19218     PERL_UNUSED_CONTEXT;
19219     PERL_UNUSED_ARG(sv);
19220     PERL_UNUSED_ARG(o);
19221     PERL_UNUSED_ARG(prog);
19222     PERL_UNUSED_ARG(reginfo);
19223     PERL_UNUSED_ARG(pRExC_state);
19224 #endif  /* DEBUGGING */
19225 }
19226
19227
19228
19229 SV *
19230 Perl_re_intuit_string(pTHX_ REGEXP * const r)
19231 {                               /* Assume that RE_INTUIT is set */
19232     struct regexp *const prog = ReANY(r);
19233     GET_RE_DEBUG_FLAGS_DECL;
19234
19235     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
19236     PERL_UNUSED_CONTEXT;
19237
19238     DEBUG_COMPILE_r(
19239         {
19240             const char * const s = SvPV_nolen_const(RX_UTF8(r)
19241                       ? prog->check_utf8 : prog->check_substr);
19242
19243             if (!PL_colorset) reginitcolors();
19244             Perl_re_printf( aTHX_
19245                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
19246                       PL_colors[4],
19247                       RX_UTF8(r) ? "utf8 " : "",
19248                       PL_colors[5],PL_colors[0],
19249                       s,
19250                       PL_colors[1],
19251                       (strlen(s) > 60 ? "..." : ""));
19252         } );
19253
19254     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
19255     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
19256 }
19257
19258 /*
19259    pregfree()
19260
19261    handles refcounting and freeing the perl core regexp structure. When
19262    it is necessary to actually free the structure the first thing it
19263    does is call the 'free' method of the regexp_engine associated to
19264    the regexp, allowing the handling of the void *pprivate; member
19265    first. (This routine is not overridable by extensions, which is why
19266    the extensions free is called first.)
19267
19268    See regdupe and regdupe_internal if you change anything here.
19269 */
19270 #ifndef PERL_IN_XSUB_RE
19271 void
19272 Perl_pregfree(pTHX_ REGEXP *r)
19273 {
19274     SvREFCNT_dec(r);
19275 }
19276
19277 void
19278 Perl_pregfree2(pTHX_ REGEXP *rx)
19279 {
19280     struct regexp *const r = ReANY(rx);
19281     GET_RE_DEBUG_FLAGS_DECL;
19282
19283     PERL_ARGS_ASSERT_PREGFREE2;
19284
19285     if (r->mother_re) {
19286         ReREFCNT_dec(r->mother_re);
19287     } else {
19288         CALLREGFREE_PVT(rx); /* free the private data */
19289         SvREFCNT_dec(RXp_PAREN_NAMES(r));
19290         Safefree(r->xpv_len_u.xpvlenu_pv);
19291     }
19292     if (r->substrs) {
19293         SvREFCNT_dec(r->anchored_substr);
19294         SvREFCNT_dec(r->anchored_utf8);
19295         SvREFCNT_dec(r->float_substr);
19296         SvREFCNT_dec(r->float_utf8);
19297         Safefree(r->substrs);
19298     }
19299     RX_MATCH_COPY_FREE(rx);
19300 #ifdef PERL_ANY_COW
19301     SvREFCNT_dec(r->saved_copy);
19302 #endif
19303     Safefree(r->offs);
19304     SvREFCNT_dec(r->qr_anoncv);
19305     if (r->recurse_locinput)
19306         Safefree(r->recurse_locinput);
19307     rx->sv_u.svu_rx = 0;
19308 }
19309
19310 /*  reg_temp_copy()
19311
19312     This is a hacky workaround to the structural issue of match results
19313     being stored in the regexp structure which is in turn stored in
19314     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
19315     could be PL_curpm in multiple contexts, and could require multiple
19316     result sets being associated with the pattern simultaneously, such
19317     as when doing a recursive match with (??{$qr})
19318
19319     The solution is to make a lightweight copy of the regexp structure
19320     when a qr// is returned from the code executed by (??{$qr}) this
19321     lightweight copy doesn't actually own any of its data except for
19322     the starp/end and the actual regexp structure itself.
19323
19324 */
19325
19326
19327 REGEXP *
19328 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
19329 {
19330     struct regexp *ret;
19331     struct regexp *const r = ReANY(rx);
19332     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
19333
19334     PERL_ARGS_ASSERT_REG_TEMP_COPY;
19335
19336     if (!ret_x)
19337         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
19338     else {
19339         SvOK_off((SV *)ret_x);
19340         if (islv) {
19341             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
19342                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
19343                made both spots point to the same regexp body.) */
19344             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
19345             assert(!SvPVX(ret_x));
19346             ret_x->sv_u.svu_rx = temp->sv_any;
19347             temp->sv_any = NULL;
19348             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
19349             SvREFCNT_dec_NN(temp);
19350             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
19351                ing below will not set it. */
19352             SvCUR_set(ret_x, SvCUR(rx));
19353         }
19354     }
19355     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
19356        sv_force_normal(sv) is called.  */
19357     SvFAKE_on(ret_x);
19358     ret = ReANY(ret_x);
19359
19360     SvFLAGS(ret_x) |= SvUTF8(rx);
19361     /* We share the same string buffer as the original regexp, on which we
19362        hold a reference count, incremented when mother_re is set below.
19363        The string pointer is copied here, being part of the regexp struct.
19364      */
19365     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
19366            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
19367     if (r->offs) {
19368         const I32 npar = r->nparens+1;
19369         Newx(ret->offs, npar, regexp_paren_pair);
19370         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
19371     }
19372     if (r->substrs) {
19373         Newx(ret->substrs, 1, struct reg_substr_data);
19374         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
19375
19376         SvREFCNT_inc_void(ret->anchored_substr);
19377         SvREFCNT_inc_void(ret->anchored_utf8);
19378         SvREFCNT_inc_void(ret->float_substr);
19379         SvREFCNT_inc_void(ret->float_utf8);
19380
19381         /* check_substr and check_utf8, if non-NULL, point to either their
19382            anchored or float namesakes, and don't hold a second reference.  */
19383     }
19384     RX_MATCH_COPIED_off(ret_x);
19385 #ifdef PERL_ANY_COW
19386     ret->saved_copy = NULL;
19387 #endif
19388     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
19389     SvREFCNT_inc_void(ret->qr_anoncv);
19390     if (r->recurse_locinput)
19391         Newxz(ret->recurse_locinput,r->nparens + 1,char *);
19392
19393     return ret_x;
19394 }
19395 #endif
19396
19397 /* regfree_internal()
19398
19399    Free the private data in a regexp. This is overloadable by
19400    extensions. Perl takes care of the regexp structure in pregfree(),
19401    this covers the *pprivate pointer which technically perl doesn't
19402    know about, however of course we have to handle the
19403    regexp_internal structure when no extension is in use.
19404
19405    Note this is called before freeing anything in the regexp
19406    structure.
19407  */
19408
19409 void
19410 Perl_regfree_internal(pTHX_ REGEXP * const rx)
19411 {
19412     struct regexp *const r = ReANY(rx);
19413     RXi_GET_DECL(r,ri);
19414     GET_RE_DEBUG_FLAGS_DECL;
19415
19416     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
19417
19418     DEBUG_COMPILE_r({
19419         if (!PL_colorset)
19420             reginitcolors();
19421         {
19422             SV *dsv= sv_newmortal();
19423             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
19424                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
19425             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
19426                 PL_colors[4],PL_colors[5],s);
19427         }
19428     });
19429 #ifdef RE_TRACK_PATTERN_OFFSETS
19430     if (ri->u.offsets)
19431         Safefree(ri->u.offsets);             /* 20010421 MJD */
19432 #endif
19433     if (ri->code_blocks) {
19434         int n;
19435         for (n = 0; n < ri->num_code_blocks; n++)
19436             SvREFCNT_dec(ri->code_blocks[n].src_regex);
19437         Safefree(ri->code_blocks);
19438     }
19439
19440     if (ri->data) {
19441         int n = ri->data->count;
19442
19443         while (--n >= 0) {
19444           /* If you add a ->what type here, update the comment in regcomp.h */
19445             switch (ri->data->what[n]) {
19446             case 'a':
19447             case 'r':
19448             case 's':
19449             case 'S':
19450             case 'u':
19451                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
19452                 break;
19453             case 'f':
19454                 Safefree(ri->data->data[n]);
19455                 break;
19456             case 'l':
19457             case 'L':
19458                 break;
19459             case 'T':
19460                 { /* Aho Corasick add-on structure for a trie node.
19461                      Used in stclass optimization only */
19462                     U32 refcount;
19463                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
19464 #ifdef USE_ITHREADS
19465                     dVAR;
19466 #endif
19467                     OP_REFCNT_LOCK;
19468                     refcount = --aho->refcount;
19469                     OP_REFCNT_UNLOCK;
19470                     if ( !refcount ) {
19471                         PerlMemShared_free(aho->states);
19472                         PerlMemShared_free(aho->fail);
19473                          /* do this last!!!! */
19474                         PerlMemShared_free(ri->data->data[n]);
19475                         /* we should only ever get called once, so
19476                          * assert as much, and also guard the free
19477                          * which /might/ happen twice. At the least
19478                          * it will make code anlyzers happy and it
19479                          * doesn't cost much. - Yves */
19480                         assert(ri->regstclass);
19481                         if (ri->regstclass) {
19482                             PerlMemShared_free(ri->regstclass);
19483                             ri->regstclass = 0;
19484                         }
19485                     }
19486                 }
19487                 break;
19488             case 't':
19489                 {
19490                     /* trie structure. */
19491                     U32 refcount;
19492                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
19493 #ifdef USE_ITHREADS
19494                     dVAR;
19495 #endif
19496                     OP_REFCNT_LOCK;
19497                     refcount = --trie->refcount;
19498                     OP_REFCNT_UNLOCK;
19499                     if ( !refcount ) {
19500                         PerlMemShared_free(trie->charmap);
19501                         PerlMemShared_free(trie->states);
19502                         PerlMemShared_free(trie->trans);
19503                         if (trie->bitmap)
19504                             PerlMemShared_free(trie->bitmap);
19505                         if (trie->jump)
19506                             PerlMemShared_free(trie->jump);
19507                         PerlMemShared_free(trie->wordinfo);
19508                         /* do this last!!!! */
19509                         PerlMemShared_free(ri->data->data[n]);
19510                     }
19511                 }
19512                 break;
19513             default:
19514                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
19515                                                     ri->data->what[n]);
19516             }
19517         }
19518         Safefree(ri->data->what);
19519         Safefree(ri->data);
19520     }
19521
19522     Safefree(ri);
19523 }
19524
19525 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
19526 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
19527 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
19528
19529 /*
19530    re_dup_guts - duplicate a regexp.
19531
19532    This routine is expected to clone a given regexp structure. It is only
19533    compiled under USE_ITHREADS.
19534
19535    After all of the core data stored in struct regexp is duplicated
19536    the regexp_engine.dupe method is used to copy any private data
19537    stored in the *pprivate pointer. This allows extensions to handle
19538    any duplication it needs to do.
19539
19540    See pregfree() and regfree_internal() if you change anything here.
19541 */
19542 #if defined(USE_ITHREADS)
19543 #ifndef PERL_IN_XSUB_RE
19544 void
19545 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
19546 {
19547     dVAR;
19548     I32 npar;
19549     const struct regexp *r = ReANY(sstr);
19550     struct regexp *ret = ReANY(dstr);
19551
19552     PERL_ARGS_ASSERT_RE_DUP_GUTS;
19553
19554     npar = r->nparens+1;
19555     Newx(ret->offs, npar, regexp_paren_pair);
19556     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
19557
19558     if (ret->substrs) {
19559         /* Do it this way to avoid reading from *r after the StructCopy().
19560            That way, if any of the sv_dup_inc()s dislodge *r from the L1
19561            cache, it doesn't matter.  */
19562         const bool anchored = r->check_substr
19563             ? r->check_substr == r->anchored_substr
19564             : r->check_utf8 == r->anchored_utf8;
19565         Newx(ret->substrs, 1, struct reg_substr_data);
19566         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
19567
19568         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
19569         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
19570         ret->float_substr = sv_dup_inc(ret->float_substr, param);
19571         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
19572
19573         /* check_substr and check_utf8, if non-NULL, point to either their
19574            anchored or float namesakes, and don't hold a second reference.  */
19575
19576         if (ret->check_substr) {
19577             if (anchored) {
19578                 assert(r->check_utf8 == r->anchored_utf8);
19579                 ret->check_substr = ret->anchored_substr;
19580                 ret->check_utf8 = ret->anchored_utf8;
19581             } else {
19582                 assert(r->check_substr == r->float_substr);
19583                 assert(r->check_utf8 == r->float_utf8);
19584                 ret->check_substr = ret->float_substr;
19585                 ret->check_utf8 = ret->float_utf8;
19586             }
19587         } else if (ret->check_utf8) {
19588             if (anchored) {
19589                 ret->check_utf8 = ret->anchored_utf8;
19590             } else {
19591                 ret->check_utf8 = ret->float_utf8;
19592             }
19593         }
19594     }
19595
19596     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
19597     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
19598     if (r->recurse_locinput)
19599         Newxz(ret->recurse_locinput,r->nparens + 1,char *);
19600
19601     if (ret->pprivate)
19602         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
19603
19604     if (RX_MATCH_COPIED(dstr))
19605         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
19606     else
19607         ret->subbeg = NULL;
19608 #ifdef PERL_ANY_COW
19609     ret->saved_copy = NULL;
19610 #endif
19611
19612     /* Whether mother_re be set or no, we need to copy the string.  We
19613        cannot refrain from copying it when the storage points directly to
19614        our mother regexp, because that's
19615                1: a buffer in a different thread
19616                2: something we no longer hold a reference on
19617                so we need to copy it locally.  */
19618     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
19619     ret->mother_re   = NULL;
19620 }
19621 #endif /* PERL_IN_XSUB_RE */
19622
19623 /*
19624    regdupe_internal()
19625
19626    This is the internal complement to regdupe() which is used to copy
19627    the structure pointed to by the *pprivate pointer in the regexp.
19628    This is the core version of the extension overridable cloning hook.
19629    The regexp structure being duplicated will be copied by perl prior
19630    to this and will be provided as the regexp *r argument, however
19631    with the /old/ structures pprivate pointer value. Thus this routine
19632    may override any copying normally done by perl.
19633
19634    It returns a pointer to the new regexp_internal structure.
19635 */
19636
19637 void *
19638 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
19639 {
19640     dVAR;
19641     struct regexp *const r = ReANY(rx);
19642     regexp_internal *reti;
19643     int len;
19644     RXi_GET_DECL(r,ri);
19645
19646     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
19647
19648     len = ProgLen(ri);
19649
19650     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
19651           char, regexp_internal);
19652     Copy(ri->program, reti->program, len+1, regnode);
19653
19654
19655     reti->num_code_blocks = ri->num_code_blocks;
19656     if (ri->code_blocks) {
19657         int n;
19658         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
19659                 struct reg_code_block);
19660         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
19661                 struct reg_code_block);
19662         for (n = 0; n < ri->num_code_blocks; n++)
19663              reti->code_blocks[n].src_regex = (REGEXP*)
19664                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
19665     }
19666     else
19667         reti->code_blocks = NULL;
19668
19669     reti->regstclass = NULL;
19670
19671     if (ri->data) {
19672         struct reg_data *d;
19673         const int count = ri->data->count;
19674         int i;
19675
19676         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
19677                 char, struct reg_data);
19678         Newx(d->what, count, U8);
19679
19680         d->count = count;
19681         for (i = 0; i < count; i++) {
19682             d->what[i] = ri->data->what[i];
19683             switch (d->what[i]) {
19684                 /* see also regcomp.h and regfree_internal() */
19685             case 'a': /* actually an AV, but the dup function is identical.  */
19686             case 'r':
19687             case 's':
19688             case 'S':
19689             case 'u': /* actually an HV, but the dup function is identical.  */
19690                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
19691                 break;
19692             case 'f':
19693                 /* This is cheating. */
19694                 Newx(d->data[i], 1, regnode_ssc);
19695                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
19696                 reti->regstclass = (regnode*)d->data[i];
19697                 break;
19698             case 'T':
19699                 /* Trie stclasses are readonly and can thus be shared
19700                  * without duplication. We free the stclass in pregfree
19701                  * when the corresponding reg_ac_data struct is freed.
19702                  */
19703                 reti->regstclass= ri->regstclass;
19704                 /* FALLTHROUGH */
19705             case 't':
19706                 OP_REFCNT_LOCK;
19707                 ((reg_trie_data*)ri->data->data[i])->refcount++;
19708                 OP_REFCNT_UNLOCK;
19709                 /* FALLTHROUGH */
19710             case 'l':
19711             case 'L':
19712                 d->data[i] = ri->data->data[i];
19713                 break;
19714             default:
19715                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
19716                                                            ri->data->what[i]);
19717             }
19718         }
19719
19720         reti->data = d;
19721     }
19722     else
19723         reti->data = NULL;
19724
19725     reti->name_list_idx = ri->name_list_idx;
19726
19727 #ifdef RE_TRACK_PATTERN_OFFSETS
19728     if (ri->u.offsets) {
19729         Newx(reti->u.offsets, 2*len+1, U32);
19730         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
19731     }
19732 #else
19733     SetProgLen(reti,len);
19734 #endif
19735
19736     return (void*)reti;
19737 }
19738
19739 #endif    /* USE_ITHREADS */
19740
19741 #ifndef PERL_IN_XSUB_RE
19742
19743 /*
19744  - regnext - dig the "next" pointer out of a node
19745  */
19746 regnode *
19747 Perl_regnext(pTHX_ regnode *p)
19748 {
19749     I32 offset;
19750
19751     if (!p)
19752         return(NULL);
19753
19754     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
19755         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
19756                                                 (int)OP(p), (int)REGNODE_MAX);
19757     }
19758
19759     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
19760     if (offset == 0)
19761         return(NULL);
19762
19763     return(p+offset);
19764 }
19765 #endif
19766
19767 STATIC void
19768 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
19769 {
19770     va_list args;
19771     STRLEN l1 = strlen(pat1);
19772     STRLEN l2 = strlen(pat2);
19773     char buf[512];
19774     SV *msv;
19775     const char *message;
19776
19777     PERL_ARGS_ASSERT_RE_CROAK2;
19778
19779     if (l1 > 510)
19780         l1 = 510;
19781     if (l1 + l2 > 510)
19782         l2 = 510 - l1;
19783     Copy(pat1, buf, l1 , char);
19784     Copy(pat2, buf + l1, l2 , char);
19785     buf[l1 + l2] = '\n';
19786     buf[l1 + l2 + 1] = '\0';
19787     va_start(args, pat2);
19788     msv = vmess(buf, &args);
19789     va_end(args);
19790     message = SvPV_const(msv,l1);
19791     if (l1 > 512)
19792         l1 = 512;
19793     Copy(message, buf, l1 , char);
19794     /* l1-1 to avoid \n */
19795     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
19796 }
19797
19798 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
19799
19800 #ifndef PERL_IN_XSUB_RE
19801 void
19802 Perl_save_re_context(pTHX)
19803 {
19804     I32 nparens = -1;
19805     I32 i;
19806
19807     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
19808
19809     if (PL_curpm) {
19810         const REGEXP * const rx = PM_GETRE(PL_curpm);
19811         if (rx)
19812             nparens = RX_NPARENS(rx);
19813     }
19814
19815     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
19816      * that PL_curpm will be null, but that utf8.pm and the modules it
19817      * loads will only use $1..$3.
19818      * The t/porting/re_context.t test file checks this assumption.
19819      */
19820     if (nparens == -1)
19821         nparens = 3;
19822
19823     for (i = 1; i <= nparens; i++) {
19824         char digits[TYPE_CHARS(long)];
19825         const STRLEN len = my_snprintf(digits, sizeof(digits),
19826                                        "%lu", (long)i);
19827         GV *const *const gvp
19828             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
19829
19830         if (gvp) {
19831             GV * const gv = *gvp;
19832             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
19833                 save_scalar(gv);
19834         }
19835     }
19836 }
19837 #endif
19838
19839 #ifdef DEBUGGING
19840
19841 STATIC void
19842 S_put_code_point(pTHX_ SV *sv, UV c)
19843 {
19844     PERL_ARGS_ASSERT_PUT_CODE_POINT;
19845
19846     if (c > 255) {
19847         Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
19848     }
19849     else if (isPRINT(c)) {
19850         const char string = (char) c;
19851
19852         /* We use {phrase} as metanotation in the class, so also escape literal
19853          * braces */
19854         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
19855             sv_catpvs(sv, "\\");
19856         sv_catpvn(sv, &string, 1);
19857     }
19858     else if (isMNEMONIC_CNTRL(c)) {
19859         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
19860     }
19861     else {
19862         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
19863     }
19864 }
19865
19866 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
19867
19868 STATIC void
19869 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
19870 {
19871     /* Appends to 'sv' a displayable version of the range of code points from
19872      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
19873      * that have them, when they occur at the beginning or end of the range.
19874      * It uses hex to output the remaining code points, unless 'allow_literals'
19875      * is true, in which case the printable ASCII ones are output as-is (though
19876      * some of these will be escaped by put_code_point()).
19877      *
19878      * NOTE:  This is designed only for printing ranges of code points that fit
19879      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
19880      */
19881
19882     const unsigned int min_range_count = 3;
19883
19884     assert(start <= end);
19885
19886     PERL_ARGS_ASSERT_PUT_RANGE;
19887
19888     while (start <= end) {
19889         UV this_end;
19890         const char * format;
19891
19892         if (end - start < min_range_count) {
19893
19894             /* Output chars individually when they occur in short ranges */
19895             for (; start <= end; start++) {
19896                 put_code_point(sv, start);
19897             }
19898             break;
19899         }
19900
19901         /* If permitted by the input options, and there is a possibility that
19902          * this range contains a printable literal, look to see if there is
19903          * one. */
19904         if (allow_literals && start <= MAX_PRINT_A) {
19905
19906             /* If the character at the beginning of the range isn't an ASCII
19907              * printable, effectively split the range into two parts:
19908              *  1) the portion before the first such printable,
19909              *  2) the rest
19910              * and output them separately. */
19911             if (! isPRINT_A(start)) {
19912                 UV temp_end = start + 1;
19913
19914                 /* There is no point looking beyond the final possible
19915                  * printable, in MAX_PRINT_A */
19916                 UV max = MIN(end, MAX_PRINT_A);
19917
19918                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
19919                     temp_end++;
19920                 }
19921
19922                 /* Here, temp_end points to one beyond the first printable if
19923                  * found, or to one beyond 'max' if not.  If none found, make
19924                  * sure that we use the entire range */
19925                 if (temp_end > MAX_PRINT_A) {
19926                     temp_end = end + 1;
19927                 }
19928
19929                 /* Output the first part of the split range: the part that
19930                  * doesn't have printables, with the parameter set to not look
19931                  * for literals (otherwise we would infinitely recurse) */
19932                 put_range(sv, start, temp_end - 1, FALSE);
19933
19934                 /* The 2nd part of the range (if any) starts here. */
19935                 start = temp_end;
19936
19937                 /* We do a continue, instead of dropping down, because even if
19938                  * the 2nd part is non-empty, it could be so short that we want
19939                  * to output it as individual characters, as tested for at the
19940                  * top of this loop.  */
19941                 continue;
19942             }
19943
19944             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
19945              * output a sub-range of just the digits or letters, then process
19946              * the remaining portion as usual. */
19947             if (isALPHANUMERIC_A(start)) {
19948                 UV mask = (isDIGIT_A(start))
19949                            ? _CC_DIGIT
19950                              : isUPPER_A(start)
19951                                ? _CC_UPPER
19952                                : _CC_LOWER;
19953                 UV temp_end = start + 1;
19954
19955                 /* Find the end of the sub-range that includes just the
19956                  * characters in the same class as the first character in it */
19957                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
19958                     temp_end++;
19959                 }
19960                 temp_end--;
19961
19962                 /* For short ranges, don't duplicate the code above to output
19963                  * them; just call recursively */
19964                 if (temp_end - start < min_range_count) {
19965                     put_range(sv, start, temp_end, FALSE);
19966                 }
19967                 else {  /* Output as a range */
19968                     put_code_point(sv, start);
19969                     sv_catpvs(sv, "-");
19970                     put_code_point(sv, temp_end);
19971                 }
19972                 start = temp_end + 1;
19973                 continue;
19974             }
19975
19976             /* We output any other printables as individual characters */
19977             if (isPUNCT_A(start) || isSPACE_A(start)) {
19978                 while (start <= end && (isPUNCT_A(start)
19979                                         || isSPACE_A(start)))
19980                 {
19981                     put_code_point(sv, start);
19982                     start++;
19983                 }
19984                 continue;
19985             }
19986         } /* End of looking for literals */
19987
19988         /* Here is not to output as a literal.  Some control characters have
19989          * mnemonic names.  Split off any of those at the beginning and end of
19990          * the range to print mnemonically.  It isn't possible for many of
19991          * these to be in a row, so this won't overwhelm with output */
19992         if (   start <= end
19993             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
19994         {
19995             while (isMNEMONIC_CNTRL(start) && start <= end) {
19996                 put_code_point(sv, start);
19997                 start++;
19998             }
19999
20000             /* If this didn't take care of the whole range ... */
20001             if (start <= end) {
20002
20003                 /* Look backwards from the end to find the final non-mnemonic
20004                  * */
20005                 UV temp_end = end;
20006                 while (isMNEMONIC_CNTRL(temp_end)) {
20007                     temp_end--;
20008                 }
20009
20010                 /* And separately output the interior range that doesn't start
20011                  * or end with mnemonics */
20012                 put_range(sv, start, temp_end, FALSE);
20013
20014                 /* Then output the mnemonic trailing controls */
20015                 start = temp_end + 1;
20016                 while (start <= end) {
20017                     put_code_point(sv, start);
20018                     start++;
20019                 }
20020                 break;
20021             }
20022         }
20023
20024         /* As a final resort, output the range or subrange as hex. */
20025
20026         this_end = (end < NUM_ANYOF_CODE_POINTS)
20027                     ? end
20028                     : NUM_ANYOF_CODE_POINTS - 1;
20029 #if NUM_ANYOF_CODE_POINTS > 256
20030         format = (this_end < 256)
20031                  ? "\\x%02"UVXf"-\\x%02"UVXf""
20032                  : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
20033 #else
20034         format = "\\x%02"UVXf"-\\x%02"UVXf"";
20035 #endif
20036         GCC_DIAG_IGNORE(-Wformat-nonliteral);
20037         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
20038         GCC_DIAG_RESTORE;
20039         break;
20040     }
20041 }
20042
20043 STATIC void
20044 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
20045 {
20046     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
20047      * 'invlist' */
20048
20049     UV start, end;
20050     bool allow_literals = TRUE;
20051
20052     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
20053
20054     /* Generally, it is more readable if printable characters are output as
20055      * literals, but if a range (nearly) spans all of them, it's best to output
20056      * it as a single range.  This code will use a single range if all but 2
20057      * ASCII printables are in it */
20058     invlist_iterinit(invlist);
20059     while (invlist_iternext(invlist, &start, &end)) {
20060
20061         /* If the range starts beyond the final printable, it doesn't have any
20062          * in it */
20063         if (start > MAX_PRINT_A) {
20064             break;
20065         }
20066
20067         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
20068          * all but two, the range must start and end no later than 2 from
20069          * either end */
20070         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
20071             if (end > MAX_PRINT_A) {
20072                 end = MAX_PRINT_A;
20073             }
20074             if (start < ' ') {
20075                 start = ' ';
20076             }
20077             if (end - start >= MAX_PRINT_A - ' ' - 2) {
20078                 allow_literals = FALSE;
20079             }
20080             break;
20081         }
20082     }
20083     invlist_iterfinish(invlist);
20084
20085     /* Here we have figured things out.  Output each range */
20086     invlist_iterinit(invlist);
20087     while (invlist_iternext(invlist, &start, &end)) {
20088         if (start >= NUM_ANYOF_CODE_POINTS) {
20089             break;
20090         }
20091         put_range(sv, start, end, allow_literals);
20092     }
20093     invlist_iterfinish(invlist);
20094
20095     return;
20096 }
20097
20098 STATIC SV*
20099 S_put_charclass_bitmap_innards_common(pTHX_
20100         SV* invlist,            /* The bitmap */
20101         SV* posixes,            /* Under /l, things like [:word:], \S */
20102         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
20103         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
20104         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
20105         const bool invert       /* Is the result to be inverted? */
20106 )
20107 {
20108     /* Create and return an SV containing a displayable version of the bitmap
20109      * and associated information determined by the input parameters.  If the
20110      * output would have been only the inversion indicator '^', NULL is instead
20111      * returned. */
20112
20113     SV * output;
20114
20115     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
20116
20117     if (invert) {
20118         output = newSVpvs("^");
20119     }
20120     else {
20121         output = newSVpvs("");
20122     }
20123
20124     /* First, the code points in the bitmap that are unconditionally there */
20125     put_charclass_bitmap_innards_invlist(output, invlist);
20126
20127     /* Traditionally, these have been placed after the main code points */
20128     if (posixes) {
20129         sv_catsv(output, posixes);
20130     }
20131
20132     if (only_utf8 && _invlist_len(only_utf8)) {
20133         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
20134         put_charclass_bitmap_innards_invlist(output, only_utf8);
20135     }
20136
20137     if (not_utf8 && _invlist_len(not_utf8)) {
20138         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
20139         put_charclass_bitmap_innards_invlist(output, not_utf8);
20140     }
20141
20142     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
20143         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
20144         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
20145
20146         /* This is the only list in this routine that can legally contain code
20147          * points outside the bitmap range.  The call just above to
20148          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
20149          * output them here.  There's about a half-dozen possible, and none in
20150          * contiguous ranges longer than 2 */
20151         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
20152             UV start, end;
20153             SV* above_bitmap = NULL;
20154
20155             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
20156
20157             invlist_iterinit(above_bitmap);
20158             while (invlist_iternext(above_bitmap, &start, &end)) {
20159                 UV i;
20160
20161                 for (i = start; i <= end; i++) {
20162                     put_code_point(output, i);
20163                 }
20164             }
20165             invlist_iterfinish(above_bitmap);
20166             SvREFCNT_dec_NN(above_bitmap);
20167         }
20168     }
20169
20170     if (invert && SvCUR(output) == 1) {
20171         return NULL;
20172     }
20173
20174     return output;
20175 }
20176
20177 STATIC bool
20178 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
20179                                      char *bitmap,
20180                                      SV *nonbitmap_invlist,
20181                                      SV *only_utf8_locale_invlist,
20182                                      const regnode * const node,
20183                                      const bool force_as_is_display)
20184 {
20185     /* Appends to 'sv' a displayable version of the innards of the bracketed
20186      * character class defined by the other arguments:
20187      *  'bitmap' points to the bitmap.
20188      *  'nonbitmap_invlist' is an inversion list of the code points that are in
20189      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
20190      *      none.  The reasons for this could be that they require some
20191      *      condition such as the target string being or not being in UTF-8
20192      *      (under /d), or because they came from a user-defined property that
20193      *      was not resolved at the time of the regex compilation (under /u)
20194      *  'only_utf8_locale_invlist' is an inversion list of the code points that
20195      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
20196      *  'node' is the regex pattern node.  It is needed only when the above two
20197      *      parameters are not null, and is passed so that this routine can
20198      *      tease apart the various reasons for them.
20199      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
20200      *      to invert things to see if that leads to a cleaner display.  If
20201      *      FALSE, this routine is free to use its judgment about doing this.
20202      *
20203      * It returns TRUE if there was actually something output.  (It may be that
20204      * the bitmap, etc is empty.)
20205      *
20206      * When called for outputting the bitmap of a non-ANYOF node, just pass the
20207      * bitmap, with the succeeding parameters set to NULL, and the final one to
20208      * FALSE.
20209      */
20210
20211     /* In general, it tries to display the 'cleanest' representation of the
20212      * innards, choosing whether to display them inverted or not, regardless of
20213      * whether the class itself is to be inverted.  However,  there are some
20214      * cases where it can't try inverting, as what actually matches isn't known
20215      * until runtime, and hence the inversion isn't either. */
20216     bool inverting_allowed = ! force_as_is_display;
20217
20218     int i;
20219     STRLEN orig_sv_cur = SvCUR(sv);
20220
20221     SV* invlist;            /* Inversion list we accumulate of code points that
20222                                are unconditionally matched */
20223     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
20224                                UTF-8 */
20225     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
20226                              */
20227     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
20228     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
20229                                        is UTF-8 */
20230
20231     SV* as_is_display;      /* The output string when we take the inputs
20232                                literally */
20233     SV* inverted_display;   /* The output string when we invert the inputs */
20234
20235     U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
20236
20237     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
20238                                                    to match? */
20239     /* We are biased in favor of displaying things without them being inverted,
20240      * as that is generally easier to understand */
20241     const int bias = 5;
20242
20243     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
20244
20245     /* Start off with whatever code points are passed in.  (We clone, so we
20246      * don't change the caller's list) */
20247     if (nonbitmap_invlist) {
20248         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
20249         invlist = invlist_clone(nonbitmap_invlist);
20250     }
20251     else {  /* Worst case size is every other code point is matched */
20252         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
20253     }
20254
20255     if (flags) {
20256         if (OP(node) == ANYOFD) {
20257
20258             /* This flag indicates that the code points below 0x100 in the
20259              * nonbitmap list are precisely the ones that match only when the
20260              * target is UTF-8 (they should all be non-ASCII). */
20261             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
20262             {
20263                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
20264                 _invlist_subtract(invlist, only_utf8, &invlist);
20265             }
20266
20267             /* And this flag for matching all non-ASCII 0xFF and below */
20268             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
20269             {
20270                 not_utf8 = invlist_clone(PL_UpperLatin1);
20271             }
20272         }
20273         else if (OP(node) == ANYOFL) {
20274
20275             /* If either of these flags are set, what matches isn't
20276              * determinable except during execution, so don't know enough here
20277              * to invert */
20278             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
20279                 inverting_allowed = FALSE;
20280             }
20281
20282             /* What the posix classes match also varies at runtime, so these
20283              * will be output symbolically. */
20284             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
20285                 int i;
20286
20287                 posixes = newSVpvs("");
20288                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
20289                     if (ANYOF_POSIXL_TEST(node,i)) {
20290                         sv_catpv(posixes, anyofs[i]);
20291                     }
20292                 }
20293             }
20294         }
20295     }
20296
20297     /* Accumulate the bit map into the unconditional match list */
20298     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
20299         if (BITMAP_TEST(bitmap, i)) {
20300             int start = i++;
20301             for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
20302                 /* empty */
20303             }
20304             invlist = _add_range_to_invlist(invlist, start, i-1);
20305         }
20306     }
20307
20308     /* Make sure that the conditional match lists don't have anything in them
20309      * that match unconditionally; otherwise the output is quite confusing.
20310      * This could happen if the code that populates these misses some
20311      * duplication. */
20312     if (only_utf8) {
20313         _invlist_subtract(only_utf8, invlist, &only_utf8);
20314     }
20315     if (not_utf8) {
20316         _invlist_subtract(not_utf8, invlist, &not_utf8);
20317     }
20318
20319     if (only_utf8_locale_invlist) {
20320
20321         /* Since this list is passed in, we have to make a copy before
20322          * modifying it */
20323         only_utf8_locale = invlist_clone(only_utf8_locale_invlist);
20324
20325         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
20326
20327         /* And, it can get really weird for us to try outputting an inverted
20328          * form of this list when it has things above the bitmap, so don't even
20329          * try */
20330         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
20331             inverting_allowed = FALSE;
20332         }
20333     }
20334
20335     /* Calculate what the output would be if we take the input as-is */
20336     as_is_display = put_charclass_bitmap_innards_common(invlist,
20337                                                     posixes,
20338                                                     only_utf8,
20339                                                     not_utf8,
20340                                                     only_utf8_locale,
20341                                                     invert);
20342
20343     /* If have to take the output as-is, just do that */
20344     if (! inverting_allowed) {
20345         if (as_is_display) {
20346             sv_catsv(sv, as_is_display);
20347             SvREFCNT_dec_NN(as_is_display);
20348         }
20349     }
20350     else { /* But otherwise, create the output again on the inverted input, and
20351               use whichever version is shorter */
20352
20353         int inverted_bias, as_is_bias;
20354
20355         /* We will apply our bias to whichever of the the results doesn't have
20356          * the '^' */
20357         if (invert) {
20358             invert = FALSE;
20359             as_is_bias = bias;
20360             inverted_bias = 0;
20361         }
20362         else {
20363             invert = TRUE;
20364             as_is_bias = 0;
20365             inverted_bias = bias;
20366         }
20367
20368         /* Now invert each of the lists that contribute to the output,
20369          * excluding from the result things outside the possible range */
20370
20371         /* For the unconditional inversion list, we have to add in all the
20372          * conditional code points, so that when inverted, they will be gone
20373          * from it */
20374         _invlist_union(only_utf8, invlist, &invlist);
20375         _invlist_union(not_utf8, invlist, &invlist);
20376         _invlist_union(only_utf8_locale, invlist, &invlist);
20377         _invlist_invert(invlist);
20378         _invlist_intersection(invlist, PL_InBitmap, &invlist);
20379
20380         if (only_utf8) {
20381             _invlist_invert(only_utf8);
20382             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
20383         }
20384         else if (not_utf8) {
20385
20386             /* If a code point matches iff the target string is not in UTF-8,
20387              * then complementing the result has it not match iff not in UTF-8,
20388              * which is the same thing as matching iff it is UTF-8. */
20389             only_utf8 = not_utf8;
20390             not_utf8 = NULL;
20391         }
20392
20393         if (only_utf8_locale) {
20394             _invlist_invert(only_utf8_locale);
20395             _invlist_intersection(only_utf8_locale,
20396                                   PL_InBitmap,
20397                                   &only_utf8_locale);
20398         }
20399
20400         inverted_display = put_charclass_bitmap_innards_common(
20401                                             invlist,
20402                                             posixes,
20403                                             only_utf8,
20404                                             not_utf8,
20405                                             only_utf8_locale, invert);
20406
20407         /* Use the shortest representation, taking into account our bias
20408          * against showing it inverted */
20409         if (   inverted_display
20410             && (   ! as_is_display
20411                 || (  SvCUR(inverted_display) + inverted_bias
20412                     < SvCUR(as_is_display)    + as_is_bias)))
20413         {
20414             sv_catsv(sv, inverted_display);
20415         }
20416         else if (as_is_display) {
20417             sv_catsv(sv, as_is_display);
20418         }
20419
20420         SvREFCNT_dec(as_is_display);
20421         SvREFCNT_dec(inverted_display);
20422     }
20423
20424     SvREFCNT_dec_NN(invlist);
20425     SvREFCNT_dec(only_utf8);
20426     SvREFCNT_dec(not_utf8);
20427     SvREFCNT_dec(posixes);
20428     SvREFCNT_dec(only_utf8_locale);
20429
20430     return SvCUR(sv) > orig_sv_cur;
20431 }
20432
20433 #define CLEAR_OPTSTART                                                       \
20434     if (optstart) STMT_START {                                               \
20435         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
20436                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
20437         optstart=NULL;                                                       \
20438     } STMT_END
20439
20440 #define DUMPUNTIL(b,e)                                                       \
20441                     CLEAR_OPTSTART;                                          \
20442                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
20443
20444 STATIC const regnode *
20445 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
20446             const regnode *last, const regnode *plast,
20447             SV* sv, I32 indent, U32 depth)
20448 {
20449     U8 op = PSEUDO;     /* Arbitrary non-END op. */
20450     const regnode *next;
20451     const regnode *optstart= NULL;
20452
20453     RXi_GET_DECL(r,ri);
20454     GET_RE_DEBUG_FLAGS_DECL;
20455
20456     PERL_ARGS_ASSERT_DUMPUNTIL;
20457
20458 #ifdef DEBUG_DUMPUNTIL
20459     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n",indent,node-start,
20460         last ? last-start : 0,plast ? plast-start : 0);
20461 #endif
20462
20463     if (plast && plast < last)
20464         last= plast;
20465
20466     while (PL_regkind[op] != END && (!last || node < last)) {
20467         assert(node);
20468         /* While that wasn't END last time... */
20469         NODE_ALIGN(node);
20470         op = OP(node);
20471         if (op == CLOSE || op == WHILEM)
20472             indent--;
20473         next = regnext((regnode *)node);
20474
20475         /* Where, what. */
20476         if (OP(node) == OPTIMIZED) {
20477             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
20478                 optstart = node;
20479             else
20480                 goto after_print;
20481         } else
20482             CLEAR_OPTSTART;
20483
20484         regprop(r, sv, node, NULL, NULL);
20485         Perl_re_printf( aTHX_  "%4"IVdf":%*s%s", (IV)(node - start),
20486                       (int)(2*indent + 1), "", SvPVX_const(sv));
20487
20488         if (OP(node) != OPTIMIZED) {
20489             if (next == NULL)           /* Next ptr. */
20490                 Perl_re_printf( aTHX_  " (0)");
20491             else if (PL_regkind[(U8)op] == BRANCH
20492                      && PL_regkind[OP(next)] != BRANCH )
20493                 Perl_re_printf( aTHX_  " (FAIL)");
20494             else
20495                 Perl_re_printf( aTHX_  " (%"IVdf")", (IV)(next - start));
20496             Perl_re_printf( aTHX_ "\n");
20497         }
20498
20499       after_print:
20500         if (PL_regkind[(U8)op] == BRANCHJ) {
20501             assert(next);
20502             {
20503                 const regnode *nnode = (OP(next) == LONGJMP
20504                                        ? regnext((regnode *)next)
20505                                        : next);
20506                 if (last && nnode > last)
20507                     nnode = last;
20508                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
20509             }
20510         }
20511         else if (PL_regkind[(U8)op] == BRANCH) {
20512             assert(next);
20513             DUMPUNTIL(NEXTOPER(node), next);
20514         }
20515         else if ( PL_regkind[(U8)op]  == TRIE ) {
20516             const regnode *this_trie = node;
20517             const char op = OP(node);
20518             const U32 n = ARG(node);
20519             const reg_ac_data * const ac = op>=AHOCORASICK ?
20520                (reg_ac_data *)ri->data->data[n] :
20521                NULL;
20522             const reg_trie_data * const trie =
20523                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
20524 #ifdef DEBUGGING
20525             AV *const trie_words
20526                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
20527 #endif
20528             const regnode *nextbranch= NULL;
20529             I32 word_idx;
20530             sv_setpvs(sv, "");
20531             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
20532                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
20533
20534                 Perl_re_indentf( aTHX_  "%s ",
20535                     indent+3,
20536                     elem_ptr
20537                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
20538                                 SvCUR(*elem_ptr), 60,
20539                                 PL_colors[0], PL_colors[1],
20540                                 (SvUTF8(*elem_ptr)
20541                                  ? PERL_PV_ESCAPE_UNI
20542                                  : 0)
20543                                 | PERL_PV_PRETTY_ELLIPSES
20544                                 | PERL_PV_PRETTY_LTGT
20545                             )
20546                     : "???"
20547                 );
20548                 if (trie->jump) {
20549                     U16 dist= trie->jump[word_idx+1];
20550                     Perl_re_printf( aTHX_  "(%"UVuf")\n",
20551                                (UV)((dist ? this_trie + dist : next) - start));
20552                     if (dist) {
20553                         if (!nextbranch)
20554                             nextbranch= this_trie + trie->jump[0];
20555                         DUMPUNTIL(this_trie + dist, nextbranch);
20556                     }
20557                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
20558                         nextbranch= regnext((regnode *)nextbranch);
20559                 } else {
20560                     Perl_re_printf( aTHX_  "\n");
20561                 }
20562             }
20563             if (last && next > last)
20564                 node= last;
20565             else
20566                 node= next;
20567         }
20568         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
20569             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
20570                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
20571         }
20572         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
20573             assert(next);
20574             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
20575         }
20576         else if ( op == PLUS || op == STAR) {
20577             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
20578         }
20579         else if (PL_regkind[(U8)op] == ANYOF) {
20580             /* arglen 1 + class block */
20581             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
20582                           ? ANYOF_POSIXL_SKIP
20583                           : ANYOF_SKIP);
20584             node = NEXTOPER(node);
20585         }
20586         else if (PL_regkind[(U8)op] == EXACT) {
20587             /* Literal string, where present. */
20588             node += NODE_SZ_STR(node) - 1;
20589             node = NEXTOPER(node);
20590         }
20591         else {
20592             node = NEXTOPER(node);
20593             node += regarglen[(U8)op];
20594         }
20595         if (op == CURLYX || op == OPEN)
20596             indent++;
20597     }
20598     CLEAR_OPTSTART;
20599 #ifdef DEBUG_DUMPUNTIL
20600     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
20601 #endif
20602     return node;
20603 }
20604
20605 #endif  /* DEBUGGING */
20606
20607 /*
20608  * ex: set ts=8 sts=4 sw=4 et:
20609  */