regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_inline.h"
  90 #include "invlist_inline.h"
  91 #include "unicode_constants.h"
  92
  93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  94  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  96  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99
 100 #ifndef STATIC
 101 #define STATIC  static
 102 #endif
 103
 104 #ifndef MIN
 105 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 106 #endif
 107
 108 /* this is a chain of data about sub patterns we are processing that
 109    need to be handled separately/specially in study_chunk. Its so
 110    we can simulate recursion without losing state.  */
 111 struct scan_frame;
 112 typedef struct scan_frame {
 113     regnode *last_regnode;      /* last node to process in this frame */
 114     regnode *next_regnode;      /* next node to process when last is reached */
 115     U32 prev_recursed_depth;
 116     I32 stopparen;              /* what stopparen do we use */
 117     U32 is_top_frame;           /* what flags do we use? */
 118
 119     struct scan_frame *this_prev_frame; /* this previous frame */
 120     struct scan_frame *prev_frame;      /* previous frame */
 121     struct scan_frame *next_frame;      /* next frame */
 122 } scan_frame;
 123
 124 /* Certain characters are output as a sequence with the first being a
 125  * backslash. */
 126 #define isBACKSLASHED_PUNCT(c)                                              \
 127                     ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
 128
 129
 130 struct RExC_state_t {
 131     U32         flags;                  /* RXf_* are we folding, multilining? */
 132     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 133     char        *precomp;               /* uncompiled string. */
 134     char        *precomp_end;           /* pointer to end of uncompiled string. */
 135     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 136     regexp      *rx;                    /* perl core regexp structure */
 137     regexp_internal     *rxi;           /* internal data for regexp object
 138                                            pprivate field */
 139     char        *start;                 /* Start of input for compile */
 140     char        *end;                   /* End of input for compile */
 141     char        *parse;                 /* Input-scan pointer. */
 142     char        *adjusted_start;        /* 'start', adjusted.  See code use */
 143     STRLEN      precomp_adj;            /* an offset beyond precomp.  See code use */
 144     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 145     regnode     *emit_start;            /* Start of emitted-code area */
 146     regnode     *emit_bound;            /* First regnode outside of the
 147                                            allocated space */
 148     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 149                                            implies compiling, so don't emit */
 150     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 151                                            large enough for the largest
 152                                            non-EXACTish node, so can use it as
 153                                            scratch in pass1 */
 154     I32         naughty;                /* How bad is this pattern? */
 155     I32         sawback;                /* Did we see \1, ...? */
 156     U32         seen;
 157     SSize_t     size;                   /* Code size. */
 158     I32                npar;            /* Capture buffer count, (OPEN) plus
 159                                            one. ("par" 0 is the whole
 160                                            pattern)*/
 161     I32         nestroot;               /* root parens we are in - used by
 162                                            accept */
 163     I32         extralen;
 164     I32         seen_zerolen;
 165     regnode     **open_parens;          /* pointers to open parens */
 166     regnode     **close_parens;         /* pointers to close parens */
 167     regnode     *opend;                 /* END node in program */
 168     I32         utf8;           /* whether the pattern is utf8 or not */
 169     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 170                                 /* XXX use this for future optimisation of case
 171                                  * where pattern must be upgraded to utf8. */
 172     I32         uni_semantics;  /* If a d charset modifier should use unicode
 173                                    rules, even if the pattern is not in
 174                                    utf8 */
 175     HV          *paren_names;           /* Paren names */
 176
 177     regnode     **recurse;              /* Recurse regops */
 178     I32         recurse_count;          /* Number of recurse regops */
 179     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 180                                            through */
 181     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 182     I32         in_lookbehind;
 183     I32         contains_locale;
 184     I32         contains_i;
 185     I32         override_recoding;
 186 #ifdef EBCDIC
 187     I32         recode_x_to_native;
 188 #endif
 189     I32         in_multi_char_class;
 190     struct reg_code_block *code_blocks; /* positions of literal (?{})
 191                                             within pattern */
 192     int         num_code_blocks;        /* size of code_blocks[] */
 193     int         code_index;             /* next code_blocks[] slot */
 194     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 195     scan_frame *frame_head;
 196     scan_frame *frame_last;
 197     U32         frame_count;
 198     U32         strict;
 199 #ifdef ADD_TO_REGEXEC
 200     char        *starttry;              /* -Dr: where regtry was called. */
 201 #define RExC_starttry   (pRExC_state->starttry)
 202 #endif
 203     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 204 #ifdef DEBUGGING
 205     const char  *lastparse;
 206     I32         lastnum;
 207     AV          *paren_name_list;       /* idx -> name */
 208     U32         study_chunk_recursed_count;
 209     SV          *mysv1;
 210     SV          *mysv2;
 211 #define RExC_lastparse  (pRExC_state->lastparse)
 212 #define RExC_lastnum    (pRExC_state->lastnum)
 213 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 214 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 215 #define RExC_mysv       (pRExC_state->mysv1)
 216 #define RExC_mysv1      (pRExC_state->mysv1)
 217 #define RExC_mysv2      (pRExC_state->mysv2)
 218
 219 #endif
 220     bool        seen_unfolded_sharp_s;
 221 };
 222
 223 #define RExC_flags      (pRExC_state->flags)
 224 #define RExC_pm_flags   (pRExC_state->pm_flags)
 225 #define RExC_precomp    (pRExC_state->precomp)
 226 #define RExC_precomp_adj (pRExC_state->precomp_adj)
 227 #define RExC_adjusted_start  (pRExC_state->adjusted_start)
 228 #define RExC_precomp_end (pRExC_state->precomp_end)
 229 #define RExC_rx_sv      (pRExC_state->rx_sv)
 230 #define RExC_rx         (pRExC_state->rx)
 231 #define RExC_rxi        (pRExC_state->rxi)
 232 #define RExC_start      (pRExC_state->start)
 233 #define RExC_end        (pRExC_state->end)
 234 #define RExC_parse      (pRExC_state->parse)
 235 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 236
 237 /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
 238  * EXACTF node, hence was parsed under /di rules.  If later in the parse,
 239  * something forces the pattern into using /ui rules, the sharp s should be
 240  * folded into the sequence 'ss', which takes up more space than previously
 241  * calculated.  This means that the sizing pass needs to be restarted.  (The
 242  * node also becomes an EXACTFU_SS.)  For all other characters, an EXACTF node
 243  * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
 244  * so there is no need to resize [perl #125990]. */
 245 #define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
 246
 247 #ifdef RE_TRACK_PATTERN_OFFSETS
 248 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 249                                                          others */
 250 #endif
 251 #define RExC_emit       (pRExC_state->emit)
 252 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 253 #define RExC_emit_start (pRExC_state->emit_start)
 254 #define RExC_emit_bound (pRExC_state->emit_bound)
 255 #define RExC_sawback    (pRExC_state->sawback)
 256 #define RExC_seen       (pRExC_state->seen)
 257 #define RExC_size       (pRExC_state->size)
 258 #define RExC_maxlen        (pRExC_state->maxlen)
 259 #define RExC_npar       (pRExC_state->npar)
 260 #define RExC_nestroot   (pRExC_state->nestroot)
 261 #define RExC_extralen   (pRExC_state->extralen)
 262 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 263 #define RExC_utf8       (pRExC_state->utf8)
 264 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 265 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 266 #define RExC_open_parens        (pRExC_state->open_parens)
 267 #define RExC_close_parens       (pRExC_state->close_parens)
 268 #define RExC_opend      (pRExC_state->opend)
 269 #define RExC_paren_names        (pRExC_state->paren_names)
 270 #define RExC_recurse    (pRExC_state->recurse)
 271 #define RExC_recurse_count      (pRExC_state->recurse_count)
 272 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 273 #define RExC_study_chunk_recursed_bytes  \
 274                                    (pRExC_state->study_chunk_recursed_bytes)
 275 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 276 #define RExC_contains_locale    (pRExC_state->contains_locale)
 277 #define RExC_contains_i (pRExC_state->contains_i)
 278 #define RExC_override_recoding (pRExC_state->override_recoding)
 279 #ifdef EBCDIC
 280 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 281 #endif
 282 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 283 #define RExC_frame_head (pRExC_state->frame_head)
 284 #define RExC_frame_last (pRExC_state->frame_last)
 285 #define RExC_frame_count (pRExC_state->frame_count)
 286 #define RExC_strict (pRExC_state->strict)
 287
 288 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 289  * a flag to disable back-off on the fixed/floating substrings - if it's
 290  * a high complexity pattern we assume the benefit of avoiding a full match
 291  * is worth the cost of checking for the substrings even if they rarely help.
 292  */
 293 #define RExC_naughty    (pRExC_state->naughty)
 294 #define TOO_NAUGHTY (10)
 295 #define MARK_NAUGHTY(add) \
 296     if (RExC_naughty < TOO_NAUGHTY) \
 297         RExC_naughty += (add)
 298 #define MARK_NAUGHTY_EXP(exp, add) \
 299     if (RExC_naughty < TOO_NAUGHTY) \
 300         RExC_naughty += RExC_naughty / (exp) + (add)
 301
 302 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 303 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 304         ((*s) == '{' && regcurly(s)))
 305
 306 /*
 307  * Flags to be passed up and down.
 308  */
 309 #define WORST           0       /* Worst case. */
 310 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 311
 312 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 313  * character.  (There needs to be a case: in the switch statement in regexec.c
 314  * for any node marked SIMPLE.)  Note that this is not the same thing as
 315  * REGNODE_SIMPLE */
 316 #define SIMPLE          0x02
 317 #define SPSTART         0x04    /* Starts with * or + */
 318 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 319 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 320 #define RESTART_PASS1   0x20    /* Need to restart sizing pass */
 321 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PASS1, need to
 322                                    calcuate sizes as UTF-8 */
 323
 324 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 325
 326 /* whether trie related optimizations are enabled */
 327 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 328 #define TRIE_STUDY_OPT
 329 #define FULL_TRIE_STUDY
 330 #define TRIE_STCLASS
 331 #endif
 332
 333
 334
 335 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 336 #define PBITVAL(paren) (1 << ((paren) & 7))
 337 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 338 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 339 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 340
 341 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 342                                      if (!UTF) {                           \
 343                                          assert(PASS1);                    \
 344                                          *flagp = RESTART_PASS1|NEED_UTF8; \
 345                                          return NULL;                      \
 346                                      }                                     \
 347                              } STMT_END
 348
 349 /* Change from /d into /u rules, and restart the parse if we've already seen
 350  * something whose size would increase as a result, by setting *flagp and
 351  * returning 'restart_retval'.  RExC_uni_semantics is a flag that indicates
 352  * we've change to /u during the parse.  */
 353 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 354     STMT_START {                                                            \
 355             if (DEPENDS_SEMANTICS) {                                        \
 356                 assert(PASS1);                                              \
 357                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 358                 RExC_uni_semantics = 1;                                     \
 359                 if (RExC_seen_unfolded_sharp_s) {                           \
 360                     *flagp |= RESTART_PASS1;                                \
 361                     return restart_retval;                                  \
 362                 }                                                           \
 363             }                                                               \
 364     } STMT_END
 365
 366 /* This converts the named class defined in regcomp.h to its equivalent class
 367  * number defined in handy.h. */
 368 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 369 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 370
 371 #define _invlist_union_complement_2nd(a, b, output) \
 372                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 373 #define _invlist_intersection_complement_2nd(a, b, output) \
 374                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 375
 376 /* About scan_data_t.
 377
 378   During optimisation we recurse through the regexp program performing
 379   various inplace (keyhole style) optimisations. In addition study_chunk
 380   and scan_commit populate this data structure with information about
 381   what strings MUST appear in the pattern. We look for the longest
 382   string that must appear at a fixed location, and we look for the
 383   longest string that may appear at a floating location. So for instance
 384   in the pattern:
 385
 386     /FOO[xX]A.*B[xX]BAR/
 387
 388   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 389   strings (because they follow a .* construct). study_chunk will identify
 390   both FOO and BAR as being the longest fixed and floating strings respectively.
 391
 392   The strings can be composites, for instance
 393
 394      /(f)(o)(o)/
 395
 396   will result in a composite fixed substring 'foo'.
 397
 398   For each string some basic information is maintained:
 399
 400   - offset or min_offset
 401     This is the position the string must appear at, or not before.
 402     It also implicitly (when combined with minlenp) tells us how many
 403     characters must match before the string we are searching for.
 404     Likewise when combined with minlenp and the length of the string it
 405     tells us how many characters must appear after the string we have
 406     found.
 407
 408   - max_offset
 409     Only used for floating strings. This is the rightmost point that
 410     the string can appear at. If set to SSize_t_MAX it indicates that the
 411     string can occur infinitely far to the right.
 412
 413   - minlenp
 414     A pointer to the minimum number of characters of the pattern that the
 415     string was found inside. This is important as in the case of positive
 416     lookahead or positive lookbehind we can have multiple patterns
 417     involved. Consider
 418
 419     /(?=FOO).*F/
 420
 421     The minimum length of the pattern overall is 3, the minimum length
 422     of the lookahead part is 3, but the minimum length of the part that
 423     will actually match is 1. So 'FOO's minimum length is 3, but the
 424     minimum length for the F is 1. This is important as the minimum length
 425     is used to determine offsets in front of and behind the string being
 426     looked for.  Since strings can be composites this is the length of the
 427     pattern at the time it was committed with a scan_commit. Note that
 428     the length is calculated by study_chunk, so that the minimum lengths
 429     are not known until the full pattern has been compiled, thus the
 430     pointer to the value.
 431
 432   - lookbehind
 433
 434     In the case of lookbehind the string being searched for can be
 435     offset past the start point of the final matching string.
 436     If this value was just blithely removed from the min_offset it would
 437     invalidate some of the calculations for how many chars must match
 438     before or after (as they are derived from min_offset and minlen and
 439     the length of the string being searched for).
 440     When the final pattern is compiled and the data is moved from the
 441     scan_data_t structure into the regexp structure the information
 442     about lookbehind is factored in, with the information that would
 443     have been lost precalculated in the end_shift field for the
 444     associated string.
 445
 446   The fields pos_min and pos_delta are used to store the minimum offset
 447   and the delta to the maximum offset at the current point in the pattern.
 448
 449 */
 450
 451 typedef struct scan_data_t {
 452     /*I32 len_min;      unused */
 453     /*I32 len_delta;    unused */
 454     SSize_t pos_min;
 455     SSize_t pos_delta;
 456     SV *last_found;
 457     SSize_t last_end;       /* min value, <0 unless valid. */
 458     SSize_t last_start_min;
 459     SSize_t last_start_max;
 460     SV **longest;           /* Either &l_fixed, or &l_float. */
 461     SV *longest_fixed;      /* longest fixed string found in pattern */
 462     SSize_t offset_fixed;   /* offset where it starts */
 463     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 464     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 465     SV *longest_float;      /* longest floating string found in pattern */
 466     SSize_t offset_float_min; /* earliest point in string it can appear */
 467     SSize_t offset_float_max; /* latest point in string it can appear */
 468     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 469     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 470     I32 flags;
 471     I32 whilem_c;
 472     SSize_t *last_closep;
 473     regnode_ssc *start_class;
 474 } scan_data_t;
 475
 476 /*
 477  * Forward declarations for pregcomp()'s friends.
 478  */
 479
 480 static const scan_data_t zero_scan_data =
 481   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 482
 483 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 484 #define SF_BEFORE_SEOL          0x0001
 485 #define SF_BEFORE_MEOL          0x0002
 486 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 487 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 488
 489 #define SF_FIX_SHIFT_EOL        (+2)
 490 #define SF_FL_SHIFT_EOL         (+4)
 491
 492 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 493 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 494
 495 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 496 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 497 #define SF_IS_INF               0x0040
 498 #define SF_HAS_PAR              0x0080
 499 #define SF_IN_PAR               0x0100
 500 #define SF_HAS_EVAL             0x0200
 501 #define SCF_DO_SUBSTR           0x0400
 502 #define SCF_DO_STCLASS_AND      0x0800
 503 #define SCF_DO_STCLASS_OR       0x1000
 504 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 505 #define SCF_WHILEM_VISITED_POS  0x2000
 506
 507 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 508 #define SCF_SEEN_ACCEPT         0x8000
 509 #define SCF_TRIE_DOING_RESTUDY 0x10000
 510 #define SCF_IN_DEFINE          0x20000
 511
 512
 513
 514
 515 #define UTF cBOOL(RExC_utf8)
 516
 517 /* The enums for all these are ordered so things work out correctly */
 518 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 519 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 520                                                      == REGEX_DEPENDS_CHARSET)
 521 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 522 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 523                                                      >= REGEX_UNICODE_CHARSET)
 524 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 525                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 526 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 527                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 528 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 529                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 530
 531 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 532
 533 /* For programs that want to be strictly Unicode compatible by dying if any
 534  * attempt is made to match a non-Unicode code point against a Unicode
 535  * property.  */
 536 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 537
 538 #define OOB_NAMEDCLASS          -1
 539
 540 /* There is no code point that is out-of-bounds, so this is problematic.  But
 541  * its only current use is to initialize a variable that is always set before
 542  * looked at. */
 543 #define OOB_UNICODE             0xDEADBEEF
 544
 545 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 546 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 547
 548
 549 /* length of regex to show in messages that don't mark a position within */
 550 #define RegexLengthToShowInErrorMessages 127
 551
 552 /*
 553  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 554  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 555  * op/pragma/warn/regcomp.
 556  */
 557 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 558 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 559
 560 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 561                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 562
 563 /* The code in this file in places uses one level of recursion with parsing
 564  * rebased to an alternate string constructed by us in memory.  This can take
 565  * the form of something that is completely different from the input, or
 566  * something that uses the input as part of the alternate.  In the first case,
 567  * there should be no possibility of an error, as we are in complete control of
 568  * the alternate string.  But in the second case we don't control the input
 569  * portion, so there may be errors in that.  Here's an example:
 570  *      /[abc\x{DF}def]/ui
 571  * is handled specially because \x{df} folds to a sequence of more than one
 572  * character, 'ss'.  What is done is to create and parse an alternate string,
 573  * which looks like this:
 574  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 575  * where it uses the input unchanged in the middle of something it constructs,
 576  * which is a branch for the DF outside the character class, and clustering
 577  * parens around the whole thing. (It knows enough to skip the DF inside the
 578  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 579  * need to be reported.  The general situation looks like this:
 580  *
 581  *              sI                       tI               xI       eI
 582  * Input:       ----------------------------------------------------
 583  * Constructed:         ---------------------------------------------------
 584  *                      sC               tC               xC       eC     EC
 585  *
 586  * The input string sI..eI is the input pattern.  The string sC..EC is the
 587  * constructed substitute parse string.  The portions sC..tC and eC..EC are
 588  * constructed by us.  The portion tC..eC is an exact duplicate of the input
 589  * pattern tI..eI.  In the diagram, these are vertically aligned.  Suppose that
 590  * while parsing, we find an error at xC.  We want to display a message showing
 591  * the real input string.  Thus we need to find the point xI in it which
 592  * corresponds to xC.  xC >= tC, since the portion of the string sC..tC has
 593  * been constructed by us, and so shouldn't have errors.  We get:
 594  *
 595  *      xI = sI + (tI - sI) + (xC - tC)
 596  *
 597  * and, the offset into sI is:
 598  *
 599  *      (xI - sI) = (tI - sI) + (xC - tC)
 600  *
 601  * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
 602  * and we save tC as RExC_adjusted_start.
 603  *
 604  * During normal processing of the input pattern, everything points to that,
 605  * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
 606  */
 607
 608 #define tI_sI           RExC_precomp_adj
 609 #define tC              RExC_adjusted_start
 610 #define sC              RExC_precomp
 611 #define xI_offset(xC)   ((IV) (tI_sI + (xC - tC)))
 612 #define xI(xC)          (sC + xI_offset(xC))
 613 #define eC              RExC_precomp_end
 614
 615 #define REPORT_LOCATION_ARGS(xC)                                            \
 616     UTF8fARG(UTF,                                                           \
 617              (xI(xC) > eC) /* Don't run off end */                          \
 618               ? eC - sC   /* Length before the <--HERE */                   \
 619               : xI_offset(xC),                                              \
 620              sC),         /* The input pattern printed up to the <--HERE */ \
 621     UTF8fARG(UTF,                                                           \
 622              (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */    \
 623              (xI(xC) > eC) ? eC : xI(xC))     /* pattern after <--HERE */
 624
 625 /* Used to point after bad bytes for an error message, but avoid skipping
 626  * past a nul byte. */
 627 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
 628
 629 /*
 630  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 631  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 632  * "...".
 633  */
 634 #define _FAIL(code) STMT_START {                                        \
 635     const char *ellipses = "";                                          \
 636     IV len = RExC_precomp_end - RExC_precomp;                                   \
 637                                                                         \
 638     if (!SIZE_ONLY)                                                     \
 639         SAVEFREESV(RExC_rx_sv);                                         \
 640     if (len > RegexLengthToShowInErrorMessages) {                       \
 641         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 642         len = RegexLengthToShowInErrorMessages - 10;                    \
 643         ellipses = "...";                                               \
 644     }                                                                   \
 645     code;                                                               \
 646 } STMT_END
 647
 648 #define FAIL(msg) _FAIL(                            \
 649     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 650             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 651
 652 #define FAIL2(msg,arg) _FAIL(                       \
 653     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 654             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 655
 656 /*
 657  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 658  */
 659 #define Simple_vFAIL(m) STMT_START {                                    \
 660     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 661             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 662 } STMT_END
 663
 664 /*
 665  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 666  */
 667 #define vFAIL(m) STMT_START {                           \
 668     if (!SIZE_ONLY)                                     \
 669         SAVEFREESV(RExC_rx_sv);                         \
 670     Simple_vFAIL(m);                                    \
 671 } STMT_END
 672
 673 /*
 674  * Like Simple_vFAIL(), but accepts two arguments.
 675  */
 676 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 677     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,              \
 678                       REPORT_LOCATION_ARGS(RExC_parse));        \
 679 } STMT_END
 680
 681 /*
 682  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 683  */
 684 #define vFAIL2(m,a1) STMT_START {                       \
 685     if (!SIZE_ONLY)                                     \
 686         SAVEFREESV(RExC_rx_sv);                         \
 687     Simple_vFAIL2(m, a1);                               \
 688 } STMT_END
 689
 690
 691 /*
 692  * Like Simple_vFAIL(), but accepts three arguments.
 693  */
 694 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 695     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 696             REPORT_LOCATION_ARGS(RExC_parse));                  \
 697 } STMT_END
 698
 699 /*
 700  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 701  */
 702 #define vFAIL3(m,a1,a2) STMT_START {                    \
 703     if (!SIZE_ONLY)                                     \
 704         SAVEFREESV(RExC_rx_sv);                         \
 705     Simple_vFAIL3(m, a1, a2);                           \
 706 } STMT_END
 707
 708 /*
 709  * Like Simple_vFAIL(), but accepts four arguments.
 710  */
 711 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 712     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,      \
 713             REPORT_LOCATION_ARGS(RExC_parse));                  \
 714 } STMT_END
 715
 716 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 717     if (!SIZE_ONLY)                                     \
 718         SAVEFREESV(RExC_rx_sv);                         \
 719     Simple_vFAIL4(m, a1, a2, a3);                       \
 720 } STMT_END
 721
 722 /* A specialized version of vFAIL2 that works with UTF8f */
 723 #define vFAIL2utf8f(m, a1) STMT_START {             \
 724     if (!SIZE_ONLY)                                 \
 725         SAVEFREESV(RExC_rx_sv);                     \
 726     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
 727             REPORT_LOCATION_ARGS(RExC_parse));      \
 728 } STMT_END
 729
 730 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 731     if (!SIZE_ONLY)                                     \
 732         SAVEFREESV(RExC_rx_sv);                         \
 733     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 734             REPORT_LOCATION_ARGS(RExC_parse));          \
 735 } STMT_END
 736
 737 /* These have asserts in them because of [perl #122671] Many warnings in
 738  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 739  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 740  * would get output again.  So they should be output in pass2, and these
 741  * asserts make sure new warnings follow that paradigm. */
 742
 743 /* m is not necessarily a "literal string", in this macro */
 744 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 745     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 746                                        "%s" REPORT_LOCATION,            \
 747                                   m, REPORT_LOCATION_ARGS(loc));        \
 748 } STMT_END
 749
 750 #define ckWARNreg(loc,m) STMT_START {                                   \
 751     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),        \
 752                                           m REPORT_LOCATION,            \
 753                                           REPORT_LOCATION_ARGS(loc));   \
 754 } STMT_END
 755
 756 #define vWARN(loc, m) STMT_START {                                      \
 757     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 758                                        m REPORT_LOCATION,               \
 759                                        REPORT_LOCATION_ARGS(loc));      \
 760 } STMT_END
 761
 762 #define vWARN_dep(loc, m) STMT_START {                                  \
 763     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),       \
 764                                        m REPORT_LOCATION,               \
 765                                        REPORT_LOCATION_ARGS(loc));      \
 766 } STMT_END
 767
 768 #define ckWARNdep(loc,m) STMT_START {                                   \
 769     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),  \
 770                                             m REPORT_LOCATION,          \
 771                                             REPORT_LOCATION_ARGS(loc)); \
 772 } STMT_END
 773
 774 #define ckWARNregdep(loc,m) STMT_START {                                    \
 775     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,      \
 776                                                       WARN_REGEXP),         \
 777                                              m REPORT_LOCATION,             \
 778                                              REPORT_LOCATION_ARGS(loc));    \
 779 } STMT_END
 780
 781 #define ckWARN2reg_d(loc,m, a1) STMT_START {                                \
 782     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),          \
 783                                             m REPORT_LOCATION,              \
 784                                             a1, REPORT_LOCATION_ARGS(loc)); \
 785 } STMT_END
 786
 787 #define ckWARN2reg(loc, m, a1) STMT_START {                                 \
 788     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
 789                                           m REPORT_LOCATION,                \
 790                                           a1, REPORT_LOCATION_ARGS(loc));   \
 791 } STMT_END
 792
 793 #define vWARN3(loc, m, a1, a2) STMT_START {                                 \
 794     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),               \
 795                                        m REPORT_LOCATION,                   \
 796                                        a1, a2, REPORT_LOCATION_ARGS(loc));  \
 797 } STMT_END
 798
 799 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                             \
 800     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
 801                                           m REPORT_LOCATION,                \
 802                                           a1, a2,                           \
 803                                           REPORT_LOCATION_ARGS(loc));       \
 804 } STMT_END
 805
 806 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 807     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 808                                        m REPORT_LOCATION,               \
 809                                        a1, a2, a3,                      \
 810                                        REPORT_LOCATION_ARGS(loc));      \
 811 } STMT_END
 812
 813 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 814     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),        \
 815                                           m REPORT_LOCATION,            \
 816                                           a1, a2, a3,                   \
 817                                           REPORT_LOCATION_ARGS(loc));   \
 818 } STMT_END
 819
 820 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 821     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
 822                                        m REPORT_LOCATION,               \
 823                                        a1, a2, a3, a4,                  \
 824                                        REPORT_LOCATION_ARGS(loc));      \
 825 } STMT_END
 826
 827 /* Macros for recording node offsets.   20001227 mjd@plover.com
 828  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 829  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 830  * Element 0 holds the number n.
 831  * Position is 1 indexed.
 832  */
 833 #ifndef RE_TRACK_PATTERN_OFFSETS
 834 #define Set_Node_Offset_To_R(node,byte)
 835 #define Set_Node_Offset(node,byte)
 836 #define Set_Cur_Node_Offset
 837 #define Set_Node_Length_To_R(node,len)
 838 #define Set_Node_Length(node,len)
 839 #define Set_Node_Cur_Length(node,start)
 840 #define Node_Offset(n)
 841 #define Node_Length(n)
 842 #define Set_Node_Offset_Length(node,offset,len)
 843 #define ProgLen(ri) ri->u.proglen
 844 #define SetProgLen(ri,x) ri->u.proglen = x
 845 #else
 846 #define ProgLen(ri) ri->u.offsets[0]
 847 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 848 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 849     if (! SIZE_ONLY) {                                                  \
 850         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 851                     __LINE__, (int)(node), (int)(byte)));               \
 852         if((node) < 0) {                                                \
 853             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 854                                          (int)(node));                  \
 855         } else {                                                        \
 856             RExC_offsets[2*(node)-1] = (byte);                          \
 857         }                                                               \
 858     }                                                                   \
 859 } STMT_END
 860
 861 #define Set_Node_Offset(node,byte) \
 862     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 863 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 864
 865 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 866     if (! SIZE_ONLY) {                                                  \
 867         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 868                 __LINE__, (int)(node), (int)(len)));                    \
 869         if((node) < 0) {                                                \
 870             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 871                                          (int)(node));                  \
 872         } else {                                                        \
 873             RExC_offsets[2*(node)] = (len);                             \
 874         }                                                               \
 875     }                                                                   \
 876 } STMT_END
 877
 878 #define Set_Node_Length(node,len) \
 879     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 880 #define Set_Node_Cur_Length(node, start)                \
 881     Set_Node_Length(node, RExC_parse - start)
 882
 883 /* Get offsets and lengths */
 884 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 885 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 886
 887 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 888     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 889     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 890 } STMT_END
 891 #endif
 892
 893 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 894 #define EXPERIMENTAL_INPLACESCAN
 895 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 896
 897 #define DEBUG_RExC_seen() \
 898         DEBUG_OPTIMISE_MORE_r({                                             \
 899             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 900                                                                             \
 901             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 902                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 903                                                                             \
 904             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 905                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 906                                                                             \
 907             if (RExC_seen & REG_GPOS_SEEN)                                  \
 908                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 909                                                                             \
 910             if (RExC_seen & REG_RECURSE_SEEN)                               \
 911                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 912                                                                             \
 913             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 914                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 915                                                                             \
 916             if (RExC_seen & REG_VERBARG_SEEN)                               \
 917                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 918                                                                             \
 919             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 920                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 921                                                                             \
 922             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 923                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 924                                                                             \
 925             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 926                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 927                                                                             \
 928             if (RExC_seen & REG_GOSTART_SEEN)                               \
 929                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 930                                                                             \
 931             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 932                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 933                                                                             \
 934             PerlIO_printf(Perl_debug_log,"\n");                             \
 935         });
 936
 937 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 938   if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
 939
 940 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str)                    \
 941     if ( ( flags ) ) {                                                      \
 942         PerlIO_printf(Perl_debug_log, "%s", open_str);                      \
 943         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL);                     \
 944         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL);                     \
 945         DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF);                             \
 946         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR);                            \
 947         DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR);                             \
 948         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL);                           \
 949         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR);                         \
 950         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND);                    \
 951         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR);                     \
 952         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS);                        \
 953         DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS);                \
 954         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY);                      \
 955         DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT);                       \
 956         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY);                \
 957         DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE);                         \
 958         PerlIO_printf(Perl_debug_log, "%s", close_str);                     \
 959     }
 960
 961
 962 #define DEBUG_STUDYDATA(str,data,depth)                              \
 963 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 964     PerlIO_printf(Perl_debug_log,                                    \
 965         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 966         " Flags: 0x%"UVXf,                                           \
 967         (int)(depth)*2, "",                                          \
 968         (IV)((data)->pos_min),                                       \
 969         (IV)((data)->pos_delta),                                     \
 970         (UV)((data)->flags)                                          \
 971     );                                                               \
 972     DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]");                 \
 973     PerlIO_printf(Perl_debug_log,                                    \
 974         " Whilem_c: %"IVdf" Lcp: %"IVdf" %s",                        \
 975         (IV)((data)->whilem_c),                                      \
 976         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 977         is_inf ? "INF " : ""                                         \
 978     );                                                               \
 979     if ((data)->last_found)                                          \
 980         PerlIO_printf(Perl_debug_log,                                \
 981             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 982             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 983             SvPVX_const((data)->last_found),                         \
 984             (IV)((data)->last_end),                                  \
 985             (IV)((data)->last_start_min),                            \
 986             (IV)((data)->last_start_max),                            \
 987             ((data)->longest &&                                      \
 988              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 989             SvPVX_const((data)->longest_fixed),                      \
 990             (IV)((data)->offset_fixed),                              \
 991             ((data)->longest &&                                      \
 992              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 993             SvPVX_const((data)->longest_float),                      \
 994             (IV)((data)->offset_float_min),                          \
 995             (IV)((data)->offset_float_max)                           \
 996         );                                                           \
 997     PerlIO_printf(Perl_debug_log,"\n");                              \
 998 });
 999
1000 /* is c a control character for which we have a mnemonic? */
1001 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1002
1003 STATIC const char *
1004 S_cntrl_to_mnemonic(const U8 c)
1005 {
1006     /* Returns the mnemonic string that represents character 'c', if one
1007      * exists; NULL otherwise.  The only ones that exist for the purposes of
1008      * this routine are a few control characters */
1009
1010     switch (c) {
1011         case '\a':       return "\\a";
1012         case '\b':       return "\\b";
1013         case ESC_NATIVE: return "\\e";
1014         case '\f':       return "\\f";
1015         case '\n':       return "\\n";
1016         case '\r':       return "\\r";
1017         case '\t':       return "\\t";
1018     }
1019
1020     return NULL;
1021 }
1022
1023 /* Mark that we cannot extend a found fixed substring at this point.
1024    Update the longest found anchored substring and the longest found
1025    floating substrings if needed. */
1026
1027 STATIC void
1028 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1029                     SSize_t *minlenp, int is_inf)
1030 {
1031     const STRLEN l = CHR_SVLEN(data->last_found);
1032     const STRLEN old_l = CHR_SVLEN(*data->longest);
1033     GET_RE_DEBUG_FLAGS_DECL;
1034
1035     PERL_ARGS_ASSERT_SCAN_COMMIT;
1036
1037     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1038         SvSetMagicSV(*data->longest, data->last_found);
1039         if (*data->longest == data->longest_fixed) {
1040             data->offset_fixed = l ? data->last_start_min : data->pos_min;
1041             if (data->flags & SF_BEFORE_EOL)
1042                 data->flags
1043                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
1044             else
1045                 data->flags &= ~SF_FIX_BEFORE_EOL;
1046             data->minlen_fixed=minlenp;
1047             data->lookbehind_fixed=0;
1048         }
1049         else { /* *data->longest == data->longest_float */
1050             data->offset_float_min = l ? data->last_start_min : data->pos_min;
1051             data->offset_float_max = (l
1052                           ? data->last_start_max
1053                           : (data->pos_delta > SSize_t_MAX - data->pos_min
1054                                          ? SSize_t_MAX
1055                                          : data->pos_min + data->pos_delta));
1056             if (is_inf
1057                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1058                 data->offset_float_max = SSize_t_MAX;
1059             if (data->flags & SF_BEFORE_EOL)
1060                 data->flags
1061                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1062             else
1063                 data->flags &= ~SF_FL_BEFORE_EOL;
1064             data->minlen_float=minlenp;
1065             data->lookbehind_float=0;
1066         }
1067     }
1068     SvCUR_set(data->last_found, 0);
1069     {
1070         SV * const sv = data->last_found;
1071         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1072             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1073             if (mg)
1074                 mg->mg_len = 0;
1075         }
1076     }
1077     data->last_end = -1;
1078     data->flags &= ~SF_BEFORE_EOL;
1079     DEBUG_STUDYDATA("commit: ",data,0);
1080 }
1081
1082 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1083  * list that describes which code points it matches */
1084
1085 STATIC void
1086 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1087 {
1088     /* Set the SSC 'ssc' to match an empty string or any code point */
1089
1090     PERL_ARGS_ASSERT_SSC_ANYTHING;
1091
1092     assert(is_ANYOF_SYNTHETIC(ssc));
1093
1094     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
1095     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
1096     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1097 }
1098
1099 STATIC int
1100 S_ssc_is_anything(const regnode_ssc *ssc)
1101 {
1102     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1103      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1104      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1105      * in any way, so there's no point in using it */
1106
1107     UV start, end;
1108     bool ret;
1109
1110     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1111
1112     assert(is_ANYOF_SYNTHETIC(ssc));
1113
1114     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1115         return FALSE;
1116     }
1117
1118     /* See if the list consists solely of the range 0 - Infinity */
1119     invlist_iterinit(ssc->invlist);
1120     ret = invlist_iternext(ssc->invlist, &start, &end)
1121           && start == 0
1122           && end == UV_MAX;
1123
1124     invlist_iterfinish(ssc->invlist);
1125
1126     if (ret) {
1127         return TRUE;
1128     }
1129
1130     /* If e.g., both \w and \W are set, matches everything */
1131     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1132         int i;
1133         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1134             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1135                 return TRUE;
1136             }
1137         }
1138     }
1139
1140     return FALSE;
1141 }
1142
1143 STATIC void
1144 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1145 {
1146     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1147      * string, any code point, or any posix class under locale */
1148
1149     PERL_ARGS_ASSERT_SSC_INIT;
1150
1151     Zero(ssc, 1, regnode_ssc);
1152     set_ANYOF_SYNTHETIC(ssc);
1153     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1154     ssc_anything(ssc);
1155
1156     /* If any portion of the regex is to operate under locale rules that aren't
1157      * fully known at compile time, initialization includes it.  The reason
1158      * this isn't done for all regexes is that the optimizer was written under
1159      * the assumption that locale was all-or-nothing.  Given the complexity and
1160      * lack of documentation in the optimizer, and that there are inadequate
1161      * test cases for locale, many parts of it may not work properly, it is
1162      * safest to avoid locale unless necessary. */
1163     if (RExC_contains_locale) {
1164         ANYOF_POSIXL_SETALL(ssc);
1165     }
1166     else {
1167         ANYOF_POSIXL_ZERO(ssc);
1168     }
1169 }
1170
1171 STATIC int
1172 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1173                         const regnode_ssc *ssc)
1174 {
1175     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1176      * to the list of code points matched, and locale posix classes; hence does
1177      * not check its flags) */
1178
1179     UV start, end;
1180     bool ret;
1181
1182     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1183
1184     assert(is_ANYOF_SYNTHETIC(ssc));
1185
1186     invlist_iterinit(ssc->invlist);
1187     ret = invlist_iternext(ssc->invlist, &start, &end)
1188           && start == 0
1189           && end == UV_MAX;
1190
1191     invlist_iterfinish(ssc->invlist);
1192
1193     if (! ret) {
1194         return FALSE;
1195     }
1196
1197     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1198         return FALSE;
1199     }
1200
1201     return TRUE;
1202 }
1203
1204 STATIC SV*
1205 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1206                                const regnode_charclass* const node)
1207 {
1208     /* Returns a mortal inversion list defining which code points are matched
1209      * by 'node', which is of type ANYOF.  Handles complementing the result if
1210      * appropriate.  If some code points aren't knowable at this time, the
1211      * returned list must, and will, contain every code point that is a
1212      * possibility. */
1213
1214     SV* invlist = sv_2mortal(_new_invlist(0));
1215     SV* only_utf8_locale_invlist = NULL;
1216     unsigned int i;
1217     const U32 n = ARG(node);
1218     bool new_node_has_latin1 = FALSE;
1219
1220     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1221
1222     /* Look at the data structure created by S_set_ANYOF_arg() */
1223     if (n != ANYOF_ONLY_HAS_BITMAP) {
1224         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1225         AV * const av = MUTABLE_AV(SvRV(rv));
1226         SV **const ary = AvARRAY(av);
1227         assert(RExC_rxi->data->what[n] == 's');
1228
1229         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1230             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1231         }
1232         else if (ary[0] && ary[0] != &PL_sv_undef) {
1233
1234             /* Here, no compile-time swash, and there are things that won't be
1235              * known until runtime -- we have to assume it could be anything */
1236             return _add_range_to_invlist(invlist, 0, UV_MAX);
1237         }
1238         else if (ary[3] && ary[3] != &PL_sv_undef) {
1239
1240             /* Here no compile-time swash, and no run-time only data.  Use the
1241              * node's inversion list */
1242             invlist = sv_2mortal(invlist_clone(ary[3]));
1243         }
1244
1245         /* Get the code points valid only under UTF-8 locales */
1246         if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1247             && ary[2] && ary[2] != &PL_sv_undef)
1248         {
1249             only_utf8_locale_invlist = ary[2];
1250         }
1251     }
1252
1253     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1254      * code points, and an inversion list for the others, but if there are code
1255      * points that should match only conditionally on the target string being
1256      * UTF-8, those are placed in the inversion list, and not the bitmap.
1257      * Since there are circumstances under which they could match, they are
1258      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1259      * to exclude them here, so that when we invert below, the end result
1260      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1261      * have to do this here before we add the unconditionally matched code
1262      * points */
1263     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1264         _invlist_intersection_complement_2nd(invlist,
1265                                              PL_UpperLatin1,
1266                                              &invlist);
1267     }
1268
1269     /* Add in the points from the bit map */
1270     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1271         if (ANYOF_BITMAP_TEST(node, i)) {
1272             invlist = add_cp_to_invlist(invlist, i);
1273             new_node_has_latin1 = TRUE;
1274         }
1275     }
1276
1277     /* If this can match all upper Latin1 code points, have to add them
1278      * as well */
1279     if (OP(node) == ANYOFD
1280         && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1281     {
1282         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1283     }
1284
1285     /* Similarly for these */
1286     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1287         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1288     }
1289
1290     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1291         _invlist_invert(invlist);
1292     }
1293     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1294
1295         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1296          * locale.  We can skip this if there are no 0-255 at all. */
1297         _invlist_union(invlist, PL_Latin1, &invlist);
1298     }
1299
1300     /* Similarly add the UTF-8 locale possible matches.  These have to be
1301      * deferred until after the non-UTF-8 locale ones are taken care of just
1302      * above, or it leads to wrong results under ANYOF_INVERT */
1303     if (only_utf8_locale_invlist) {
1304         _invlist_union_maybe_complement_2nd(invlist,
1305                                             only_utf8_locale_invlist,
1306                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1307                                             &invlist);
1308     }
1309
1310     return invlist;
1311 }
1312
1313 /* These two functions currently do the exact same thing */
1314 #define ssc_init_zero           ssc_init
1315
1316 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1317 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1318
1319 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1320  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1321  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1322
1323 STATIC void
1324 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1325                 const regnode_charclass *and_with)
1326 {
1327     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1328      * another SSC or a regular ANYOF class.  Can create false positives. */
1329
1330     SV* anded_cp_list;
1331     U8  anded_flags;
1332
1333     PERL_ARGS_ASSERT_SSC_AND;
1334
1335     assert(is_ANYOF_SYNTHETIC(ssc));
1336
1337     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1338      * the code point inversion list and just the relevant flags */
1339     if (is_ANYOF_SYNTHETIC(and_with)) {
1340         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1341         anded_flags = ANYOF_FLAGS(and_with);
1342
1343         /* XXX This is a kludge around what appears to be deficiencies in the
1344          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1345          * there are paths through the optimizer where it doesn't get weeded
1346          * out when it should.  And if we don't make some extra provision for
1347          * it like the code just below, it doesn't get added when it should.
1348          * This solution is to add it only when AND'ing, which is here, and
1349          * only when what is being AND'ed is the pristine, original node
1350          * matching anything.  Thus it is like adding it to ssc_anything() but
1351          * only when the result is to be AND'ed.  Probably the same solution
1352          * could be adopted for the same problem we have with /l matching,
1353          * which is solved differently in S_ssc_init(), and that would lead to
1354          * fewer false positives than that solution has.  But if this solution
1355          * creates bugs, the consequences are only that a warning isn't raised
1356          * that should be; while the consequences for having /l bugs is
1357          * incorrect matches */
1358         if (ssc_is_anything((regnode_ssc *)and_with)) {
1359             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1360         }
1361     }
1362     else {
1363         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1364         if (OP(and_with) == ANYOFD) {
1365             anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1366         }
1367         else {
1368             anded_flags = ANYOF_FLAGS(and_with)
1369             &( ANYOF_COMMON_FLAGS
1370               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1371               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1372             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1373                 anded_flags &=
1374                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1375             }
1376         }
1377     }
1378
1379     ANYOF_FLAGS(ssc) &= anded_flags;
1380
1381     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1382      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1383      * 'and_with' may be inverted.  When not inverted, we have the situation of
1384      * computing:
1385      *  (C1 | P1) & (C2 | P2)
1386      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1387      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1388      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1389      *                    <=  ((C1 & C2) | P1 | P2)
1390      * Alternatively, the last few steps could be:
1391      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1392      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1393      *                    <=  (C1 | C2 | (P1 & P2))
1394      * We favor the second approach if either P1 or P2 is non-empty.  This is
1395      * because these components are a barrier to doing optimizations, as what
1396      * they match cannot be known until the moment of matching as they are
1397      * dependent on the current locale, 'AND"ing them likely will reduce or
1398      * eliminate them.
1399      * But we can do better if we know that C1,P1 are in their initial state (a
1400      * frequent occurrence), each matching everything:
1401      *  (<everything>) & (C2 | P2) =  C2 | P2
1402      * Similarly, if C2,P2 are in their initial state (again a frequent
1403      * occurrence), the result is a no-op
1404      *  (C1 | P1) & (<everything>) =  C1 | P1
1405      *
1406      * Inverted, we have
1407      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1408      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1409      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1410      * */
1411
1412     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1413         && ! is_ANYOF_SYNTHETIC(and_with))
1414     {
1415         unsigned int i;
1416
1417         ssc_intersection(ssc,
1418                          anded_cp_list,
1419                          FALSE /* Has already been inverted */
1420                          );
1421
1422         /* If either P1 or P2 is empty, the intersection will be also; can skip
1423          * the loop */
1424         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1425             ANYOF_POSIXL_ZERO(ssc);
1426         }
1427         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1428
1429             /* Note that the Posix class component P from 'and_with' actually
1430              * looks like:
1431              *      P = Pa | Pb | ... | Pn
1432              * where each component is one posix class, such as in [\w\s].
1433              * Thus
1434              *      ~P = ~(Pa | Pb | ... | Pn)
1435              *         = ~Pa & ~Pb & ... & ~Pn
1436              *        <= ~Pa | ~Pb | ... | ~Pn
1437              * The last is something we can easily calculate, but unfortunately
1438              * is likely to have many false positives.  We could do better
1439              * in some (but certainly not all) instances if two classes in
1440              * P have known relationships.  For example
1441              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1442              * So
1443              *      :lower: & :print: = :lower:
1444              * And similarly for classes that must be disjoint.  For example,
1445              * since \s and \w can have no elements in common based on rules in
1446              * the POSIX standard,
1447              *      \w & ^\S = nothing
1448              * Unfortunately, some vendor locales do not meet the Posix
1449              * standard, in particular almost everything by Microsoft.
1450              * The loop below just changes e.g., \w into \W and vice versa */
1451
1452             regnode_charclass_posixl temp;
1453             int add = 1;    /* To calculate the index of the complement */
1454
1455             ANYOF_POSIXL_ZERO(&temp);
1456             for (i = 0; i < ANYOF_MAX; i++) {
1457                 assert(i % 2 != 0
1458                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1459                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1460
1461                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1462                     ANYOF_POSIXL_SET(&temp, i + add);
1463                 }
1464                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1465             }
1466             ANYOF_POSIXL_AND(&temp, ssc);
1467
1468         } /* else ssc already has no posixes */
1469     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1470          in its initial state */
1471     else if (! is_ANYOF_SYNTHETIC(and_with)
1472              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1473     {
1474         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1475          * copy it over 'ssc' */
1476         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1477             if (is_ANYOF_SYNTHETIC(and_with)) {
1478                 StructCopy(and_with, ssc, regnode_ssc);
1479             }
1480             else {
1481                 ssc->invlist = anded_cp_list;
1482                 ANYOF_POSIXL_ZERO(ssc);
1483                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1484                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1485                 }
1486             }
1487         }
1488         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1489                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1490         {
1491             /* One or the other of P1, P2 is non-empty. */
1492             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1493                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1494             }
1495             ssc_union(ssc, anded_cp_list, FALSE);
1496         }
1497         else { /* P1 = P2 = empty */
1498             ssc_intersection(ssc, anded_cp_list, FALSE);
1499         }
1500     }
1501 }
1502
1503 STATIC void
1504 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1505                const regnode_charclass *or_with)
1506 {
1507     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1508      * another SSC or a regular ANYOF class.  Can create false positives if
1509      * 'or_with' is to be inverted. */
1510
1511     SV* ored_cp_list;
1512     U8 ored_flags;
1513
1514     PERL_ARGS_ASSERT_SSC_OR;
1515
1516     assert(is_ANYOF_SYNTHETIC(ssc));
1517
1518     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1519      * the code point inversion list and just the relevant flags */
1520     if (is_ANYOF_SYNTHETIC(or_with)) {
1521         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1522         ored_flags = ANYOF_FLAGS(or_with);
1523     }
1524     else {
1525         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1526         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1527         if (OP(or_with) != ANYOFD) {
1528             ored_flags
1529             |= ANYOF_FLAGS(or_with)
1530              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1531                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1532             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1533                 ored_flags |=
1534                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1535             }
1536         }
1537     }
1538
1539     ANYOF_FLAGS(ssc) |= ored_flags;
1540
1541     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1542      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1543      * 'or_with' may be inverted.  When not inverted, we have the simple
1544      * situation of computing:
1545      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1546      * If P1|P2 yields a situation with both a class and its complement are
1547      * set, like having both \w and \W, this matches all code points, and we
1548      * can delete these from the P component of the ssc going forward.  XXX We
1549      * might be able to delete all the P components, but I (khw) am not certain
1550      * about this, and it is better to be safe.
1551      *
1552      * Inverted, we have
1553      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1554      *                         <=  (C1 | P1) | ~C2
1555      *                         <=  (C1 | ~C2) | P1
1556      * (which results in actually simpler code than the non-inverted case)
1557      * */
1558
1559     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1560         && ! is_ANYOF_SYNTHETIC(or_with))
1561     {
1562         /* We ignore P2, leaving P1 going forward */
1563     }   /* else  Not inverted */
1564     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1565         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1566         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1567             unsigned int i;
1568             for (i = 0; i < ANYOF_MAX; i += 2) {
1569                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1570                 {
1571                     ssc_match_all_cp(ssc);
1572                     ANYOF_POSIXL_CLEAR(ssc, i);
1573                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1574                 }
1575             }
1576         }
1577     }
1578
1579     ssc_union(ssc,
1580               ored_cp_list,
1581               FALSE /* Already has been inverted */
1582               );
1583 }
1584
1585 PERL_STATIC_INLINE void
1586 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1587 {
1588     PERL_ARGS_ASSERT_SSC_UNION;
1589
1590     assert(is_ANYOF_SYNTHETIC(ssc));
1591
1592     _invlist_union_maybe_complement_2nd(ssc->invlist,
1593                                         invlist,
1594                                         invert2nd,
1595                                         &ssc->invlist);
1596 }
1597
1598 PERL_STATIC_INLINE void
1599 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1600                          SV* const invlist,
1601                          const bool invert2nd)
1602 {
1603     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1604
1605     assert(is_ANYOF_SYNTHETIC(ssc));
1606
1607     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1608                                                invlist,
1609                                                invert2nd,
1610                                                &ssc->invlist);
1611 }
1612
1613 PERL_STATIC_INLINE void
1614 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1615 {
1616     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1617
1618     assert(is_ANYOF_SYNTHETIC(ssc));
1619
1620     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1621 }
1622
1623 PERL_STATIC_INLINE void
1624 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1625 {
1626     /* AND just the single code point 'cp' into the SSC 'ssc' */
1627
1628     SV* cp_list = _new_invlist(2);
1629
1630     PERL_ARGS_ASSERT_SSC_CP_AND;
1631
1632     assert(is_ANYOF_SYNTHETIC(ssc));
1633
1634     cp_list = add_cp_to_invlist(cp_list, cp);
1635     ssc_intersection(ssc, cp_list,
1636                      FALSE /* Not inverted */
1637                      );
1638     SvREFCNT_dec_NN(cp_list);
1639 }
1640
1641 PERL_STATIC_INLINE void
1642 S_ssc_clear_locale(regnode_ssc *ssc)
1643 {
1644     /* Set the SSC 'ssc' to not match any locale things */
1645     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1646
1647     assert(is_ANYOF_SYNTHETIC(ssc));
1648
1649     ANYOF_POSIXL_ZERO(ssc);
1650     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1651 }
1652
1653 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1654
1655 STATIC bool
1656 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1657 {
1658     /* The synthetic start class is used to hopefully quickly winnow down
1659      * places where a pattern could start a match in the target string.  If it
1660      * doesn't really narrow things down that much, there isn't much point to
1661      * having the overhead of using it.  This function uses some very crude
1662      * heuristics to decide if to use the ssc or not.
1663      *
1664      * It returns TRUE if 'ssc' rules out more than half what it considers to
1665      * be the "likely" possible matches, but of course it doesn't know what the
1666      * actual things being matched are going to be; these are only guesses
1667      *
1668      * For /l matches, it assumes that the only likely matches are going to be
1669      *      in the 0-255 range, uniformly distributed, so half of that is 127
1670      * For /a and /d matches, it assumes that the likely matches will be just
1671      *      the ASCII range, so half of that is 63
1672      * For /u and there isn't anything matching above the Latin1 range, it
1673      *      assumes that that is the only range likely to be matched, and uses
1674      *      half that as the cut-off: 127.  If anything matches above Latin1,
1675      *      it assumes that all of Unicode could match (uniformly), except for
1676      *      non-Unicode code points and things in the General Category "Other"
1677      *      (unassigned, private use, surrogates, controls and formats).  This
1678      *      is a much large number. */
1679
1680     const U32 max_match = (LOC)
1681                           ? 127
1682                           : (! UNI_SEMANTICS)
1683                             ? 63
1684                             : (invlist_highest(ssc->invlist) < 256)
1685                               ? 127
1686                               : ((NON_OTHER_COUNT + 1) / 2) - 1;
1687     U32 count = 0;      /* Running total of number of code points matched by
1688                            'ssc' */
1689     UV start, end;      /* Start and end points of current range in inversion
1690                            list */
1691
1692     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1693
1694     invlist_iterinit(ssc->invlist);
1695     while (invlist_iternext(ssc->invlist, &start, &end)) {
1696
1697         /* /u is the only thing that we expect to match above 255; so if not /u
1698          * and even if there are matches above 255, ignore them.  This catches
1699          * things like \d under /d which does match the digits above 255, but
1700          * since the pattern is /d, it is not likely to be expecting them */
1701         if (! UNI_SEMANTICS) {
1702             if (start > 255) {
1703                 break;
1704             }
1705             end = MIN(end, 255);
1706         }
1707         count += end - start + 1;
1708         if (count > max_match) {
1709             invlist_iterfinish(ssc->invlist);
1710             return FALSE;
1711         }
1712     }
1713
1714     return TRUE;
1715 }
1716
1717
1718 STATIC void
1719 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1720 {
1721     /* The inversion list in the SSC is marked mortal; now we need a more
1722      * permanent copy, which is stored the same way that is done in a regular
1723      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1724      * map */
1725
1726     SV* invlist = invlist_clone(ssc->invlist);
1727
1728     PERL_ARGS_ASSERT_SSC_FINALIZE;
1729
1730     assert(is_ANYOF_SYNTHETIC(ssc));
1731
1732     /* The code in this file assumes that all but these flags aren't relevant
1733      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1734      * by the time we reach here */
1735     assert(! (ANYOF_FLAGS(ssc)
1736         & ~( ANYOF_COMMON_FLAGS
1737             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1738             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
1739
1740     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1741
1742     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1743                                 NULL, NULL, NULL, FALSE);
1744
1745     /* Make sure is clone-safe */
1746     ssc->invlist = NULL;
1747
1748     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1749         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1750     }
1751
1752     if (RExC_contains_locale) {
1753         OP(ssc) = ANYOFL;
1754     }
1755
1756     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1757 }
1758
1759 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1760 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1761 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1762 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1763                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1764                                : 0 )
1765
1766
1767 #ifdef DEBUGGING
1768 /*
1769    dump_trie(trie,widecharmap,revcharmap)
1770    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1771    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1772
1773    These routines dump out a trie in a somewhat readable format.
1774    The _interim_ variants are used for debugging the interim
1775    tables that are used to generate the final compressed
1776    representation which is what dump_trie expects.
1777
1778    Part of the reason for their existence is to provide a form
1779    of documentation as to how the different representations function.
1780
1781 */
1782
1783 /*
1784   Dumps the final compressed table form of the trie to Perl_debug_log.
1785   Used for debugging make_trie().
1786 */
1787
1788 STATIC void
1789 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1790             AV *revcharmap, U32 depth)
1791 {
1792     U32 state;
1793     SV *sv=sv_newmortal();
1794     int colwidth= widecharmap ? 6 : 4;
1795     U16 word;
1796     GET_RE_DEBUG_FLAGS_DECL;
1797
1798     PERL_ARGS_ASSERT_DUMP_TRIE;
1799
1800     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1801         (int)depth * 2 + 2,"",
1802         "Match","Base","Ofs" );
1803
1804     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1805         SV ** const tmp = av_fetch( revcharmap, state, 0);
1806         if ( tmp ) {
1807             PerlIO_printf( Perl_debug_log, "%*s",
1808                 colwidth,
1809                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1810                             PL_colors[0], PL_colors[1],
1811                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1812                             PERL_PV_ESCAPE_FIRSTCHAR
1813                 )
1814             );
1815         }
1816     }
1817     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1818         (int)depth * 2 + 2,"");
1819
1820     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1821         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1822     PerlIO_printf( Perl_debug_log, "\n");
1823
1824     for( state = 1 ; state < trie->statecount ; state++ ) {
1825         const U32 base = trie->states[ state ].trans.base;
1826
1827         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1828                                        (int)depth * 2 + 2,"", (UV)state);
1829
1830         if ( trie->states[ state ].wordnum ) {
1831             PerlIO_printf( Perl_debug_log, " W%4X",
1832                                            trie->states[ state ].wordnum );
1833         } else {
1834             PerlIO_printf( Perl_debug_log, "%6s", "" );
1835         }
1836
1837         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1838
1839         if ( base ) {
1840             U32 ofs = 0;
1841
1842             while( ( base + ofs  < trie->uniquecharcount ) ||
1843                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1844                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1845                                                                     != state))
1846                     ofs++;
1847
1848             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1849
1850             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1851                 if ( ( base + ofs >= trie->uniquecharcount )
1852                         && ( base + ofs - trie->uniquecharcount
1853                                                         < trie->lasttrans )
1854                         && trie->trans[ base + ofs
1855                                     - trie->uniquecharcount ].check == state )
1856                 {
1857                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1858                     colwidth,
1859                     (UV)trie->trans[ base + ofs
1860                                              - trie->uniquecharcount ].next );
1861                 } else {
1862                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1863                 }
1864             }
1865
1866             PerlIO_printf( Perl_debug_log, "]");
1867
1868         }
1869         PerlIO_printf( Perl_debug_log, "\n" );
1870     }
1871     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1872                                 (int)depth*2, "");
1873     for (word=1; word <= trie->wordcount; word++) {
1874         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1875             (int)word, (int)(trie->wordinfo[word].prev),
1876             (int)(trie->wordinfo[word].len));
1877     }
1878     PerlIO_printf(Perl_debug_log, "\n" );
1879 }
1880 /*
1881   Dumps a fully constructed but uncompressed trie in list form.
1882   List tries normally only are used for construction when the number of
1883   possible chars (trie->uniquecharcount) is very high.
1884   Used for debugging make_trie().
1885 */
1886 STATIC void
1887 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1888                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1889                          U32 depth)
1890 {
1891     U32 state;
1892     SV *sv=sv_newmortal();
1893     int colwidth= widecharmap ? 6 : 4;
1894     GET_RE_DEBUG_FLAGS_DECL;
1895
1896     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1897
1898     /* print out the table precompression.  */
1899     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1900         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1901         "------:-----+-----------------\n" );
1902
1903     for( state=1 ; state < next_alloc ; state ++ ) {
1904         U16 charid;
1905
1906         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1907             (int)depth * 2 + 2,"", (UV)state  );
1908         if ( ! trie->states[ state ].wordnum ) {
1909             PerlIO_printf( Perl_debug_log, "%5s| ","");
1910         } else {
1911             PerlIO_printf( Perl_debug_log, "W%4x| ",
1912                 trie->states[ state ].wordnum
1913             );
1914         }
1915         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1916             SV ** const tmp = av_fetch( revcharmap,
1917                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1918             if ( tmp ) {
1919                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1920                     colwidth,
1921                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1922                               colwidth,
1923                               PL_colors[0], PL_colors[1],
1924                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1925                               | PERL_PV_ESCAPE_FIRSTCHAR
1926                     ) ,
1927                     TRIE_LIST_ITEM(state,charid).forid,
1928                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1929                 );
1930                 if (!(charid % 10))
1931                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1932                         (int)((depth * 2) + 14), "");
1933             }
1934         }
1935         PerlIO_printf( Perl_debug_log, "\n");
1936     }
1937 }
1938
1939 /*
1940   Dumps a fully constructed but uncompressed trie in table form.
1941   This is the normal DFA style state transition table, with a few
1942   twists to facilitate compression later.
1943   Used for debugging make_trie().
1944 */
1945 STATIC void
1946 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1947                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1948                           U32 depth)
1949 {
1950     U32 state;
1951     U16 charid;
1952     SV *sv=sv_newmortal();
1953     int colwidth= widecharmap ? 6 : 4;
1954     GET_RE_DEBUG_FLAGS_DECL;
1955
1956     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1957
1958     /*
1959        print out the table precompression so that we can do a visual check
1960        that they are identical.
1961      */
1962
1963     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1964
1965     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1966         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1967         if ( tmp ) {
1968             PerlIO_printf( Perl_debug_log, "%*s",
1969                 colwidth,
1970                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1971                             PL_colors[0], PL_colors[1],
1972                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1973                             PERL_PV_ESCAPE_FIRSTCHAR
1974                 )
1975             );
1976         }
1977     }
1978
1979     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1980
1981     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1982         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1983     }
1984
1985     PerlIO_printf( Perl_debug_log, "\n" );
1986
1987     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1988
1989         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1990             (int)depth * 2 + 2,"",
1991             (UV)TRIE_NODENUM( state ) );
1992
1993         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1994             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1995             if (v)
1996                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1997             else
1998                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1999         }
2000         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2001             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
2002                                             (UV)trie->trans[ state ].check );
2003         } else {
2004             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
2005                                             (UV)trie->trans[ state ].check,
2006             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2007         }
2008     }
2009 }
2010
2011 #endif
2012
2013
2014 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2015   startbranch: the first branch in the whole branch sequence
2016   first      : start branch of sequence of branch-exact nodes.
2017                May be the same as startbranch
2018   last       : Thing following the last branch.
2019                May be the same as tail.
2020   tail       : item following the branch sequence
2021   count      : words in the sequence
2022   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2023   depth      : indent depth
2024
2025 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2026
2027 A trie is an N'ary tree where the branches are determined by digital
2028 decomposition of the key. IE, at the root node you look up the 1st character and
2029 follow that branch repeat until you find the end of the branches. Nodes can be
2030 marked as "accepting" meaning they represent a complete word. Eg:
2031
2032   /he|she|his|hers/
2033
2034 would convert into the following structure. Numbers represent states, letters
2035 following numbers represent valid transitions on the letter from that state, if
2036 the number is in square brackets it represents an accepting state, otherwise it
2037 will be in parenthesis.
2038
2039       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2040       |    |
2041       |   (2)
2042       |    |
2043      (1)   +-i->(6)-+-s->[7]
2044       |
2045       +-s->(3)-+-h->(4)-+-e->[5]
2046
2047       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2048
2049 This shows that when matching against the string 'hers' we will begin at state 1
2050 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2051 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2052 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2053 single traverse. We store a mapping from accepting to state to which word was
2054 matched, and then when we have multiple possibilities we try to complete the
2055 rest of the regex in the order in which they occurred in the alternation.
2056
2057 The only prior NFA like behaviour that would be changed by the TRIE support is
2058 the silent ignoring of duplicate alternations which are of the form:
2059
2060  / (DUPE|DUPE) X? (?{ ... }) Y /x
2061
2062 Thus EVAL blocks following a trie may be called a different number of times with
2063 and without the optimisation. With the optimisations dupes will be silently
2064 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2065 the following demonstrates:
2066
2067  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2068
2069 which prints out 'word' three times, but
2070
2071  'words'=~/(word|word|word)(?{ print $1 })S/
2072
2073 which doesnt print it out at all. This is due to other optimisations kicking in.
2074
2075 Example of what happens on a structural level:
2076
2077 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2078
2079    1: CURLYM[1] {1,32767}(18)
2080    5:   BRANCH(8)
2081    6:     EXACT <ac>(16)
2082    8:   BRANCH(11)
2083    9:     EXACT <ad>(16)
2084   11:   BRANCH(14)
2085   12:     EXACT <ab>(16)
2086   16:   SUCCEED(0)
2087   17:   NOTHING(18)
2088   18: END(0)
2089
2090 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2091 and should turn into:
2092
2093    1: CURLYM[1] {1,32767}(18)
2094    5:   TRIE(16)
2095         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2096           <ac>
2097           <ad>
2098           <ab>
2099   16:   SUCCEED(0)
2100   17:   NOTHING(18)
2101   18: END(0)
2102
2103 Cases where tail != last would be like /(?foo|bar)baz/:
2104
2105    1: BRANCH(4)
2106    2:   EXACT <foo>(8)
2107    4: BRANCH(7)
2108    5:   EXACT <bar>(8)
2109    7: TAIL(8)
2110    8: EXACT <baz>(10)
2111   10: END(0)
2112
2113 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2114 and would end up looking like:
2115
2116     1: TRIE(8)
2117       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2118         <foo>
2119         <bar>
2120    7: TAIL(8)
2121    8: EXACT <baz>(10)
2122   10: END(0)
2123
2124     d = uvchr_to_utf8_flags(d, uv, 0);
2125
2126 is the recommended Unicode-aware way of saying
2127
2128     *(d++) = uv;
2129 */
2130
2131 #define TRIE_STORE_REVCHAR(val)                                            \
2132     STMT_START {                                                           \
2133         if (UTF) {                                                         \
2134             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2135             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2136             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2137             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2138             SvPOK_on(zlopp);                                               \
2139             SvUTF8_on(zlopp);                                              \
2140             av_push(revcharmap, zlopp);                                    \
2141         } else {                                                           \
2142             char ooooff = (char)val;                                           \
2143             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2144         }                                                                  \
2145         } STMT_END
2146
2147 /* This gets the next character from the input, folding it if not already
2148  * folded. */
2149 #define TRIE_READ_CHAR STMT_START {                                           \
2150     wordlen++;                                                                \
2151     if ( UTF ) {                                                              \
2152         /* if it is UTF then it is either already folded, or does not need    \
2153          * folding */                                                         \
2154         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2155     }                                                                         \
2156     else if (folder == PL_fold_latin1) {                                      \
2157         /* This folder implies Unicode rules, which in the range expressible  \
2158          *  by not UTF is the lower case, with the two exceptions, one of     \
2159          *  which should have been taken care of before calling this */       \
2160         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2161         uvc = toLOWER_L1(*uc);                                                \
2162         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2163         len = 1;                                                              \
2164     } else {                                                                  \
2165         /* raw data, will be folded later if needed */                        \
2166         uvc = (U32)*uc;                                                       \
2167         len = 1;                                                              \
2168     }                                                                         \
2169 } STMT_END
2170
2171
2172
2173 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2174     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2175         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
2176         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2177     }                                                           \
2178     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2179     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2180     TRIE_LIST_CUR( state )++;                                   \
2181 } STMT_END
2182
2183 #define TRIE_LIST_NEW(state) STMT_START {                       \
2184     Newxz( trie->states[ state ].trans.list,               \
2185         4, reg_trie_trans_le );                                 \
2186      TRIE_LIST_CUR( state ) = 1;                                \
2187      TRIE_LIST_LEN( state ) = 4;                                \
2188 } STMT_END
2189
2190 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2191     U16 dupe= trie->states[ state ].wordnum;                    \
2192     regnode * const noper_next = regnext( noper );              \
2193                                                                 \
2194     DEBUG_r({                                                   \
2195         /* store the word for dumping */                        \
2196         SV* tmp;                                                \
2197         if (OP(noper) != NOTHING)                               \
2198             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2199         else                                                    \
2200             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2201         av_push( trie_words, tmp );                             \
2202     });                                                         \
2203                                                                 \
2204     curword++;                                                  \
2205     trie->wordinfo[curword].prev   = 0;                         \
2206     trie->wordinfo[curword].len    = wordlen;                   \
2207     trie->wordinfo[curword].accept = state;                     \
2208                                                                 \
2209     if ( noper_next < tail ) {                                  \
2210         if (!trie->jump)                                        \
2211             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2212                                                  sizeof(U16) ); \
2213         trie->jump[curword] = (U16)(noper_next - convert);      \
2214         if (!jumper)                                            \
2215             jumper = noper_next;                                \
2216         if (!nextbranch)                                        \
2217             nextbranch= regnext(cur);                           \
2218     }                                                           \
2219                                                                 \
2220     if ( dupe ) {                                               \
2221         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2222         /* chain, so that when the bits of chain are later    */\
2223         /* linked together, the dups appear in the chain      */\
2224         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2225         trie->wordinfo[dupe].prev = curword;                    \
2226     } else {                                                    \
2227         /* we haven't inserted this word yet.                */ \
2228         trie->states[ state ].wordnum = curword;                \
2229     }                                                           \
2230 } STMT_END
2231
2232
2233 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2234      ( ( base + charid >=  ucharcount                                   \
2235          && base + charid < ubound                                      \
2236          && state == trie->trans[ base - ucharcount + charid ].check    \
2237          && trie->trans[ base - ucharcount + charid ].next )            \
2238            ? trie->trans[ base - ucharcount + charid ].next             \
2239            : ( state==1 ? special : 0 )                                 \
2240       )
2241
2242 #define MADE_TRIE       1
2243 #define MADE_JUMP_TRIE  2
2244 #define MADE_EXACT_TRIE 4
2245
2246 STATIC I32
2247 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2248                   regnode *first, regnode *last, regnode *tail,
2249                   U32 word_count, U32 flags, U32 depth)
2250 {
2251     /* first pass, loop through and scan words */
2252     reg_trie_data *trie;
2253     HV *widecharmap = NULL;
2254     AV *revcharmap = newAV();
2255     regnode *cur;
2256     STRLEN len = 0;
2257     UV uvc = 0;
2258     U16 curword = 0;
2259     U32 next_alloc = 0;
2260     regnode *jumper = NULL;
2261     regnode *nextbranch = NULL;
2262     regnode *convert = NULL;
2263     U32 *prev_states; /* temp array mapping each state to previous one */
2264     /* we just use folder as a flag in utf8 */
2265     const U8 * folder = NULL;
2266
2267 #ifdef DEBUGGING
2268     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2269     AV *trie_words = NULL;
2270     /* along with revcharmap, this only used during construction but both are
2271      * useful during debugging so we store them in the struct when debugging.
2272      */
2273 #else
2274     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2275     STRLEN trie_charcount=0;
2276 #endif
2277     SV *re_trie_maxbuff;
2278     GET_RE_DEBUG_FLAGS_DECL;
2279
2280     PERL_ARGS_ASSERT_MAKE_TRIE;
2281 #ifndef DEBUGGING
2282     PERL_UNUSED_ARG(depth);
2283 #endif
2284
2285     switch (flags) {
2286         case EXACT: case EXACTL: break;
2287         case EXACTFA:
2288         case EXACTFU_SS:
2289         case EXACTFU:
2290         case EXACTFLU8: folder = PL_fold_latin1; break;
2291         case EXACTF:  folder = PL_fold; break;
2292         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2293     }
2294
2295     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2296     trie->refcount = 1;
2297     trie->startstate = 1;
2298     trie->wordcount = word_count;
2299     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2300     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2301     if (flags == EXACT || flags == EXACTL)
2302         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2303     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2304                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2305
2306     DEBUG_r({
2307         trie_words = newAV();
2308     });
2309
2310     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2311     assert(re_trie_maxbuff);
2312     if (!SvIOK(re_trie_maxbuff)) {
2313         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2314     }
2315     DEBUG_TRIE_COMPILE_r({
2316         PerlIO_printf( Perl_debug_log,
2317           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2318           (int)depth * 2 + 2, "",
2319           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2320           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2321     });
2322
2323    /* Find the node we are going to overwrite */
2324     if ( first == startbranch && OP( last ) != BRANCH ) {
2325         /* whole branch chain */
2326         convert = first;
2327     } else {
2328         /* branch sub-chain */
2329         convert = NEXTOPER( first );
2330     }
2331
2332     /*  -- First loop and Setup --
2333
2334        We first traverse the branches and scan each word to determine if it
2335        contains widechars, and how many unique chars there are, this is
2336        important as we have to build a table with at least as many columns as we
2337        have unique chars.
2338
2339        We use an array of integers to represent the character codes 0..255
2340        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2341        the native representation of the character value as the key and IV's for
2342        the coded index.
2343
2344        *TODO* If we keep track of how many times each character is used we can
2345        remap the columns so that the table compression later on is more
2346        efficient in terms of memory by ensuring the most common value is in the
2347        middle and the least common are on the outside.  IMO this would be better
2348        than a most to least common mapping as theres a decent chance the most
2349        common letter will share a node with the least common, meaning the node
2350        will not be compressible. With a middle is most common approach the worst
2351        case is when we have the least common nodes twice.
2352
2353      */
2354
2355     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2356         regnode *noper = NEXTOPER( cur );
2357         const U8 *uc = (U8*)STRING( noper );
2358         const U8 *e  = uc + STR_LEN( noper );
2359         int foldlen = 0;
2360         U32 wordlen      = 0;         /* required init */
2361         STRLEN minchars = 0;
2362         STRLEN maxchars = 0;
2363         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2364                                                bitmap?*/
2365
2366         if (OP(noper) == NOTHING) {
2367             regnode *noper_next= regnext(noper);
2368             if (noper_next != tail && OP(noper_next) == flags) {
2369                 noper = noper_next;
2370                 uc= (U8*)STRING(noper);
2371                 e= uc + STR_LEN(noper);
2372                 trie->minlen= STR_LEN(noper);
2373             } else {
2374                 trie->minlen= 0;
2375                 continue;
2376             }
2377         }
2378
2379         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2380             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2381                                           regardless of encoding */
2382             if (OP( noper ) == EXACTFU_SS) {
2383                 /* false positives are ok, so just set this */
2384                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2385             }
2386         }
2387         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2388                                            branch */
2389             TRIE_CHARCOUNT(trie)++;
2390             TRIE_READ_CHAR;
2391
2392             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2393              * is in effect.  Under /i, this character can match itself, or
2394              * anything that folds to it.  If not under /i, it can match just
2395              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2396              * all fold to k, and all are single characters.   But some folds
2397              * expand to more than one character, so for example LATIN SMALL
2398              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2399              * the string beginning at 'uc' is 'ffi', it could be matched by
2400              * three characters, or just by the one ligature character. (It
2401              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2402              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2403              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2404              * match.)  The trie needs to know the minimum and maximum number
2405              * of characters that could match so that it can use size alone to
2406              * quickly reject many match attempts.  The max is simple: it is
2407              * the number of folded characters in this branch (since a fold is
2408              * never shorter than what folds to it. */
2409
2410             maxchars++;
2411
2412             /* And the min is equal to the max if not under /i (indicated by
2413              * 'folder' being NULL), or there are no multi-character folds.  If
2414              * there is a multi-character fold, the min is incremented just
2415              * once, for the character that folds to the sequence.  Each
2416              * character in the sequence needs to be added to the list below of
2417              * characters in the trie, but we count only the first towards the
2418              * min number of characters needed.  This is done through the
2419              * variable 'foldlen', which is returned by the macros that look
2420              * for these sequences as the number of bytes the sequence
2421              * occupies.  Each time through the loop, we decrement 'foldlen' by
2422              * how many bytes the current char occupies.  Only when it reaches
2423              * 0 do we increment 'minchars' or look for another multi-character
2424              * sequence. */
2425             if (folder == NULL) {
2426                 minchars++;
2427             }
2428             else if (foldlen > 0) {
2429                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2430             }
2431             else {
2432                 minchars++;
2433
2434                 /* See if *uc is the beginning of a multi-character fold.  If
2435                  * so, we decrement the length remaining to look at, to account
2436                  * for the current character this iteration.  (We can use 'uc'
2437                  * instead of the fold returned by TRIE_READ_CHAR because for
2438                  * non-UTF, the latin1_safe macro is smart enough to account
2439                  * for all the unfolded characters, and because for UTF, the
2440                  * string will already have been folded earlier in the
2441                  * compilation process */
2442                 if (UTF) {
2443                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2444                         foldlen -= UTF8SKIP(uc);
2445                     }
2446                 }
2447                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2448                     foldlen--;
2449                 }
2450             }
2451
2452             /* The current character (and any potential folds) should be added
2453              * to the possible matching characters for this position in this
2454              * branch */
2455             if ( uvc < 256 ) {
2456                 if ( folder ) {
2457                     U8 folded= folder[ (U8) uvc ];
2458                     if ( !trie->charmap[ folded ] ) {
2459                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2460                         TRIE_STORE_REVCHAR( folded );
2461                     }
2462                 }
2463                 if ( !trie->charmap[ uvc ] ) {
2464                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2465                     TRIE_STORE_REVCHAR( uvc );
2466                 }
2467                 if ( set_bit ) {
2468                     /* store the codepoint in the bitmap, and its folded
2469                      * equivalent. */
2470                     TRIE_BITMAP_SET(trie, uvc);
2471
2472                     /* store the folded codepoint */
2473                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2474
2475                     if ( !UTF ) {
2476                         /* store first byte of utf8 representation of
2477                            variant codepoints */
2478                         if (! UVCHR_IS_INVARIANT(uvc)) {
2479                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2480                         }
2481                     }
2482                     set_bit = 0; /* We've done our bit :-) */
2483                 }
2484             } else {
2485
2486                 /* XXX We could come up with the list of code points that fold
2487                  * to this using PL_utf8_foldclosures, except not for
2488                  * multi-char folds, as there may be multiple combinations
2489                  * there that could work, which needs to wait until runtime to
2490                  * resolve (The comment about LIGATURE FFI above is such an
2491                  * example */
2492
2493                 SV** svpp;
2494                 if ( !widecharmap )
2495                     widecharmap = newHV();
2496
2497                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2498
2499                 if ( !svpp )
2500                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2501
2502                 if ( !SvTRUE( *svpp ) ) {
2503                     sv_setiv( *svpp, ++trie->uniquecharcount );
2504                     TRIE_STORE_REVCHAR(uvc);
2505                 }
2506             }
2507         } /* end loop through characters in this branch of the trie */
2508
2509         /* We take the min and max for this branch and combine to find the min
2510          * and max for all branches processed so far */
2511         if( cur == first ) {
2512             trie->minlen = minchars;
2513             trie->maxlen = maxchars;
2514         } else if (minchars < trie->minlen) {
2515             trie->minlen = minchars;
2516         } else if (maxchars > trie->maxlen) {
2517             trie->maxlen = maxchars;
2518         }
2519     } /* end first pass */
2520     DEBUG_TRIE_COMPILE_r(
2521         PerlIO_printf( Perl_debug_log,
2522                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2523                 (int)depth * 2 + 2,"",
2524                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2525                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2526                 (int)trie->minlen, (int)trie->maxlen )
2527     );
2528
2529     /*
2530         We now know what we are dealing with in terms of unique chars and
2531         string sizes so we can calculate how much memory a naive
2532         representation using a flat table  will take. If it's over a reasonable
2533         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2534         conservative but potentially much slower representation using an array
2535         of lists.
2536
2537         At the end we convert both representations into the same compressed
2538         form that will be used in regexec.c for matching with. The latter
2539         is a form that cannot be used to construct with but has memory
2540         properties similar to the list form and access properties similar
2541         to the table form making it both suitable for fast searches and
2542         small enough that its feasable to store for the duration of a program.
2543
2544         See the comment in the code where the compressed table is produced
2545         inplace from the flat tabe representation for an explanation of how
2546         the compression works.
2547
2548     */
2549
2550
2551     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2552     prev_states[1] = 0;
2553
2554     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2555                                                     > SvIV(re_trie_maxbuff) )
2556     {
2557         /*
2558             Second Pass -- Array Of Lists Representation
2559
2560             Each state will be represented by a list of charid:state records
2561             (reg_trie_trans_le) the first such element holds the CUR and LEN
2562             points of the allocated array. (See defines above).
2563
2564             We build the initial structure using the lists, and then convert
2565             it into the compressed table form which allows faster lookups
2566             (but cant be modified once converted).
2567         */
2568
2569         STRLEN transcount = 1;
2570
2571         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2572             "%*sCompiling trie using list compiler\n",
2573             (int)depth * 2 + 2, ""));
2574
2575         trie->states = (reg_trie_state *)
2576             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2577                                   sizeof(reg_trie_state) );
2578         TRIE_LIST_NEW(1);
2579         next_alloc = 2;
2580
2581         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2582
2583             regnode *noper   = NEXTOPER( cur );
2584             U8 *uc           = (U8*)STRING( noper );
2585             const U8 *e      = uc + STR_LEN( noper );
2586             U32 state        = 1;         /* required init */
2587             U16 charid       = 0;         /* sanity init */
2588             U32 wordlen      = 0;         /* required init */
2589
2590             if (OP(noper) == NOTHING) {
2591                 regnode *noper_next= regnext(noper);
2592                 if (noper_next != tail && OP(noper_next) == flags) {
2593                     noper = noper_next;
2594                     uc= (U8*)STRING(noper);
2595                     e= uc + STR_LEN(noper);
2596                 }
2597             }
2598
2599             if (OP(noper) != NOTHING) {
2600                 for ( ; uc < e ; uc += len ) {
2601
2602                     TRIE_READ_CHAR;
2603
2604                     if ( uvc < 256 ) {
2605                         charid = trie->charmap[ uvc ];
2606                     } else {
2607                         SV** const svpp = hv_fetch( widecharmap,
2608                                                     (char*)&uvc,
2609                                                     sizeof( UV ),
2610                                                     0);
2611                         if ( !svpp ) {
2612                             charid = 0;
2613                         } else {
2614                             charid=(U16)SvIV( *svpp );
2615                         }
2616                     }
2617                     /* charid is now 0 if we dont know the char read, or
2618                      * nonzero if we do */
2619                     if ( charid ) {
2620
2621                         U16 check;
2622                         U32 newstate = 0;
2623
2624                         charid--;
2625                         if ( !trie->states[ state ].trans.list ) {
2626                             TRIE_LIST_NEW( state );
2627                         }
2628                         for ( check = 1;
2629                               check <= TRIE_LIST_USED( state );
2630                               check++ )
2631                         {
2632                             if ( TRIE_LIST_ITEM( state, check ).forid
2633                                                                     == charid )
2634                             {
2635                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2636                                 break;
2637                             }
2638                         }
2639                         if ( ! newstate ) {
2640                             newstate = next_alloc++;
2641                             prev_states[newstate] = state;
2642                             TRIE_LIST_PUSH( state, charid, newstate );
2643                             transcount++;
2644                         }
2645                         state = newstate;
2646                     } else {
2647                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2648                     }
2649                 }
2650             }
2651             TRIE_HANDLE_WORD(state);
2652
2653         } /* end second pass */
2654
2655         /* next alloc is the NEXT state to be allocated */
2656         trie->statecount = next_alloc;
2657         trie->states = (reg_trie_state *)
2658             PerlMemShared_realloc( trie->states,
2659                                    next_alloc
2660                                    * sizeof(reg_trie_state) );
2661
2662         /* and now dump it out before we compress it */
2663         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2664                                                          revcharmap, next_alloc,
2665                                                          depth+1)
2666         );
2667
2668         trie->trans = (reg_trie_trans *)
2669             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2670         {
2671             U32 state;
2672             U32 tp = 0;
2673             U32 zp = 0;
2674
2675
2676             for( state=1 ; state < next_alloc ; state ++ ) {
2677                 U32 base=0;
2678
2679                 /*
2680                 DEBUG_TRIE_COMPILE_MORE_r(
2681                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2682                 );
2683                 */
2684
2685                 if (trie->states[state].trans.list) {
2686                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2687                     U16 maxid=minid;
2688                     U16 idx;
2689
2690                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2691                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2692                         if ( forid < minid ) {
2693                             minid=forid;
2694                         } else if ( forid > maxid ) {
2695                             maxid=forid;
2696                         }
2697                     }
2698                     if ( transcount < tp + maxid - minid + 1) {
2699                         transcount *= 2;
2700                         trie->trans = (reg_trie_trans *)
2701                             PerlMemShared_realloc( trie->trans,
2702                                                      transcount
2703                                                      * sizeof(reg_trie_trans) );
2704                         Zero( trie->trans + (transcount / 2),
2705                               transcount / 2,
2706                               reg_trie_trans );
2707                     }
2708                     base = trie->uniquecharcount + tp - minid;
2709                     if ( maxid == minid ) {
2710                         U32 set = 0;
2711                         for ( ; zp < tp ; zp++ ) {
2712                             if ( ! trie->trans[ zp ].next ) {
2713                                 base = trie->uniquecharcount + zp - minid;
2714                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2715                                                                    1).newstate;
2716                                 trie->trans[ zp ].check = state;
2717                                 set = 1;
2718                                 break;
2719                             }
2720                         }
2721                         if ( !set ) {
2722                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2723                                                                    1).newstate;
2724                             trie->trans[ tp ].check = state;
2725                             tp++;
2726                             zp = tp;
2727                         }
2728                     } else {
2729                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2730                             const U32 tid = base
2731                                            - trie->uniquecharcount
2732                                            + TRIE_LIST_ITEM( state, idx ).forid;
2733                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2734                                                                 idx ).newstate;
2735                             trie->trans[ tid ].check = state;
2736                         }
2737                         tp += ( maxid - minid + 1 );
2738                     }
2739                     Safefree(trie->states[ state ].trans.list);
2740                 }
2741                 /*
2742                 DEBUG_TRIE_COMPILE_MORE_r(
2743                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2744                 );
2745                 */
2746                 trie->states[ state ].trans.base=base;
2747             }
2748             trie->lasttrans = tp + 1;
2749         }
2750     } else {
2751         /*
2752            Second Pass -- Flat Table Representation.
2753
2754            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2755            each.  We know that we will need Charcount+1 trans at most to store
2756            the data (one row per char at worst case) So we preallocate both
2757            structures assuming worst case.
2758
2759            We then construct the trie using only the .next slots of the entry
2760            structs.
2761
2762            We use the .check field of the first entry of the node temporarily
2763            to make compression both faster and easier by keeping track of how
2764            many non zero fields are in the node.
2765
2766            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2767            transition.
2768
2769            There are two terms at use here: state as a TRIE_NODEIDX() which is
2770            a number representing the first entry of the node, and state as a
2771            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2772            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2773            if there are 2 entrys per node. eg:
2774
2775              A B       A B
2776           1. 2 4    1. 3 7
2777           2. 0 3    3. 0 5
2778           3. 0 0    5. 0 0
2779           4. 0 0    7. 0 0
2780
2781            The table is internally in the right hand, idx form. However as we
2782            also have to deal with the states array which is indexed by nodenum
2783            we have to use TRIE_NODENUM() to convert.
2784
2785         */
2786         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2787             "%*sCompiling trie using table compiler\n",
2788             (int)depth * 2 + 2, ""));
2789
2790         trie->trans = (reg_trie_trans *)
2791             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2792                                   * trie->uniquecharcount + 1,
2793                                   sizeof(reg_trie_trans) );
2794         trie->states = (reg_trie_state *)
2795             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2796                                   sizeof(reg_trie_state) );
2797         next_alloc = trie->uniquecharcount + 1;
2798
2799
2800         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2801
2802             regnode *noper   = NEXTOPER( cur );
2803             const U8 *uc     = (U8*)STRING( noper );
2804             const U8 *e      = uc + STR_LEN( noper );
2805
2806             U32 state        = 1;         /* required init */
2807
2808             U16 charid       = 0;         /* sanity init */
2809             U32 accept_state = 0;         /* sanity init */
2810
2811             U32 wordlen      = 0;         /* required init */
2812
2813             if (OP(noper) == NOTHING) {
2814                 regnode *noper_next= regnext(noper);
2815                 if (noper_next != tail && OP(noper_next) == flags) {
2816                     noper = noper_next;
2817                     uc= (U8*)STRING(noper);
2818                     e= uc + STR_LEN(noper);
2819                 }
2820             }
2821
2822             if ( OP(noper) != NOTHING ) {
2823                 for ( ; uc < e ; uc += len ) {
2824
2825                     TRIE_READ_CHAR;
2826
2827                     if ( uvc < 256 ) {
2828                         charid = trie->charmap[ uvc ];
2829                     } else {
2830                         SV* const * const svpp = hv_fetch( widecharmap,
2831                                                            (char*)&uvc,
2832                                                            sizeof( UV ),
2833                                                            0);
2834                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2835                     }
2836                     if ( charid ) {
2837                         charid--;
2838                         if ( !trie->trans[ state + charid ].next ) {
2839                             trie->trans[ state + charid ].next = next_alloc;
2840                             trie->trans[ state ].check++;
2841                             prev_states[TRIE_NODENUM(next_alloc)]
2842                                     = TRIE_NODENUM(state);
2843                             next_alloc += trie->uniquecharcount;
2844                         }
2845                         state = trie->trans[ state + charid ].next;
2846                     } else {
2847                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2848                     }
2849                     /* charid is now 0 if we dont know the char read, or
2850                      * nonzero if we do */
2851                 }
2852             }
2853             accept_state = TRIE_NODENUM( state );
2854             TRIE_HANDLE_WORD(accept_state);
2855
2856         } /* end second pass */
2857
2858         /* and now dump it out before we compress it */
2859         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2860                                                           revcharmap,
2861                                                           next_alloc, depth+1));
2862
2863         {
2864         /*
2865            * Inplace compress the table.*
2866
2867            For sparse data sets the table constructed by the trie algorithm will
2868            be mostly 0/FAIL transitions or to put it another way mostly empty.
2869            (Note that leaf nodes will not contain any transitions.)
2870
2871            This algorithm compresses the tables by eliminating most such
2872            transitions, at the cost of a modest bit of extra work during lookup:
2873
2874            - Each states[] entry contains a .base field which indicates the
2875            index in the state[] array wheres its transition data is stored.
2876
2877            - If .base is 0 there are no valid transitions from that node.
2878
2879            - If .base is nonzero then charid is added to it to find an entry in
2880            the trans array.
2881
2882            -If trans[states[state].base+charid].check!=state then the
2883            transition is taken to be a 0/Fail transition. Thus if there are fail
2884            transitions at the front of the node then the .base offset will point
2885            somewhere inside the previous nodes data (or maybe even into a node
2886            even earlier), but the .check field determines if the transition is
2887            valid.
2888
2889            XXX - wrong maybe?
2890            The following process inplace converts the table to the compressed
2891            table: We first do not compress the root node 1,and mark all its
2892            .check pointers as 1 and set its .base pointer as 1 as well. This
2893            allows us to do a DFA construction from the compressed table later,
2894            and ensures that any .base pointers we calculate later are greater
2895            than 0.
2896
2897            - We set 'pos' to indicate the first entry of the second node.
2898
2899            - We then iterate over the columns of the node, finding the first and
2900            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2901            and set the .check pointers accordingly, and advance pos
2902            appropriately and repreat for the next node. Note that when we copy
2903            the next pointers we have to convert them from the original
2904            NODEIDX form to NODENUM form as the former is not valid post
2905            compression.
2906
2907            - If a node has no transitions used we mark its base as 0 and do not
2908            advance the pos pointer.
2909
2910            - If a node only has one transition we use a second pointer into the
2911            structure to fill in allocated fail transitions from other states.
2912            This pointer is independent of the main pointer and scans forward
2913            looking for null transitions that are allocated to a state. When it
2914            finds one it writes the single transition into the "hole".  If the
2915            pointer doesnt find one the single transition is appended as normal.
2916
2917            - Once compressed we can Renew/realloc the structures to release the
2918            excess space.
2919
2920            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2921            specifically Fig 3.47 and the associated pseudocode.
2922
2923            demq
2924         */
2925         const U32 laststate = TRIE_NODENUM( next_alloc );
2926         U32 state, charid;
2927         U32 pos = 0, zp=0;
2928         trie->statecount = laststate;
2929
2930         for ( state = 1 ; state < laststate ; state++ ) {
2931             U8 flag = 0;
2932             const U32 stateidx = TRIE_NODEIDX( state );
2933             const U32 o_used = trie->trans[ stateidx ].check;
2934             U32 used = trie->trans[ stateidx ].check;
2935             trie->trans[ stateidx ].check = 0;
2936
2937             for ( charid = 0;
2938                   used && charid < trie->uniquecharcount;
2939                   charid++ )
2940             {
2941                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2942                     if ( trie->trans[ stateidx + charid ].next ) {
2943                         if (o_used == 1) {
2944                             for ( ; zp < pos ; zp++ ) {
2945                                 if ( ! trie->trans[ zp ].next ) {
2946                                     break;
2947                                 }
2948                             }
2949                             trie->states[ state ].trans.base
2950                                                     = zp
2951                                                       + trie->uniquecharcount
2952                                                       - charid ;
2953                             trie->trans[ zp ].next
2954                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2955                                                              + charid ].next );
2956                             trie->trans[ zp ].check = state;
2957                             if ( ++zp > pos ) pos = zp;
2958                             break;
2959                         }
2960                         used--;
2961                     }
2962                     if ( !flag ) {
2963                         flag = 1;
2964                         trie->states[ state ].trans.base
2965                                        = pos + trie->uniquecharcount - charid ;
2966                     }
2967                     trie->trans[ pos ].next
2968                         = SAFE_TRIE_NODENUM(
2969                                        trie->trans[ stateidx + charid ].next );
2970                     trie->trans[ pos ].check = state;
2971                     pos++;
2972                 }
2973             }
2974         }
2975         trie->lasttrans = pos + 1;
2976         trie->states = (reg_trie_state *)
2977             PerlMemShared_realloc( trie->states, laststate
2978                                    * sizeof(reg_trie_state) );
2979         DEBUG_TRIE_COMPILE_MORE_r(
2980             PerlIO_printf( Perl_debug_log,
2981                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2982                 (int)depth * 2 + 2,"",
2983                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2984                        + 1 ),
2985                 (IV)next_alloc,
2986                 (IV)pos,
2987                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2988             );
2989
2990         } /* end table compress */
2991     }
2992     DEBUG_TRIE_COMPILE_MORE_r(
2993             PerlIO_printf(Perl_debug_log,
2994                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2995                 (int)depth * 2 + 2, "",
2996                 (UV)trie->statecount,
2997                 (UV)trie->lasttrans)
2998     );
2999     /* resize the trans array to remove unused space */
3000     trie->trans = (reg_trie_trans *)
3001         PerlMemShared_realloc( trie->trans, trie->lasttrans
3002                                * sizeof(reg_trie_trans) );
3003
3004     {   /* Modify the program and insert the new TRIE node */
3005         U8 nodetype =(U8)(flags & 0xFF);
3006         char *str=NULL;
3007
3008 #ifdef DEBUGGING
3009         regnode *optimize = NULL;
3010 #ifdef RE_TRACK_PATTERN_OFFSETS
3011
3012         U32 mjd_offset = 0;
3013         U32 mjd_nodelen = 0;
3014 #endif /* RE_TRACK_PATTERN_OFFSETS */
3015 #endif /* DEBUGGING */
3016         /*
3017            This means we convert either the first branch or the first Exact,
3018            depending on whether the thing following (in 'last') is a branch
3019            or not and whther first is the startbranch (ie is it a sub part of
3020            the alternation or is it the whole thing.)
3021            Assuming its a sub part we convert the EXACT otherwise we convert
3022            the whole branch sequence, including the first.
3023          */
3024         /* Find the node we are going to overwrite */
3025         if ( first != startbranch || OP( last ) == BRANCH ) {
3026             /* branch sub-chain */
3027             NEXT_OFF( first ) = (U16)(last - first);
3028 #ifdef RE_TRACK_PATTERN_OFFSETS
3029             DEBUG_r({
3030                 mjd_offset= Node_Offset((convert));
3031                 mjd_nodelen= Node_Length((convert));
3032             });
3033 #endif
3034             /* whole branch chain */
3035         }
3036 #ifdef RE_TRACK_PATTERN_OFFSETS
3037         else {
3038             DEBUG_r({
3039                 const  regnode *nop = NEXTOPER( convert );
3040                 mjd_offset= Node_Offset((nop));
3041                 mjd_nodelen= Node_Length((nop));
3042             });
3043         }
3044         DEBUG_OPTIMISE_r(
3045             PerlIO_printf(Perl_debug_log,
3046                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
3047                 (int)depth * 2 + 2, "",
3048                 (UV)mjd_offset, (UV)mjd_nodelen)
3049         );
3050 #endif
3051         /* But first we check to see if there is a common prefix we can
3052            split out as an EXACT and put in front of the TRIE node.  */
3053         trie->startstate= 1;
3054         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3055             U32 state;
3056             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3057                 U32 ofs = 0;
3058                 I32 idx = -1;
3059                 U32 count = 0;
3060                 const U32 base = trie->states[ state ].trans.base;
3061
3062                 if ( trie->states[state].wordnum )
3063                         count = 1;
3064
3065                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3066                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3067                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3068                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3069                     {
3070                         if ( ++count > 1 ) {
3071                             SV **tmp = av_fetch( revcharmap, ofs, 0);
3072                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
3073                             if ( state == 1 ) break;
3074                             if ( count == 2 ) {
3075                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3076                                 DEBUG_OPTIMISE_r(
3077                                     PerlIO_printf(Perl_debug_log,
3078                                         "%*sNew Start State=%"UVuf" Class: [",
3079                                         (int)depth * 2 + 2, "",
3080                                         (UV)state));
3081                                 if (idx >= 0) {
3082                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
3083                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3084
3085                                     TRIE_BITMAP_SET(trie,*ch);
3086                                     if ( folder )
3087                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
3088                                     DEBUG_OPTIMISE_r(
3089                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3090                                     );
3091                                 }
3092                             }
3093                             TRIE_BITMAP_SET(trie,*ch);
3094                             if ( folder )
3095                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
3096                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
3097                         }
3098                         idx = ofs;
3099                     }
3100                 }
3101                 if ( count == 1 ) {
3102                     SV **tmp = av_fetch( revcharmap, idx, 0);
3103                     STRLEN len;
3104                     char *ch = SvPV( *tmp, len );
3105                     DEBUG_OPTIMISE_r({
3106                         SV *sv=sv_newmortal();
3107                         PerlIO_printf( Perl_debug_log,
3108                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
3109                             (int)depth * 2 + 2, "",
3110                             (UV)state, (UV)idx,
3111                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3112                                 PL_colors[0], PL_colors[1],
3113                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3114                                 PERL_PV_ESCAPE_FIRSTCHAR
3115                             )
3116                         );
3117                     });
3118                     if ( state==1 ) {
3119                         OP( convert ) = nodetype;
3120                         str=STRING(convert);
3121                         STR_LEN(convert)=0;
3122                     }
3123                     STR_LEN(convert) += len;
3124                     while (len--)
3125                         *str++ = *ch++;
3126                 } else {
3127 #ifdef DEBUGGING
3128                     if (state>1)
3129                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
3130 #endif
3131                     break;
3132                 }
3133             }
3134             trie->prefixlen = (state-1);
3135             if (str) {
3136                 regnode *n = convert+NODE_SZ_STR(convert);
3137                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3138                 trie->startstate = state;
3139                 trie->minlen -= (state - 1);
3140                 trie->maxlen -= (state - 1);
3141 #ifdef DEBUGGING
3142                /* At least the UNICOS C compiler choked on this
3143                 * being argument to DEBUG_r(), so let's just have
3144                 * it right here. */
3145                if (
3146 #ifdef PERL_EXT_RE_BUILD
3147                    1
3148 #else
3149                    DEBUG_r_TEST
3150 #endif
3151                    ) {
3152                    regnode *fix = convert;
3153                    U32 word = trie->wordcount;
3154                    mjd_nodelen++;
3155                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3156                    while( ++fix < n ) {
3157                        Set_Node_Offset_Length(fix, 0, 0);
3158                    }
3159                    while (word--) {
3160                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3161                        if (tmp) {
3162                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3163                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3164                            else
3165                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3166                        }
3167                    }
3168                }
3169 #endif
3170                 if (trie->maxlen) {
3171                     convert = n;
3172                 } else {
3173                     NEXT_OFF(convert) = (U16)(tail - convert);
3174                     DEBUG_r(optimize= n);
3175                 }
3176             }
3177         }
3178         if (!jumper)
3179             jumper = last;
3180         if ( trie->maxlen ) {
3181             NEXT_OFF( convert ) = (U16)(tail - convert);
3182             ARG_SET( convert, data_slot );
3183             /* Store the offset to the first unabsorbed branch in
3184                jump[0], which is otherwise unused by the jump logic.
3185                We use this when dumping a trie and during optimisation. */
3186             if (trie->jump)
3187                 trie->jump[0] = (U16)(nextbranch - convert);
3188
3189             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3190              *   and there is a bitmap
3191              *   and the first "jump target" node we found leaves enough room
3192              * then convert the TRIE node into a TRIEC node, with the bitmap
3193              * embedded inline in the opcode - this is hypothetically faster.
3194              */
3195             if ( !trie->states[trie->startstate].wordnum
3196                  && trie->bitmap
3197                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3198             {
3199                 OP( convert ) = TRIEC;
3200                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3201                 PerlMemShared_free(trie->bitmap);
3202                 trie->bitmap= NULL;
3203             } else
3204                 OP( convert ) = TRIE;
3205
3206             /* store the type in the flags */
3207             convert->flags = nodetype;
3208             DEBUG_r({
3209             optimize = convert
3210                       + NODE_STEP_REGNODE
3211                       + regarglen[ OP( convert ) ];
3212             });
3213             /* XXX We really should free up the resource in trie now,
3214                    as we won't use them - (which resources?) dmq */
3215         }
3216         /* needed for dumping*/
3217         DEBUG_r(if (optimize) {
3218             regnode *opt = convert;
3219
3220             while ( ++opt < optimize) {
3221                 Set_Node_Offset_Length(opt,0,0);
3222             }
3223             /*
3224                 Try to clean up some of the debris left after the
3225                 optimisation.
3226              */
3227             while( optimize < jumper ) {
3228                 mjd_nodelen += Node_Length((optimize));
3229                 OP( optimize ) = OPTIMIZED;
3230                 Set_Node_Offset_Length(optimize,0,0);
3231                 optimize++;
3232             }
3233             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3234         });
3235     } /* end node insert */
3236
3237     /*  Finish populating the prev field of the wordinfo array.  Walk back
3238      *  from each accept state until we find another accept state, and if
3239      *  so, point the first word's .prev field at the second word. If the
3240      *  second already has a .prev field set, stop now. This will be the
3241      *  case either if we've already processed that word's accept state,
3242      *  or that state had multiple words, and the overspill words were
3243      *  already linked up earlier.
3244      */
3245     {
3246         U16 word;
3247         U32 state;
3248         U16 prev;
3249
3250         for (word=1; word <= trie->wordcount; word++) {
3251             prev = 0;
3252             if (trie->wordinfo[word].prev)
3253                 continue;
3254             state = trie->wordinfo[word].accept;
3255             while (state) {
3256                 state = prev_states[state];
3257                 if (!state)
3258                     break;
3259                 prev = trie->states[state].wordnum;
3260                 if (prev)
3261                     break;
3262             }
3263             trie->wordinfo[word].prev = prev;
3264         }
3265         Safefree(prev_states);
3266     }
3267
3268
3269     /* and now dump out the compressed format */
3270     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3271
3272     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3273 #ifdef DEBUGGING
3274     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3275     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3276 #else
3277     SvREFCNT_dec_NN(revcharmap);
3278 #endif
3279     return trie->jump
3280            ? MADE_JUMP_TRIE
3281            : trie->startstate>1
3282              ? MADE_EXACT_TRIE
3283              : MADE_TRIE;
3284 }
3285
3286 STATIC regnode *
3287 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3288 {
3289 /* The Trie is constructed and compressed now so we can build a fail array if
3290  * it's needed
3291
3292    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3293    3.32 in the
3294    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3295    Ullman 1985/88
3296    ISBN 0-201-10088-6
3297
3298    We find the fail state for each state in the trie, this state is the longest
3299    proper suffix of the current state's 'word' that is also a proper prefix of
3300    another word in our trie. State 1 represents the word '' and is thus the
3301    default fail state. This allows the DFA not to have to restart after its
3302    tried and failed a word at a given point, it simply continues as though it
3303    had been matching the other word in the first place.
3304    Consider
3305       'abcdgu'=~/abcdefg|cdgu/
3306    When we get to 'd' we are still matching the first word, we would encounter
3307    'g' which would fail, which would bring us to the state representing 'd' in
3308    the second word where we would try 'g' and succeed, proceeding to match
3309    'cdgu'.
3310  */
3311  /* add a fail transition */
3312     const U32 trie_offset = ARG(source);
3313     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3314     U32 *q;
3315     const U32 ucharcount = trie->uniquecharcount;
3316     const U32 numstates = trie->statecount;
3317     const U32 ubound = trie->lasttrans + ucharcount;
3318     U32 q_read = 0;
3319     U32 q_write = 0;
3320     U32 charid;
3321     U32 base = trie->states[ 1 ].trans.base;
3322     U32 *fail;
3323     reg_ac_data *aho;
3324     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3325     regnode *stclass;
3326     GET_RE_DEBUG_FLAGS_DECL;
3327
3328     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3329     PERL_UNUSED_CONTEXT;
3330 #ifndef DEBUGGING
3331     PERL_UNUSED_ARG(depth);
3332 #endif
3333
3334     if ( OP(source) == TRIE ) {
3335         struct regnode_1 *op = (struct regnode_1 *)
3336             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3337         StructCopy(source,op,struct regnode_1);
3338         stclass = (regnode *)op;
3339     } else {
3340         struct regnode_charclass *op = (struct regnode_charclass *)
3341             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3342         StructCopy(source,op,struct regnode_charclass);
3343         stclass = (regnode *)op;
3344     }
3345     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3346
3347     ARG_SET( stclass, data_slot );
3348     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3349     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3350     aho->trie=trie_offset;
3351     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3352     Copy( trie->states, aho->states, numstates, reg_trie_state );
3353     Newxz( q, numstates, U32);
3354     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3355     aho->refcount = 1;
3356     fail = aho->fail;
3357     /* initialize fail[0..1] to be 1 so that we always have
3358        a valid final fail state */
3359     fail[ 0 ] = fail[ 1 ] = 1;
3360
3361     for ( charid = 0; charid < ucharcount ; charid++ ) {
3362         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3363         if ( newstate ) {
3364             q[ q_write ] = newstate;
3365             /* set to point at the root */
3366             fail[ q[ q_write++ ] ]=1;
3367         }
3368     }
3369     while ( q_read < q_write) {
3370         const U32 cur = q[ q_read++ % numstates ];
3371         base = trie->states[ cur ].trans.base;
3372
3373         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3374             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3375             if (ch_state) {
3376                 U32 fail_state = cur;
3377                 U32 fail_base;
3378                 do {
3379                     fail_state = fail[ fail_state ];
3380                     fail_base = aho->states[ fail_state ].trans.base;
3381                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3382
3383                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3384                 fail[ ch_state ] = fail_state;
3385                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3386                 {
3387                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3388                 }
3389                 q[ q_write++ % numstates] = ch_state;
3390             }
3391         }
3392     }
3393     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3394        when we fail in state 1, this allows us to use the
3395        charclass scan to find a valid start char. This is based on the principle
3396        that theres a good chance the string being searched contains lots of stuff
3397        that cant be a start char.
3398      */
3399     fail[ 0 ] = fail[ 1 ] = 0;
3400     DEBUG_TRIE_COMPILE_r({
3401         PerlIO_printf(Perl_debug_log,
3402                       "%*sStclass Failtable (%"UVuf" states): 0",
3403                       (int)(depth * 2), "", (UV)numstates
3404         );
3405         for( q_read=1; q_read<numstates; q_read++ ) {
3406             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3407         }
3408         PerlIO_printf(Perl_debug_log, "\n");
3409     });
3410     Safefree(q);
3411     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3412     return stclass;
3413 }
3414
3415
3416 #define DEBUG_PEEP(str,scan,depth) \
3417     DEBUG_OPTIMISE_r({if (scan){ \
3418        regnode *Next = regnext(scan); \
3419        regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3420        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3421            (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3422            Next ? (REG_NODE_NUM(Next)) : 0 ); \
3423        DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3424        PerlIO_printf(Perl_debug_log, "\n"); \
3425    }});
3426
3427 /* The below joins as many adjacent EXACTish nodes as possible into a single
3428  * one.  The regop may be changed if the node(s) contain certain sequences that
3429  * require special handling.  The joining is only done if:
3430  * 1) there is room in the current conglomerated node to entirely contain the
3431  *    next one.
3432  * 2) they are the exact same node type
3433  *
3434  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3435  * these get optimized out
3436  *
3437  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3438  * as possible, even if that means splitting an existing node so that its first
3439  * part is moved to the preceeding node.  This would maximise the efficiency of
3440  * memEQ during matching.  Elsewhere in this file, khw proposes splitting
3441  * EXACTFish nodes into portions that don't change under folding vs those that
3442  * do.  Those portions that don't change may be the only things in the pattern that
3443  * could be used to find fixed and floating strings.
3444  *
3445  * If a node is to match under /i (folded), the number of characters it matches
3446  * can be different than its character length if it contains a multi-character
3447  * fold.  *min_subtract is set to the total delta number of characters of the
3448  * input nodes.
3449  *
3450  * And *unfolded_multi_char is set to indicate whether or not the node contains
3451  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3452  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3453  * SMALL LETTER SHARP S, as only if the target string being matched against
3454  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3455  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3456  * whose components are all above the Latin1 range are not run-time locale
3457  * dependent, and have already been folded by the time this function is
3458  * called.)
3459  *
3460  * This is as good a place as any to discuss the design of handling these
3461  * multi-character fold sequences.  It's been wrong in Perl for a very long
3462  * time.  There are three code points in Unicode whose multi-character folds
3463  * were long ago discovered to mess things up.  The previous designs for
3464  * dealing with these involved assigning a special node for them.  This
3465  * approach doesn't always work, as evidenced by this example:
3466  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3467  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3468  * would match just the \xDF, it won't be able to handle the case where a
3469  * successful match would have to cross the node's boundary.  The new approach
3470  * that hopefully generally solves the problem generates an EXACTFU_SS node
3471  * that is "sss" in this case.
3472  *
3473  * It turns out that there are problems with all multi-character folds, and not
3474  * just these three.  Now the code is general, for all such cases.  The
3475  * approach taken is:
3476  * 1)   This routine examines each EXACTFish node that could contain multi-
3477  *      character folded sequences.  Since a single character can fold into
3478  *      such a sequence, the minimum match length for this node is less than
3479  *      the number of characters in the node.  This routine returns in
3480  *      *min_subtract how many characters to subtract from the the actual
3481  *      length of the string to get a real minimum match length; it is 0 if
3482  *      there are no multi-char foldeds.  This delta is used by the caller to
3483  *      adjust the min length of the match, and the delta between min and max,
3484  *      so that the optimizer doesn't reject these possibilities based on size
3485  *      constraints.
3486  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3487  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3488  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3489  *      there is a possible fold length change.  That means that a regular
3490  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3491  *      with length changes, and so can be processed faster.  regexec.c takes
3492  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3493  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3494  *      known until runtime).  This saves effort in regex matching.  However,
3495  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3496  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3497  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3498  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3499  *      possibilities for the non-UTF8 patterns are quite simple, except for
3500  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3501  *      members of a fold-pair, and arrays are set up for all of them so that
3502  *      the other member of the pair can be found quickly.  Code elsewhere in
3503  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3504  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3505  *      described in the next item.
3506  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3507  *      validity of the fold won't be known until runtime, and so must remain
3508  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3509  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3510  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3511  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3512  *      The reason this is a problem is that the optimizer part of regexec.c
3513  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3514  *      that a character in the pattern corresponds to at most a single
3515  *      character in the target string.  (And I do mean character, and not byte
3516  *      here, unlike other parts of the documentation that have never been
3517  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3518  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3519  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3520  *      nodes, violate the assumption, and they are the only instances where it
3521  *      is violated.  I'm reluctant to try to change the assumption, as the
3522  *      code involved is impenetrable to me (khw), so instead the code here
3523  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3524  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3525  *      boolean indicating whether or not the node contains such a fold.  When
3526  *      it is true, the caller sets a flag that later causes the optimizer in
3527  *      this file to not set values for the floating and fixed string lengths,
3528  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3529  *      assumption.  Thus, there is no optimization based on string lengths for
3530  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3531  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3532  *      assumption is wrong only in these cases is that all other non-UTF-8
3533  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3534  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3535  *      EXACTF nodes because we don't know at compile time if it actually
3536  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3537  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3538  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3539  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3540  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3541  *      string would require the pattern to be forced into UTF-8, the overhead
3542  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3543  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3544  *      locale.)
3545  *
3546  *      Similarly, the code that generates tries doesn't currently handle
3547  *      not-already-folded multi-char folds, and it looks like a pain to change
3548  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3549  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3550  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3551  *      using /iaa matching will be doing so almost entirely with ASCII
3552  *      strings, so this should rarely be encountered in practice */
3553
3554 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3555     if (PL_regkind[OP(scan)] == EXACT) \
3556         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3557
3558 STATIC U32
3559 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3560                    UV *min_subtract, bool *unfolded_multi_char,
3561                    U32 flags,regnode *val, U32 depth)
3562 {
3563     /* Merge several consecutive EXACTish nodes into one. */
3564     regnode *n = regnext(scan);
3565     U32 stringok = 1;
3566     regnode *next = scan + NODE_SZ_STR(scan);
3567     U32 merged = 0;
3568     U32 stopnow = 0;
3569 #ifdef DEBUGGING
3570     regnode *stop = scan;
3571     GET_RE_DEBUG_FLAGS_DECL;
3572 #else
3573     PERL_UNUSED_ARG(depth);
3574 #endif
3575
3576     PERL_ARGS_ASSERT_JOIN_EXACT;
3577 #ifndef EXPERIMENTAL_INPLACESCAN
3578     PERL_UNUSED_ARG(flags);
3579     PERL_UNUSED_ARG(val);
3580 #endif
3581     DEBUG_PEEP("join",scan,depth);
3582
3583     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3584      * EXACT ones that are mergeable to the current one. */
3585     while (n
3586            && (PL_regkind[OP(n)] == NOTHING
3587                || (stringok && OP(n) == OP(scan)))
3588            && NEXT_OFF(n)
3589            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3590     {
3591
3592         if (OP(n) == TAIL || n > next)
3593             stringok = 0;
3594         if (PL_regkind[OP(n)] == NOTHING) {
3595             DEBUG_PEEP("skip:",n,depth);
3596             NEXT_OFF(scan) += NEXT_OFF(n);
3597             next = n + NODE_STEP_REGNODE;
3598 #ifdef DEBUGGING
3599             if (stringok)
3600                 stop = n;
3601 #endif
3602             n = regnext(n);
3603         }
3604         else if (stringok) {
3605             const unsigned int oldl = STR_LEN(scan);
3606             regnode * const nnext = regnext(n);
3607
3608             /* XXX I (khw) kind of doubt that this works on platforms (should
3609              * Perl ever run on one) where U8_MAX is above 255 because of lots
3610              * of other assumptions */
3611             /* Don't join if the sum can't fit into a single node */
3612             if (oldl + STR_LEN(n) > U8_MAX)
3613                 break;
3614
3615             DEBUG_PEEP("merg",n,depth);
3616             merged++;
3617
3618             NEXT_OFF(scan) += NEXT_OFF(n);
3619             STR_LEN(scan) += STR_LEN(n);
3620             next = n + NODE_SZ_STR(n);
3621             /* Now we can overwrite *n : */
3622             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3623 #ifdef DEBUGGING
3624             stop = next - 1;
3625 #endif
3626             n = nnext;
3627             if (stopnow) break;
3628         }
3629
3630 #ifdef EXPERIMENTAL_INPLACESCAN
3631         if (flags && !NEXT_OFF(n)) {
3632             DEBUG_PEEP("atch", val, depth);
3633             if (reg_off_by_arg[OP(n)]) {
3634                 ARG_SET(n, val - n);
3635             }
3636             else {
3637                 NEXT_OFF(n) = val - n;
3638             }
3639             stopnow = 1;
3640         }
3641 #endif
3642     }
3643
3644     *min_subtract = 0;
3645     *unfolded_multi_char = FALSE;
3646
3647     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3648      * can now analyze for sequences of problematic code points.  (Prior to
3649      * this final joining, sequences could have been split over boundaries, and
3650      * hence missed).  The sequences only happen in folding, hence for any
3651      * non-EXACT EXACTish node */
3652     if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3653         U8* s0 = (U8*) STRING(scan);
3654         U8* s = s0;
3655         U8* s_end = s0 + STR_LEN(scan);
3656
3657         int total_count_delta = 0;  /* Total delta number of characters that
3658                                        multi-char folds expand to */
3659
3660         /* One pass is made over the node's string looking for all the
3661          * possibilities.  To avoid some tests in the loop, there are two main
3662          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3663          * non-UTF-8 */
3664         if (UTF) {
3665             U8* folded = NULL;
3666
3667             if (OP(scan) == EXACTFL) {
3668                 U8 *d;
3669
3670                 /* An EXACTFL node would already have been changed to another
3671                  * node type unless there is at least one character in it that
3672                  * is problematic; likely a character whose fold definition
3673                  * won't be known until runtime, and so has yet to be folded.
3674                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3675                  * to handle the UTF-8 case, we need to create a temporary
3676                  * folded copy using UTF-8 locale rules in order to analyze it.
3677                  * This is because our macros that look to see if a sequence is
3678                  * a multi-char fold assume everything is folded (otherwise the
3679                  * tests in those macros would be too complicated and slow).
3680                  * Note that here, the non-problematic folds will have already
3681                  * been done, so we can just copy such characters.  We actually
3682                  * don't completely fold the EXACTFL string.  We skip the
3683                  * unfolded multi-char folds, as that would just create work
3684                  * below to figure out the size they already are */
3685
3686                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3687                 d = folded;
3688                 while (s < s_end) {
3689                     STRLEN s_len = UTF8SKIP(s);
3690                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3691                         Copy(s, d, s_len, U8);
3692                         d += s_len;
3693                     }
3694                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3695                         *unfolded_multi_char = TRUE;
3696                         Copy(s, d, s_len, U8);
3697                         d += s_len;
3698                     }
3699                     else if (isASCII(*s)) {
3700                         *(d++) = toFOLD(*s);
3701                     }
3702                     else {
3703                         STRLEN len;
3704                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3705                         d += len;
3706                     }
3707                     s += s_len;
3708                 }
3709
3710                 /* Point the remainder of the routine to look at our temporary
3711                  * folded copy */
3712                 s = folded;
3713                 s_end = d;
3714             } /* End of creating folded copy of EXACTFL string */
3715
3716             /* Examine the string for a multi-character fold sequence.  UTF-8
3717              * patterns have all characters pre-folded by the time this code is
3718              * executed */
3719             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3720                                      length sequence we are looking for is 2 */
3721             {
3722                 int count = 0;  /* How many characters in a multi-char fold */
3723                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3724                 if (! len) {    /* Not a multi-char fold: get next char */
3725                     s += UTF8SKIP(s);
3726                     continue;
3727                 }
3728
3729                 /* Nodes with 'ss' require special handling, except for
3730                  * EXACTFA-ish for which there is no multi-char fold to this */
3731                 if (len == 2 && *s == 's' && *(s+1) == 's'
3732                     && OP(scan) != EXACTFA
3733                     && OP(scan) != EXACTFA_NO_TRIE)
3734                 {
3735                     count = 2;
3736                     if (OP(scan) != EXACTFL) {
3737                         OP(scan) = EXACTFU_SS;
3738                     }
3739                     s += 2;
3740                 }
3741                 else { /* Here is a generic multi-char fold. */
3742                     U8* multi_end  = s + len;
3743
3744                     /* Count how many characters are in it.  In the case of
3745                      * /aa, no folds which contain ASCII code points are
3746                      * allowed, so check for those, and skip if found. */
3747                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3748                         count = utf8_length(s, multi_end);
3749                         s = multi_end;
3750                     }
3751                     else {
3752                         while (s < multi_end) {
3753                             if (isASCII(*s)) {
3754                                 s++;
3755                                 goto next_iteration;
3756                             }
3757                             else {
3758                                 s += UTF8SKIP(s);
3759                             }
3760                             count++;
3761                         }
3762                     }
3763                 }
3764
3765                 /* The delta is how long the sequence is minus 1 (1 is how long
3766                  * the character that folds to the sequence is) */
3767                 total_count_delta += count - 1;
3768               next_iteration: ;
3769             }
3770
3771             /* We created a temporary folded copy of the string in EXACTFL
3772              * nodes.  Therefore we need to be sure it doesn't go below zero,
3773              * as the real string could be shorter */
3774             if (OP(scan) == EXACTFL) {
3775                 int total_chars = utf8_length((U8*) STRING(scan),
3776                                            (U8*) STRING(scan) + STR_LEN(scan));
3777                 if (total_count_delta > total_chars) {
3778                     total_count_delta = total_chars;
3779                 }
3780             }
3781
3782             *min_subtract += total_count_delta;
3783             Safefree(folded);
3784         }
3785         else if (OP(scan) == EXACTFA) {
3786
3787             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3788              * fold to the ASCII range (and there are no existing ones in the
3789              * upper latin1 range).  But, as outlined in the comments preceding
3790              * this function, we need to flag any occurrences of the sharp s.
3791              * This character forbids trie formation (because of added
3792              * complexity) */
3793 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
3794    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
3795                                       || UNICODE_DOT_DOT_VERSION > 0)
3796             while (s < s_end) {
3797                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3798                     OP(scan) = EXACTFA_NO_TRIE;
3799                     *unfolded_multi_char = TRUE;
3800                     break;
3801                 }
3802                 s++;
3803             }
3804         }
3805         else {
3806
3807             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3808              * folds that are all Latin1.  As explained in the comments
3809              * preceding this function, we look also for the sharp s in EXACTF
3810              * and EXACTFL nodes; it can be in the final position.  Otherwise
3811              * we can stop looking 1 byte earlier because have to find at least
3812              * two characters for a multi-fold */
3813             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3814                               ? s_end
3815                               : s_end -1;
3816
3817             while (s < upper) {
3818                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3819                 if (! len) {    /* Not a multi-char fold. */
3820                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3821                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3822                     {
3823                         *unfolded_multi_char = TRUE;
3824                     }
3825                     s++;
3826                     continue;
3827                 }
3828
3829                 if (len == 2
3830                     && isALPHA_FOLD_EQ(*s, 's')
3831                     && isALPHA_FOLD_EQ(*(s+1), 's'))
3832                 {
3833
3834                     /* EXACTF nodes need to know that the minimum length
3835                      * changed so that a sharp s in the string can match this
3836                      * ss in the pattern, but they remain EXACTF nodes, as they
3837                      * won't match this unless the target string is is UTF-8,
3838                      * which we don't know until runtime.  EXACTFL nodes can't
3839                      * transform into EXACTFU nodes */
3840                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3841                         OP(scan) = EXACTFU_SS;
3842                     }
3843                 }
3844
3845                 *min_subtract += len - 1;
3846                 s += len;
3847             }
3848 #endif
3849         }
3850     }
3851
3852 #ifdef DEBUGGING
3853     /* Allow dumping but overwriting the collection of skipped
3854      * ops and/or strings with fake optimized ops */
3855     n = scan + NODE_SZ_STR(scan);
3856     while (n <= stop) {
3857         OP(n) = OPTIMIZED;
3858         FLAGS(n) = 0;
3859         NEXT_OFF(n) = 0;
3860         n++;
3861     }
3862 #endif
3863     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3864     return stopnow;
3865 }
3866
3867 /* REx optimizer.  Converts nodes into quicker variants "in place".
3868    Finds fixed substrings.  */
3869
3870 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3871    to the position after last scanned or to NULL. */
3872
3873 #define INIT_AND_WITHP \
3874     assert(!and_withp); \
3875     Newx(and_withp,1, regnode_ssc); \
3876     SAVEFREEPV(and_withp)
3877
3878
3879 static void
3880 S_unwind_scan_frames(pTHX_ const void *p)
3881 {
3882     scan_frame *f= (scan_frame *)p;
3883     do {
3884         scan_frame *n= f->next_frame;
3885         Safefree(f);
3886         f= n;
3887     } while (f);
3888 }
3889
3890
3891 STATIC SSize_t
3892 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3893                         SSize_t *minlenp, SSize_t *deltap,
3894                         regnode *last,
3895                         scan_data_t *data,
3896                         I32 stopparen,
3897                         U32 recursed_depth,
3898                         regnode_ssc *and_withp,
3899                         U32 flags, U32 depth)
3900                         /* scanp: Start here (read-write). */
3901                         /* deltap: Write maxlen-minlen here. */
3902                         /* last: Stop before this one. */
3903                         /* data: string data about the pattern */
3904                         /* stopparen: treat close N as END */
3905                         /* recursed: which subroutines have we recursed into */
3906                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3907 {
3908     /* There must be at least this number of characters to match */
3909     SSize_t min = 0;
3910     I32 pars = 0, code;
3911     regnode *scan = *scanp, *next;
3912     SSize_t delta = 0;
3913     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3914     int is_inf_internal = 0;            /* The studied chunk is infinite */
3915     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3916     scan_data_t data_fake;
3917     SV *re_trie_maxbuff = NULL;
3918     regnode *first_non_open = scan;
3919     SSize_t stopmin = SSize_t_MAX;
3920     scan_frame *frame = NULL;
3921     GET_RE_DEBUG_FLAGS_DECL;
3922
3923     PERL_ARGS_ASSERT_STUDY_CHUNK;
3924
3925
3926     if ( depth == 0 ) {
3927         while (first_non_open && OP(first_non_open) == OPEN)
3928             first_non_open=regnext(first_non_open);
3929     }
3930
3931
3932   fake_study_recurse:
3933     DEBUG_r(
3934         RExC_study_chunk_recursed_count++;
3935     );
3936     DEBUG_OPTIMISE_MORE_r(
3937     {
3938         PerlIO_printf(Perl_debug_log,
3939             "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3940             (int)(depth*2), "", (long)stopparen,
3941             (unsigned long)RExC_study_chunk_recursed_count,
3942             (unsigned long)depth, (unsigned long)recursed_depth,
3943             scan,
3944             last);
3945         if (recursed_depth) {
3946             U32 i;
3947             U32 j;
3948             for ( j = 0 ; j < recursed_depth ; j++ ) {
3949                 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3950                     if (
3951                         PAREN_TEST(RExC_study_chunk_recursed +
3952                                    ( j * RExC_study_chunk_recursed_bytes), i )
3953                         && (
3954                             !j ||
3955                             !PAREN_TEST(RExC_study_chunk_recursed +
3956                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3957                         )
3958                     ) {
3959                         PerlIO_printf(Perl_debug_log," %d",(int)i);
3960                         break;
3961                     }
3962                 }
3963                 if ( j + 1 < recursed_depth ) {
3964                     PerlIO_printf(Perl_debug_log, ",");
3965                 }
3966             }
3967         }
3968         PerlIO_printf(Perl_debug_log,"\n");
3969     }
3970     );
3971     while ( scan && OP(scan) != END && scan < last ){
3972         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3973                                    node length to get a real minimum (because
3974                                    the folded version may be shorter) */
3975         bool unfolded_multi_char = FALSE;
3976         /* Peephole optimizer: */
3977         DEBUG_STUDYDATA("Peep:", data, depth);
3978         DEBUG_PEEP("Peep", scan, depth);
3979
3980
3981         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3982          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3983          * by a different invocation of reg() -- Yves
3984          */
3985         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3986
3987         /* Follow the next-chain of the current node and optimize
3988            away all the NOTHINGs from it.  */
3989         if (OP(scan) != CURLYX) {
3990             const int max = (reg_off_by_arg[OP(scan)]
3991                        ? I32_MAX
3992                        /* I32 may be smaller than U16 on CRAYs! */
3993                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3994             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3995             int noff;
3996             regnode *n = scan;
3997
3998             /* Skip NOTHING and LONGJMP. */
3999             while ((n = regnext(n))
4000                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4001                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4002                    && off + noff < max)
4003                 off += noff;
4004             if (reg_off_by_arg[OP(scan)])
4005                 ARG(scan) = off;
4006             else
4007                 NEXT_OFF(scan) = off;
4008         }
4009
4010         /* The principal pseudo-switch.  Cannot be a switch, since we
4011            look into several different things.  */
4012         if ( OP(scan) == DEFINEP ) {
4013             SSize_t minlen = 0;
4014             SSize_t deltanext = 0;
4015             SSize_t fake_last_close = 0;
4016             I32 f = SCF_IN_DEFINE;
4017
4018             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4019             scan = regnext(scan);
4020             assert( OP(scan) == IFTHEN );
4021             DEBUG_PEEP("expect IFTHEN", scan, depth);
4022
4023             data_fake.last_closep= &fake_last_close;
4024             minlen = *minlenp;
4025             next = regnext(scan);
4026             scan = NEXTOPER(NEXTOPER(scan));
4027             DEBUG_PEEP("scan", scan, depth);
4028             DEBUG_PEEP("next", next, depth);
4029
4030             /* we suppose the run is continuous, last=next...
4031              * NOTE we dont use the return here! */
4032             (void)study_chunk(pRExC_state, &scan, &minlen,
4033                               &deltanext, next, &data_fake, stopparen,
4034                               recursed_depth, NULL, f, depth+1);
4035
4036             scan = next;
4037         } else
4038         if (
4039             OP(scan) == BRANCH  ||
4040             OP(scan) == BRANCHJ ||
4041             OP(scan) == IFTHEN
4042         ) {
4043             next = regnext(scan);
4044             code = OP(scan);
4045
4046             /* The op(next)==code check below is to see if we
4047              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4048              * IFTHEN is special as it might not appear in pairs.
4049              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4050              * we dont handle it cleanly. */
4051             if (OP(next) == code || code == IFTHEN) {
4052                 /* NOTE - There is similar code to this block below for
4053                  * handling TRIE nodes on a re-study.  If you change stuff here
4054                  * check there too. */
4055                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4056                 regnode_ssc accum;
4057                 regnode * const startbranch=scan;
4058
4059                 if (flags & SCF_DO_SUBSTR) {
4060                     /* Cannot merge strings after this. */
4061                     scan_commit(pRExC_state, data, minlenp, is_inf);
4062                 }
4063
4064                 if (flags & SCF_DO_STCLASS)
4065                     ssc_init_zero(pRExC_state, &accum);
4066
4067                 while (OP(scan) == code) {
4068                     SSize_t deltanext, minnext, fake;
4069                     I32 f = 0;
4070                     regnode_ssc this_class;
4071
4072                     DEBUG_PEEP("Branch", scan, depth);
4073
4074                     num++;
4075                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4076                     if (data) {
4077                         data_fake.whilem_c = data->whilem_c;
4078                         data_fake.last_closep = data->last_closep;
4079                     }
4080                     else
4081                         data_fake.last_closep = &fake;
4082
4083                     data_fake.pos_delta = delta;
4084                     next = regnext(scan);
4085
4086                     scan = NEXTOPER(scan); /* everything */
4087                     if (code != BRANCH)    /* everything but BRANCH */
4088                         scan = NEXTOPER(scan);
4089
4090                     if (flags & SCF_DO_STCLASS) {
4091                         ssc_init(pRExC_state, &this_class);
4092                         data_fake.start_class = &this_class;
4093                         f = SCF_DO_STCLASS_AND;
4094                     }
4095                     if (flags & SCF_WHILEM_VISITED_POS)
4096                         f |= SCF_WHILEM_VISITED_POS;
4097
4098                     /* we suppose the run is continuous, last=next...*/
4099                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4100                                       &deltanext, next, &data_fake, stopparen,
4101                                       recursed_depth, NULL, f,depth+1);
4102
4103                     if (min1 > minnext)
4104                         min1 = minnext;
4105                     if (deltanext == SSize_t_MAX) {
4106                         is_inf = is_inf_internal = 1;
4107                         max1 = SSize_t_MAX;
4108                     } else if (max1 < minnext + deltanext)
4109                         max1 = minnext + deltanext;
4110                     scan = next;
4111                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4112                         pars++;
4113                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4114                         if ( stopmin > minnext)
4115                             stopmin = min + min1;
4116                         flags &= ~SCF_DO_SUBSTR;
4117                         if (data)
4118                             data->flags |= SCF_SEEN_ACCEPT;
4119                     }
4120                     if (data) {
4121                         if (data_fake.flags & SF_HAS_EVAL)
4122                             data->flags |= SF_HAS_EVAL;
4123                         data->whilem_c = data_fake.whilem_c;
4124                     }
4125                     if (flags & SCF_DO_STCLASS)
4126                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4127                 }
4128                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4129                     min1 = 0;
4130                 if (flags & SCF_DO_SUBSTR) {
4131                     data->pos_min += min1;
4132                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4133                         data->pos_delta = SSize_t_MAX;
4134                     else
4135                         data->pos_delta += max1 - min1;
4136                     if (max1 != min1 || is_inf)
4137                         data->longest = &(data->longest_float);
4138                 }
4139                 min += min1;
4140                 if (delta == SSize_t_MAX
4141                  || SSize_t_MAX - delta - (max1 - min1) < 0)
4142                     delta = SSize_t_MAX;
4143                 else
4144                     delta += max1 - min1;
4145                 if (flags & SCF_DO_STCLASS_OR) {
4146                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4147                     if (min1) {
4148                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4149                         flags &= ~SCF_DO_STCLASS;
4150                     }
4151                 }
4152                 else if (flags & SCF_DO_STCLASS_AND) {
4153                     if (min1) {
4154                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4155                         flags &= ~SCF_DO_STCLASS;
4156                     }
4157                     else {
4158                         /* Switch to OR mode: cache the old value of
4159                          * data->start_class */
4160                         INIT_AND_WITHP;
4161                         StructCopy(data->start_class, and_withp, regnode_ssc);
4162                         flags &= ~SCF_DO_STCLASS_AND;
4163                         StructCopy(&accum, data->start_class, regnode_ssc);
4164                         flags |= SCF_DO_STCLASS_OR;
4165                     }
4166                 }
4167
4168                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4169                         OP( startbranch ) == BRANCH )
4170                 {
4171                 /* demq.
4172
4173                    Assuming this was/is a branch we are dealing with: 'scan'
4174                    now points at the item that follows the branch sequence,
4175                    whatever it is. We now start at the beginning of the
4176                    sequence and look for subsequences of
4177
4178                    BRANCH->EXACT=>x1
4179                    BRANCH->EXACT=>x2
4180                    tail
4181
4182                    which would be constructed from a pattern like
4183                    /A|LIST|OF|WORDS/
4184
4185                    If we can find such a subsequence we need to turn the first
4186                    element into a trie and then add the subsequent branch exact
4187                    strings to the trie.
4188
4189                    We have two cases
4190
4191                      1. patterns where the whole set of branches can be
4192                         converted.
4193
4194                      2. patterns where only a subset can be converted.
4195
4196                    In case 1 we can replace the whole set with a single regop
4197                    for the trie. In case 2 we need to keep the start and end
4198                    branches so
4199
4200                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4201                      becomes BRANCH TRIE; BRANCH X;
4202
4203                   There is an additional case, that being where there is a
4204                   common prefix, which gets split out into an EXACT like node
4205                   preceding the TRIE node.
4206
4207                   If x(1..n)==tail then we can do a simple trie, if not we make
4208                   a "jump" trie, such that when we match the appropriate word
4209                   we "jump" to the appropriate tail node. Essentially we turn
4210                   a nested if into a case structure of sorts.
4211
4212                 */
4213
4214                     int made=0;
4215                     if (!re_trie_maxbuff) {
4216                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4217                         if (!SvIOK(re_trie_maxbuff))
4218                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4219                     }
4220                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4221                         regnode *cur;
4222                         regnode *first = (regnode *)NULL;
4223                         regnode *last = (regnode *)NULL;
4224                         regnode *tail = scan;
4225                         U8 trietype = 0;
4226                         U32 count=0;
4227
4228                         /* var tail is used because there may be a TAIL
4229                            regop in the way. Ie, the exacts will point to the
4230                            thing following the TAIL, but the last branch will
4231                            point at the TAIL. So we advance tail. If we
4232                            have nested (?:) we may have to move through several
4233                            tails.
4234                          */
4235
4236                         while ( OP( tail ) == TAIL ) {
4237                             /* this is the TAIL generated by (?:) */
4238                             tail = regnext( tail );
4239                         }
4240
4241
4242                         DEBUG_TRIE_COMPILE_r({
4243                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4244                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4245                               (int)depth * 2 + 2, "",
4246                               "Looking for TRIE'able sequences. Tail node is: ",
4247                               SvPV_nolen_const( RExC_mysv )
4248                             );
4249                         });
4250
4251                         /*
4252
4253                             Step through the branches
4254                                 cur represents each branch,
4255                                 noper is the first thing to be matched as part
4256                                       of that branch
4257                                 noper_next is the regnext() of that node.
4258
4259                             We normally handle a case like this
4260                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4261                             support building with NOJUMPTRIE, which restricts
4262                             the trie logic to structures like /FOO|BAR/.
4263
4264                             If noper is a trieable nodetype then the branch is
4265                             a possible optimization target. If we are building
4266                             under NOJUMPTRIE then we require that noper_next is
4267                             the same as scan (our current position in the regex
4268                             program).
4269
4270                             Once we have two or more consecutive such branches
4271                             we can create a trie of the EXACT's contents and
4272                             stitch it in place into the program.
4273
4274                             If the sequence represents all of the branches in
4275                             the alternation we replace the entire thing with a
4276                             single TRIE node.
4277
4278                             Otherwise when it is a subsequence we need to
4279                             stitch it in place and replace only the relevant
4280                             branches. This means the first branch has to remain
4281                             as it is used by the alternation logic, and its
4282                             next pointer, and needs to be repointed at the item
4283                             on the branch chain following the last branch we
4284                             have optimized away.
4285
4286                             This could be either a BRANCH, in which case the
4287                             subsequence is internal, or it could be the item
4288                             following the branch sequence in which case the
4289                             subsequence is at the end (which does not
4290                             necessarily mean the first node is the start of the
4291                             alternation).
4292
4293                             TRIE_TYPE(X) is a define which maps the optype to a
4294                             trietype.
4295
4296                                 optype          |  trietype
4297                                 ----------------+-----------
4298                                 NOTHING         | NOTHING
4299                                 EXACT           | EXACT
4300                                 EXACTFU         | EXACTFU
4301                                 EXACTFU_SS      | EXACTFU
4302                                 EXACTFA         | EXACTFA
4303                                 EXACTL          | EXACTL
4304                                 EXACTFLU8       | EXACTFLU8
4305
4306
4307                         */
4308 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4309                        ? NOTHING                                            \
4310                        : ( EXACT == (X) )                                   \
4311                          ? EXACT                                            \
4312                          : ( EXACTFU == (X) || EXACTFU_SS == (X) )          \
4313                            ? EXACTFU                                        \
4314                            : ( EXACTFA == (X) )                             \
4315                              ? EXACTFA                                      \
4316                              : ( EXACTL == (X) )                            \
4317                                ? EXACTL                                     \
4318                                : ( EXACTFLU8 == (X) )                        \
4319                                  ? EXACTFLU8                                 \
4320                                  : 0 )
4321
4322                         /* dont use tail as the end marker for this traverse */
4323                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4324                             regnode * const noper = NEXTOPER( cur );
4325                             U8 noper_type = OP( noper );
4326                             U8 noper_trietype = TRIE_TYPE( noper_type );
4327 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4328                             regnode * const noper_next = regnext( noper );
4329                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4330                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4331 #endif
4332
4333                             DEBUG_TRIE_COMPILE_r({
4334                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4335                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4336                                    (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4337
4338                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4339                                 PerlIO_printf( Perl_debug_log, " -> %s",
4340                                     SvPV_nolen_const(RExC_mysv));
4341
4342                                 if ( noper_next ) {
4343                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4344                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4345                                     SvPV_nolen_const(RExC_mysv));
4346                                 }
4347                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4348                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4349                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4350                                 );
4351                             });
4352
4353                             /* Is noper a trieable nodetype that can be merged
4354                              * with the current trie (if there is one)? */
4355                             if ( noper_trietype
4356                                   &&
4357                                   (
4358                                         ( noper_trietype == NOTHING)
4359                                         || ( trietype == NOTHING )
4360                                         || ( trietype == noper_trietype )
4361                                   )
4362 #ifdef NOJUMPTRIE
4363                                   && noper_next == tail
4364 #endif
4365                                   && count < U16_MAX)
4366                             {
4367                                 /* Handle mergable triable node Either we are
4368                                  * the first node in a new trieable sequence,
4369                                  * in which case we do some bookkeeping,
4370                                  * otherwise we update the end pointer. */
4371                                 if ( !first ) {
4372                                     first = cur;
4373                                     if ( noper_trietype == NOTHING ) {
4374 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4375                                         regnode * const noper_next = regnext( noper );
4376                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4377                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4378 #endif
4379
4380                                         if ( noper_next_trietype ) {
4381                                             trietype = noper_next_trietype;
4382                                         } else if (noper_next_type)  {
4383                                             /* a NOTHING regop is 1 regop wide.
4384                                              * We need at least two for a trie
4385                                              * so we can't merge this in */
4386                                             first = NULL;
4387                                         }
4388                                     } else {
4389                                         trietype = noper_trietype;
4390                                     }
4391                                 } else {
4392                                     if ( trietype == NOTHING )
4393                                         trietype = noper_trietype;
4394                                     last = cur;
4395                                 }
4396                                 if (first)
4397                                     count++;
4398                             } /* end handle mergable triable node */
4399                             else {
4400                                 /* handle unmergable node -
4401                                  * noper may either be a triable node which can
4402                                  * not be tried together with the current trie,
4403                                  * or a non triable node */
4404                                 if ( last ) {
4405                                     /* If last is set and trietype is not
4406                                      * NOTHING then we have found at least two
4407                                      * triable branch sequences in a row of a
4408                                      * similar trietype so we can turn them
4409                                      * into a trie. If/when we allow NOTHING to
4410                                      * start a trie sequence this condition
4411                                      * will be required, and it isn't expensive
4412                                      * so we leave it in for now. */
4413                                     if ( trietype && trietype != NOTHING )
4414                                         make_trie( pRExC_state,
4415                                                 startbranch, first, cur, tail,
4416                                                 count, trietype, depth+1 );
4417                                     last = NULL; /* note: we clear/update
4418                                                     first, trietype etc below,
4419                                                     so we dont do it here */
4420                                 }
4421                                 if ( noper_trietype
4422 #ifdef NOJUMPTRIE
4423                                      && noper_next == tail
4424 #endif
4425                                 ){
4426                                     /* noper is triable, so we can start a new
4427                                      * trie sequence */
4428                                     count = 1;
4429                                     first = cur;
4430                                     trietype = noper_trietype;
4431                                 } else if (first) {
4432                                     /* if we already saw a first but the
4433                                      * current node is not triable then we have
4434                                      * to reset the first information. */
4435                                     count = 0;
4436                                     first = NULL;
4437                                     trietype = 0;
4438                                 }
4439                             } /* end handle unmergable node */
4440                         } /* loop over branches */
4441                         DEBUG_TRIE_COMPILE_r({
4442                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4443                             PerlIO_printf( Perl_debug_log,
4444                               "%*s- %s (%d) <SCAN FINISHED>\n",
4445                               (int)depth * 2 + 2,
4446                               "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4447
4448                         });
4449                         if ( last && trietype ) {
4450                             if ( trietype != NOTHING ) {
4451                                 /* the last branch of the sequence was part of
4452                                  * a trie, so we have to construct it here
4453                                  * outside of the loop */
4454                                 made= make_trie( pRExC_state, startbranch,
4455                                                  first, scan, tail, count,
4456                                                  trietype, depth+1 );
4457 #ifdef TRIE_STUDY_OPT
4458                                 if ( ((made == MADE_EXACT_TRIE &&
4459                                      startbranch == first)
4460                                      || ( first_non_open == first )) &&
4461                                      depth==0 ) {
4462                                     flags |= SCF_TRIE_RESTUDY;
4463                                     if ( startbranch == first
4464                                          && scan == tail )
4465                                     {
4466                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4467                                     }
4468                                 }
4469 #endif
4470                             } else {
4471                                 /* at this point we know whatever we have is a
4472                                  * NOTHING sequence/branch AND if 'startbranch'
4473                                  * is 'first' then we can turn the whole thing
4474                                  * into a NOTHING
4475                                  */
4476                                 if ( startbranch == first ) {
4477                                     regnode *opt;
4478                                     /* the entire thing is a NOTHING sequence,
4479                                      * something like this: (?:|) So we can
4480                                      * turn it into a plain NOTHING op. */
4481                                     DEBUG_TRIE_COMPILE_r({
4482                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4483                                         PerlIO_printf( Perl_debug_log,
4484                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4485                                           "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4486
4487                                     });
4488                                     OP(startbranch)= NOTHING;
4489                                     NEXT_OFF(startbranch)= tail - startbranch;
4490                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4491                                         OP(opt)= OPTIMIZED;
4492                                 }
4493                             }
4494                         } /* end if ( last) */
4495                     } /* TRIE_MAXBUF is non zero */
4496
4497                 } /* do trie */
4498
4499             }
4500             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4501                 scan = NEXTOPER(NEXTOPER(scan));
4502             } else                      /* single branch is optimized. */
4503                 scan = NEXTOPER(scan);
4504             continue;
4505         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4506             I32 paren = 0;
4507             regnode *start = NULL;
4508             regnode *end = NULL;
4509             U32 my_recursed_depth= recursed_depth;
4510
4511
4512             if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4513                 /* Do setup, note this code has side effects beyond
4514                  * the rest of this block. Specifically setting
4515                  * RExC_recurse[] must happen at least once during
4516                  * study_chunk(). */
4517                 if (OP(scan) == GOSUB) {
4518                     paren = ARG(scan);
4519                     RExC_recurse[ARG2L(scan)] = scan;
4520                     start = RExC_open_parens[paren-1];
4521                     end   = RExC_close_parens[paren-1];
4522                 } else {
4523                     start = RExC_rxi->program + 1;
4524                     end   = RExC_opend;
4525                 }
4526                 /* NOTE we MUST always execute the above code, even
4527                  * if we do nothing with a GOSUB/GOSTART */
4528                 if (
4529                     ( flags & SCF_IN_DEFINE )
4530                     ||
4531                     (
4532                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4533                         &&
4534                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4535                     )
4536                 ) {
4537                     /* no need to do anything here if we are in a define. */
4538                     /* or we are after some kind of infinite construct
4539                      * so we can skip recursing into this item.
4540                      * Since it is infinite we will not change the maxlen
4541                      * or delta, and if we miss something that might raise
4542                      * the minlen it will merely pessimise a little.
4543                      *
4544                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4545                      * might result in a minlen of 1 and not of 4,
4546                      * but this doesn't make us mismatch, just try a bit
4547                      * harder than we should.
4548                      * */
4549                     scan= regnext(scan);
4550                     continue;
4551                 }
4552
4553                 if (
4554                     !recursed_depth
4555                     ||
4556                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4557                 ) {
4558                     /* it is quite possible that there are more efficient ways
4559                      * to do this. We maintain a bitmap per level of recursion
4560                      * of which patterns we have entered so we can detect if a
4561                      * pattern creates a possible infinite loop. When we
4562                      * recurse down a level we copy the previous levels bitmap
4563                      * down. When we are at recursion level 0 we zero the top
4564                      * level bitmap. It would be nice to implement a different
4565                      * more efficient way of doing this. In particular the top
4566                      * level bitmap may be unnecessary.
4567                      */
4568                     if (!recursed_depth) {
4569                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4570                     } else {
4571                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4572                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4573                              RExC_study_chunk_recursed_bytes, U8);
4574                     }
4575                     /* we havent recursed into this paren yet, so recurse into it */
4576                     DEBUG_STUDYDATA("set:", data,depth);
4577                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4578                     my_recursed_depth= recursed_depth + 1;
4579                 } else {
4580                     DEBUG_STUDYDATA("inf:", data,depth);
4581                     /* some form of infinite recursion, assume infinite length
4582                      * */
4583                     if (flags & SCF_DO_SUBSTR) {
4584                         scan_commit(pRExC_state, data, minlenp, is_inf);
4585                         data->longest = &(data->longest_float);
4586                     }
4587                     is_inf = is_inf_internal = 1;
4588                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4589                         ssc_anything(data->start_class);
4590                     flags &= ~SCF_DO_STCLASS;
4591
4592                     start= NULL; /* reset start so we dont recurse later on. */
4593                 }
4594             } else {
4595                 paren = stopparen;
4596                 start = scan + 2;
4597                 end = regnext(scan);
4598             }
4599             if (start) {
4600                 scan_frame *newframe;
4601                 assert(end);
4602                 if (!RExC_frame_last) {
4603                     Newxz(newframe, 1, scan_frame);
4604                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4605                     RExC_frame_head= newframe;
4606                     RExC_frame_count++;
4607                 } else if (!RExC_frame_last->next_frame) {
4608                     Newxz(newframe,1,scan_frame);
4609                     RExC_frame_last->next_frame= newframe;
4610                     newframe->prev_frame= RExC_frame_last;
4611                     RExC_frame_count++;
4612                 } else {
4613                     newframe= RExC_frame_last->next_frame;
4614                 }
4615                 RExC_frame_last= newframe;
4616
4617                 newframe->next_regnode = regnext(scan);
4618                 newframe->last_regnode = last;
4619                 newframe->stopparen = stopparen;
4620                 newframe->prev_recursed_depth = recursed_depth;
4621                 newframe->this_prev_frame= frame;
4622
4623                 DEBUG_STUDYDATA("frame-new:",data,depth);
4624                 DEBUG_PEEP("fnew", scan, depth);
4625
4626                 frame = newframe;
4627                 scan =  start;
4628                 stopparen = paren;
4629                 last = end;
4630                 depth = depth + 1;
4631                 recursed_depth= my_recursed_depth;
4632
4633                 continue;
4634             }
4635         }
4636         else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4637             SSize_t l = STR_LEN(scan);
4638             UV uc;
4639             if (UTF) {
4640                 const U8 * const s = (U8*)STRING(scan);
4641                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4642                 l = utf8_length(s, s + l);
4643             } else {
4644                 uc = *((U8*)STRING(scan));
4645             }
4646             min += l;
4647             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4648                 /* The code below prefers earlier match for fixed
4649                    offset, later match for variable offset.  */
4650                 if (data->last_end == -1) { /* Update the start info. */
4651                     data->last_start_min = data->pos_min;
4652                     data->last_start_max = is_inf
4653                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4654                 }
4655                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4656                 if (UTF)
4657                     SvUTF8_on(data->last_found);
4658                 {
4659                     SV * const sv = data->last_found;
4660                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4661                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4662                     if (mg && mg->mg_len >= 0)
4663                         mg->mg_len += utf8_length((U8*)STRING(scan),
4664                                               (U8*)STRING(scan)+STR_LEN(scan));
4665                 }
4666                 data->last_end = data->pos_min + l;
4667                 data->pos_min += l; /* As in the first entry. */
4668                 data->flags &= ~SF_BEFORE_EOL;
4669             }
4670
4671             /* ANDing the code point leaves at most it, and not in locale, and
4672              * can't match null string */
4673             if (flags & SCF_DO_STCLASS_AND) {
4674                 ssc_cp_and(data->start_class, uc);
4675                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4676                 ssc_clear_locale(data->start_class);
4677             }
4678             else if (flags & SCF_DO_STCLASS_OR) {
4679                 ssc_add_cp(data->start_class, uc);
4680                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4681
4682                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4683                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4684             }
4685             flags &= ~SCF_DO_STCLASS;
4686         }
4687         else if (PL_regkind[OP(scan)] == EXACT) {
4688             /* But OP != EXACT!, so is EXACTFish */
4689             SSize_t l = STR_LEN(scan);
4690             const U8 * s = (U8*)STRING(scan);
4691
4692             /* Search for fixed substrings supports EXACT only. */
4693             if (flags & SCF_DO_SUBSTR) {
4694                 assert(data);
4695                 scan_commit(pRExC_state, data, minlenp, is_inf);
4696             }
4697             if (UTF) {
4698                 l = utf8_length(s, s + l);
4699             }
4700             if (unfolded_multi_char) {
4701                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4702             }
4703             min += l - min_subtract;
4704             assert (min >= 0);
4705             delta += min_subtract;
4706             if (flags & SCF_DO_SUBSTR) {
4707                 data->pos_min += l - min_subtract;
4708                 if (data->pos_min < 0) {
4709                     data->pos_min = 0;
4710                 }
4711                 data->pos_delta += min_subtract;
4712                 if (min_subtract) {
4713                     data->longest = &(data->longest_float);
4714                 }
4715             }
4716
4717             if (flags & SCF_DO_STCLASS) {
4718                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4719
4720                 assert(EXACTF_invlist);
4721                 if (flags & SCF_DO_STCLASS_AND) {
4722                     if (OP(scan) != EXACTFL)
4723                         ssc_clear_locale(data->start_class);
4724                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4725                     ANYOF_POSIXL_ZERO(data->start_class);
4726                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4727                 }
4728                 else {  /* SCF_DO_STCLASS_OR */
4729                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4730                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4731
4732                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4733                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4734                 }
4735                 flags &= ~SCF_DO_STCLASS;
4736                 SvREFCNT_dec(EXACTF_invlist);
4737             }
4738         }
4739         else if (REGNODE_VARIES(OP(scan))) {
4740             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4741             I32 fl = 0, f = flags;
4742             regnode * const oscan = scan;
4743             regnode_ssc this_class;
4744             regnode_ssc *oclass = NULL;
4745             I32 next_is_eval = 0;
4746
4747             switch (PL_regkind[OP(scan)]) {
4748             case WHILEM:                /* End of (?:...)* . */
4749                 scan = NEXTOPER(scan);
4750                 goto finish;
4751             case PLUS:
4752                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4753                     next = NEXTOPER(scan);
4754                     if (OP(next) == EXACT
4755                         || OP(next) == EXACTL
4756                         || (flags & SCF_DO_STCLASS))
4757                     {
4758                         mincount = 1;
4759                         maxcount = REG_INFTY;
4760                         next = regnext(scan);
4761                         scan = NEXTOPER(scan);
4762                         goto do_curly;
4763                     }
4764                 }
4765                 if (flags & SCF_DO_SUBSTR)
4766                     data->pos_min++;
4767                 min++;
4768                 /* FALLTHROUGH */
4769             case STAR:
4770                 if (flags & SCF_DO_STCLASS) {
4771                     mincount = 0;
4772                     maxcount = REG_INFTY;
4773                     next = regnext(scan);
4774                     scan = NEXTOPER(scan);
4775                     goto do_curly;
4776                 }
4777                 if (flags & SCF_DO_SUBSTR) {
4778                     scan_commit(pRExC_state, data, minlenp, is_inf);
4779                     /* Cannot extend fixed substrings */
4780                     data->longest = &(data->longest_float);
4781                 }
4782                 is_inf = is_inf_internal = 1;
4783                 scan = regnext(scan);
4784                 goto optimize_curly_tail;
4785             case CURLY:
4786                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4787                     && (scan->flags == stopparen))
4788                 {
4789                     mincount = 1;
4790                     maxcount = 1;
4791                 } else {
4792                     mincount = ARG1(scan);
4793                     maxcount = ARG2(scan);
4794                 }
4795                 next = regnext(scan);
4796                 if (OP(scan) == CURLYX) {
4797                     I32 lp = (data ? *(data->last_closep) : 0);
4798                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4799                 }
4800                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4801                 next_is_eval = (OP(scan) == EVAL);
4802               do_curly:
4803                 if (flags & SCF_DO_SUBSTR) {
4804                     if (mincount == 0)
4805                         scan_commit(pRExC_state, data, minlenp, is_inf);
4806                     /* Cannot extend fixed substrings */
4807                     pos_before = data->pos_min;
4808                 }
4809                 if (data) {
4810                     fl = data->flags;
4811                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4812                     if (is_inf)
4813                         data->flags |= SF_IS_INF;
4814                 }
4815                 if (flags & SCF_DO_STCLASS) {
4816                     ssc_init(pRExC_state, &this_class);
4817                     oclass = data->start_class;
4818                     data->start_class = &this_class;
4819                     f |= SCF_DO_STCLASS_AND;
4820                     f &= ~SCF_DO_STCLASS_OR;
4821                 }
4822                 /* Exclude from super-linear cache processing any {n,m}
4823                    regops for which the combination of input pos and regex
4824                    pos is not enough information to determine if a match
4825                    will be possible.
4826
4827                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4828                    regex pos at the \s*, the prospects for a match depend not
4829                    only on the input position but also on how many (bar\s*)
4830                    repeats into the {4,8} we are. */
4831                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4832                     f &= ~SCF_WHILEM_VISITED_POS;
4833
4834                 /* This will finish on WHILEM, setting scan, or on NULL: */
4835                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4836                                   last, data, stopparen, recursed_depth, NULL,
4837                                   (mincount == 0
4838                                    ? (f & ~SCF_DO_SUBSTR)
4839                                    : f)
4840                                   ,depth+1);
4841
4842                 if (flags & SCF_DO_STCLASS)
4843                     data->start_class = oclass;
4844                 if (mincount == 0 || minnext == 0) {
4845                     if (flags & SCF_DO_STCLASS_OR) {
4846                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4847                     }
4848                     else if (flags & SCF_DO_STCLASS_AND) {
4849                         /* Switch to OR mode: cache the old value of
4850                          * data->start_class */
4851                         INIT_AND_WITHP;
4852                         StructCopy(data->start_class, and_withp, regnode_ssc);
4853                         flags &= ~SCF_DO_STCLASS_AND;
4854                         StructCopy(&this_class, data->start_class, regnode_ssc);
4855                         flags |= SCF_DO_STCLASS_OR;
4856                         ANYOF_FLAGS(data->start_class)
4857                                                 |= SSC_MATCHES_EMPTY_STRING;
4858                     }
4859                 } else {                /* Non-zero len */
4860                     if (flags & SCF_DO_STCLASS_OR) {
4861                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4862                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4863                     }
4864                     else if (flags & SCF_DO_STCLASS_AND)
4865                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4866                     flags &= ~SCF_DO_STCLASS;
4867                 }
4868                 if (!scan)              /* It was not CURLYX, but CURLY. */
4869                     scan = next;
4870                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4871                     /* ? quantifier ok, except for (?{ ... }) */
4872                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4873                     && (minnext == 0) && (deltanext == 0)
4874                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4875                     && maxcount <= REG_INFTY/3) /* Complement check for big
4876                                                    count */
4877                 {
4878                     /* Fatal warnings may leak the regexp without this: */
4879                     SAVEFREESV(RExC_rx_sv);
4880                     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4881                         "Quantifier unexpected on zero-length expression "
4882                         "in regex m/%"UTF8f"/",
4883                          UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
4884                                   RExC_precomp));
4885                     (void)ReREFCNT_inc(RExC_rx_sv);
4886                 }
4887
4888                 min += minnext * mincount;
4889                 is_inf_internal |= deltanext == SSize_t_MAX
4890                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4891                 is_inf |= is_inf_internal;
4892                 if (is_inf) {
4893                     delta = SSize_t_MAX;
4894                 } else {
4895                     delta += (minnext + deltanext) * maxcount
4896                              - minnext * mincount;
4897                 }
4898                 /* Try powerful optimization CURLYX => CURLYN. */
4899                 if (  OP(oscan) == CURLYX && data
4900                       && data->flags & SF_IN_PAR
4901                       && !(data->flags & SF_HAS_EVAL)
4902                       && !deltanext && minnext == 1 ) {
4903                     /* Try to optimize to CURLYN.  */
4904                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4905                     regnode * const nxt1 = nxt;
4906 #ifdef DEBUGGING
4907                     regnode *nxt2;
4908 #endif
4909
4910                     /* Skip open. */
4911                     nxt = regnext(nxt);
4912                     if (!REGNODE_SIMPLE(OP(nxt))
4913                         && !(PL_regkind[OP(nxt)] == EXACT
4914                              && STR_LEN(nxt) == 1))
4915                         goto nogo;
4916 #ifdef DEBUGGING
4917                     nxt2 = nxt;
4918 #endif
4919                     nxt = regnext(nxt);
4920                     if (OP(nxt) != CLOSE)
4921                         goto nogo;
4922                     if (RExC_open_parens) {
4923                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4924                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4925                     }
4926                     /* Now we know that nxt2 is the only contents: */
4927                     oscan->flags = (U8)ARG(nxt);
4928                     OP(oscan) = CURLYN;
4929                     OP(nxt1) = NOTHING; /* was OPEN. */
4930
4931 #ifdef DEBUGGING
4932                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4933                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4934                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4935                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4936                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4937                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4938 #endif
4939                 }
4940               nogo:
4941
4942                 /* Try optimization CURLYX => CURLYM. */
4943                 if (  OP(oscan) == CURLYX && data
4944                       && !(data->flags & SF_HAS_PAR)
4945                       && !(data->flags & SF_HAS_EVAL)
4946                       && !deltanext     /* atom is fixed width */
4947                       && minnext != 0   /* CURLYM can't handle zero width */
4948
4949                          /* Nor characters whose fold at run-time may be
4950                           * multi-character */
4951                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4952                 ) {
4953                     /* XXXX How to optimize if data == 0? */
4954                     /* Optimize to a simpler form.  */
4955                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4956                     regnode *nxt2;
4957
4958                     OP(oscan) = CURLYM;
4959                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4960                             && (OP(nxt2) != WHILEM))
4961                         nxt = nxt2;
4962                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4963                     /* Need to optimize away parenths. */
4964                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4965                         /* Set the parenth number.  */
4966                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4967
4968                         oscan->flags = (U8)ARG(nxt);
4969                         if (RExC_open_parens) {
4970                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4971                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4972                         }
4973                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4974                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4975
4976 #ifdef DEBUGGING
4977                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4978                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4979                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4980                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4981 #endif
4982 #if 0
4983                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4984                             regnode *nnxt = regnext(nxt1);
4985                             if (nnxt == nxt) {
4986                                 if (reg_off_by_arg[OP(nxt1)])
4987                                     ARG_SET(nxt1, nxt2 - nxt1);
4988                                 else if (nxt2 - nxt1 < U16_MAX)
4989                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4990                                 else
4991                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4992                             }
4993                             nxt1 = nnxt;
4994                         }
4995 #endif
4996                         /* Optimize again: */
4997                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4998                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4999                     }
5000                     else
5001                         oscan->flags = 0;
5002                 }
5003                 else if ((OP(oscan) == CURLYX)
5004                          && (flags & SCF_WHILEM_VISITED_POS)
5005                          /* See the comment on a similar expression above.
5006                             However, this time it's not a subexpression
5007                             we care about, but the expression itself. */
5008                          && (maxcount == REG_INFTY)
5009                          && data && ++data->whilem_c < 16) {
5010                     /* This stays as CURLYX, we can put the count/of pair. */
5011                     /* Find WHILEM (as in regexec.c) */
5012                     regnode *nxt = oscan + NEXT_OFF(oscan);
5013
5014                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5015                         nxt += ARG(nxt);
5016                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
5017                         | (RExC_whilem_seen << 4)); /* On WHILEM */
5018                 }
5019                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5020                     pars++;
5021                 if (flags & SCF_DO_SUBSTR) {
5022                     SV *last_str = NULL;
5023                     STRLEN last_chrs = 0;
5024                     int counted = mincount != 0;
5025
5026                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5027                                                                   string. */
5028                         SSize_t b = pos_before >= data->last_start_min
5029                             ? pos_before : data->last_start_min;
5030                         STRLEN l;
5031                         const char * const s = SvPV_const(data->last_found, l);
5032                         SSize_t old = b - data->last_start_min;
5033
5034                         if (UTF)
5035                             old = utf8_hop((U8*)s, old) - (U8*)s;
5036                         l -= old;
5037                         /* Get the added string: */
5038                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5039                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5040                                             (U8*)(s + old + l)) : l;
5041                         if (deltanext == 0 && pos_before == b) {
5042                             /* What was added is a constant string */
5043                             if (mincount > 1) {
5044
5045                                 SvGROW(last_str, (mincount * l) + 1);
5046                                 repeatcpy(SvPVX(last_str) + l,
5047                                           SvPVX_const(last_str), l,
5048                                           mincount - 1);
5049                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5050                                 /* Add additional parts. */
5051                                 SvCUR_set(data->last_found,
5052                                           SvCUR(data->last_found) - l);
5053                                 sv_catsv(data->last_found, last_str);
5054                                 {
5055                                     SV * sv = data->last_found;
5056                                     MAGIC *mg =
5057                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5058                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5059                                     if (mg && mg->mg_len >= 0)
5060                                         mg->mg_len += last_chrs * (mincount-1);
5061                                 }
5062                                 last_chrs *= mincount;
5063                                 data->last_end += l * (mincount - 1);
5064                             }
5065                         } else {
5066                             /* start offset must point into the last copy */
5067                             data->last_start_min += minnext * (mincount - 1);
5068                             data->last_start_max =
5069                               is_inf
5070                                ? SSize_t_MAX
5071                                : data->last_start_max +
5072                                  (maxcount - 1) * (minnext + data->pos_delta);
5073                         }
5074                     }
5075                     /* It is counted once already... */
5076                     data->pos_min += minnext * (mincount - counted);
5077 #if 0
5078 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
5079                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5080                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
5081     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5082     (UV)mincount);
5083 if (deltanext != SSize_t_MAX)
5084 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
5085     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5086           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5087 #endif
5088                     if (deltanext == SSize_t_MAX
5089                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5090                         data->pos_delta = SSize_t_MAX;
5091                     else
5092                         data->pos_delta += - counted * deltanext +
5093                         (minnext + deltanext) * maxcount - minnext * mincount;
5094                     if (mincount != maxcount) {
5095                          /* Cannot extend fixed substrings found inside
5096                             the group.  */
5097                         scan_commit(pRExC_state, data, minlenp, is_inf);
5098                         if (mincount && last_str) {
5099                             SV * const sv = data->last_found;
5100                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5101                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5102
5103                             if (mg)
5104                                 mg->mg_len = -1;
5105                             sv_setsv(sv, last_str);
5106                             data->last_end = data->pos_min;
5107                             data->last_start_min = data->pos_min - last_chrs;
5108                             data->last_start_max = is_inf
5109                                 ? SSize_t_MAX
5110                                 : data->pos_min + data->pos_delta - last_chrs;
5111                         }
5112                         data->longest = &(data->longest_float);
5113                     }
5114                     SvREFCNT_dec(last_str);
5115                 }
5116                 if (data && (fl & SF_HAS_EVAL))
5117                     data->flags |= SF_HAS_EVAL;
5118               optimize_curly_tail:
5119                 if (OP(oscan) != CURLYX) {
5120                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5121                            && NEXT_OFF(next))
5122                         NEXT_OFF(oscan) += NEXT_OFF(next);
5123                 }
5124                 continue;
5125
5126             default:
5127 #ifdef DEBUGGING
5128                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5129                                                                     OP(scan));
5130 #endif
5131             case REF:
5132             case CLUMP:
5133                 if (flags & SCF_DO_SUBSTR) {
5134                     /* Cannot expect anything... */
5135                     scan_commit(pRExC_state, data, minlenp, is_inf);
5136                     data->longest = &(data->longest_float);
5137                 }
5138                 is_inf = is_inf_internal = 1;
5139                 if (flags & SCF_DO_STCLASS_OR) {
5140                     if (OP(scan) == CLUMP) {
5141                         /* Actually is any start char, but very few code points
5142                          * aren't start characters */
5143                         ssc_match_all_cp(data->start_class);
5144                     }
5145                     else {
5146                         ssc_anything(data->start_class);
5147                     }
5148                 }
5149                 flags &= ~SCF_DO_STCLASS;
5150                 break;
5151             }
5152         }
5153         else if (OP(scan) == LNBREAK) {
5154             if (flags & SCF_DO_STCLASS) {
5155                 if (flags & SCF_DO_STCLASS_AND) {
5156                     ssc_intersection(data->start_class,
5157                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5158                     ssc_clear_locale(data->start_class);
5159                     ANYOF_FLAGS(data->start_class)
5160                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5161                 }
5162                 else if (flags & SCF_DO_STCLASS_OR) {
5163                     ssc_union(data->start_class,
5164                               PL_XPosix_ptrs[_CC_VERTSPACE],
5165                               FALSE);
5166                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5167
5168                     /* See commit msg for
5169                      * 749e076fceedeb708a624933726e7989f2302f6a */
5170                     ANYOF_FLAGS(data->start_class)
5171                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5172                 }
5173                 flags &= ~SCF_DO_STCLASS;
5174             }
5175             min++;
5176             if (delta != SSize_t_MAX)
5177                 delta++;    /* Because of the 2 char string cr-lf */
5178             if (flags & SCF_DO_SUBSTR) {
5179                 /* Cannot expect anything... */
5180                 scan_commit(pRExC_state, data, minlenp, is_inf);
5181                 data->pos_min += 1;
5182                 data->pos_delta += 1;
5183                 data->longest = &(data->longest_float);
5184             }
5185         }
5186         else if (REGNODE_SIMPLE(OP(scan))) {
5187
5188             if (flags & SCF_DO_SUBSTR) {
5189                 scan_commit(pRExC_state, data, minlenp, is_inf);
5190                 data->pos_min++;
5191             }
5192             min++;
5193             if (flags & SCF_DO_STCLASS) {
5194                 bool invert = 0;
5195                 SV* my_invlist = NULL;
5196                 U8 namedclass;
5197
5198                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5199                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5200
5201                 /* Some of the logic below assumes that switching
5202                    locale on will only add false positives. */
5203                 switch (OP(scan)) {
5204
5205                 default:
5206 #ifdef DEBUGGING
5207                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5208                                                                      OP(scan));
5209 #endif
5210                 case SANY:
5211                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5212                         ssc_match_all_cp(data->start_class);
5213                     break;
5214
5215                 case REG_ANY:
5216                     {
5217                         SV* REG_ANY_invlist = _new_invlist(2);
5218                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5219                                                             '\n');
5220                         if (flags & SCF_DO_STCLASS_OR) {
5221                             ssc_union(data->start_class,
5222                                       REG_ANY_invlist,
5223                                       TRUE /* TRUE => invert, hence all but \n
5224                                             */
5225                                       );
5226                         }
5227                         else if (flags & SCF_DO_STCLASS_AND) {
5228                             ssc_intersection(data->start_class,
5229                                              REG_ANY_invlist,
5230                                              TRUE  /* TRUE => invert */
5231                                              );
5232                             ssc_clear_locale(data->start_class);
5233                         }
5234                         SvREFCNT_dec_NN(REG_ANY_invlist);
5235                     }
5236                     break;
5237
5238                 case ANYOFD:
5239                 case ANYOFL:
5240                 case ANYOF:
5241                     if (flags & SCF_DO_STCLASS_AND)
5242                         ssc_and(pRExC_state, data->start_class,
5243                                 (regnode_charclass *) scan);
5244                     else
5245                         ssc_or(pRExC_state, data->start_class,
5246                                                           (regnode_charclass *) scan);
5247                     break;
5248
5249                 case NPOSIXL:
5250                     invert = 1;
5251                     /* FALLTHROUGH */
5252
5253                 case POSIXL:
5254                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5255                     if (flags & SCF_DO_STCLASS_AND) {
5256                         bool was_there = cBOOL(
5257                                           ANYOF_POSIXL_TEST(data->start_class,
5258                                                                  namedclass));
5259                         ANYOF_POSIXL_ZERO(data->start_class);
5260                         if (was_there) {    /* Do an AND */
5261                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5262                         }
5263                         /* No individual code points can now match */
5264                         data->start_class->invlist
5265                                                 = sv_2mortal(_new_invlist(0));
5266                     }
5267                     else {
5268                         int complement = namedclass + ((invert) ? -1 : 1);
5269
5270                         assert(flags & SCF_DO_STCLASS_OR);
5271
5272                         /* If the complement of this class was already there,
5273                          * the result is that they match all code points,
5274                          * (\d + \D == everything).  Remove the classes from
5275                          * future consideration.  Locale is not relevant in
5276                          * this case */
5277                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5278                             ssc_match_all_cp(data->start_class);
5279                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5280                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5281                         }
5282                         else {  /* The usual case; just add this class to the
5283                                    existing set */
5284                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5285                         }
5286                     }
5287                     break;
5288
5289                 case NPOSIXA:   /* For these, we always know the exact set of
5290                                    what's matched */
5291                     invert = 1;
5292                     /* FALLTHROUGH */
5293                 case POSIXA:
5294                     if (FLAGS(scan) == _CC_ASCII) {
5295                         my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5296                     }
5297                     else {
5298                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5299                                               PL_XPosix_ptrs[_CC_ASCII],
5300                                               &my_invlist);
5301                     }
5302                     goto join_posix;
5303
5304                 case NPOSIXD:
5305                 case NPOSIXU:
5306                     invert = 1;
5307                     /* FALLTHROUGH */
5308                 case POSIXD:
5309                 case POSIXU:
5310                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5311
5312                     /* NPOSIXD matches all upper Latin1 code points unless the
5313                      * target string being matched is UTF-8, which is
5314                      * unknowable until match time.  Since we are going to
5315                      * invert, we want to get rid of all of them so that the
5316                      * inversion will match all */
5317                     if (OP(scan) == NPOSIXD) {
5318                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5319                                           &my_invlist);
5320                     }
5321
5322                   join_posix:
5323
5324                     if (flags & SCF_DO_STCLASS_AND) {
5325                         ssc_intersection(data->start_class, my_invlist, invert);
5326                         ssc_clear_locale(data->start_class);
5327                     }
5328                     else {
5329                         assert(flags & SCF_DO_STCLASS_OR);
5330                         ssc_union(data->start_class, my_invlist, invert);
5331                     }
5332                     SvREFCNT_dec(my_invlist);
5333                 }
5334                 if (flags & SCF_DO_STCLASS_OR)
5335                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5336                 flags &= ~SCF_DO_STCLASS;
5337             }
5338         }
5339         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5340             data->flags |= (OP(scan) == MEOL
5341                             ? SF_BEFORE_MEOL
5342                             : SF_BEFORE_SEOL);
5343             scan_commit(pRExC_state, data, minlenp, is_inf);
5344
5345         }
5346         else if (  PL_regkind[OP(scan)] == BRANCHJ
5347                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5348                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5349                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5350         {
5351             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5352                 || OP(scan) == UNLESSM )
5353             {
5354                 /* Negative Lookahead/lookbehind
5355                    In this case we can't do fixed string optimisation.
5356                 */
5357
5358                 SSize_t deltanext, minnext, fake = 0;
5359                 regnode *nscan;
5360                 regnode_ssc intrnl;
5361                 int f = 0;
5362
5363                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5364                 if (data) {
5365                     data_fake.whilem_c = data->whilem_c;
5366                     data_fake.last_closep = data->last_closep;
5367                 }
5368                 else
5369                     data_fake.last_closep = &fake;
5370                 data_fake.pos_delta = delta;
5371                 if ( flags & SCF_DO_STCLASS && !scan->flags
5372                      && OP(scan) == IFMATCH ) { /* Lookahead */
5373                     ssc_init(pRExC_state, &intrnl);
5374                     data_fake.start_class = &intrnl;
5375                     f |= SCF_DO_STCLASS_AND;
5376                 }
5377                 if (flags & SCF_WHILEM_VISITED_POS)
5378                     f |= SCF_WHILEM_VISITED_POS;
5379                 next = regnext(scan);
5380                 nscan = NEXTOPER(NEXTOPER(scan));
5381                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5382                                       last, &data_fake, stopparen,
5383                                       recursed_depth, NULL, f, depth+1);
5384                 if (scan->flags) {
5385                     if (deltanext) {
5386                         FAIL("Variable length lookbehind not implemented");
5387                     }
5388                     else if (minnext > (I32)U8_MAX) {
5389                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5390                               (UV)U8_MAX);
5391                     }
5392                     scan->flags = (U8)minnext;
5393                 }
5394                 if (data) {
5395                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5396                         pars++;
5397                     if (data_fake.flags & SF_HAS_EVAL)
5398                         data->flags |= SF_HAS_EVAL;
5399                     data->whilem_c = data_fake.whilem_c;
5400                 }
5401                 if (f & SCF_DO_STCLASS_AND) {
5402                     if (flags & SCF_DO_STCLASS_OR) {
5403                         /* OR before, AND after: ideally we would recurse with
5404                          * data_fake to get the AND applied by study of the
5405                          * remainder of the pattern, and then derecurse;
5406                          * *** HACK *** for now just treat as "no information".
5407                          * See [perl #56690].
5408                          */
5409                         ssc_init(pRExC_state, data->start_class);
5410                     }  else {
5411                         /* AND before and after: combine and continue.  These
5412                          * assertions are zero-length, so can match an EMPTY
5413                          * string */
5414                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5415                         ANYOF_FLAGS(data->start_class)
5416                                                    |= SSC_MATCHES_EMPTY_STRING;
5417                     }
5418                 }
5419             }
5420 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5421             else {
5422                 /* Positive Lookahead/lookbehind
5423                    In this case we can do fixed string optimisation,
5424                    but we must be careful about it. Note in the case of
5425                    lookbehind the positions will be offset by the minimum
5426                    length of the pattern, something we won't know about
5427                    until after the recurse.
5428                 */
5429                 SSize_t deltanext, fake = 0;
5430                 regnode *nscan;
5431                 regnode_ssc intrnl;
5432                 int f = 0;
5433                 /* We use SAVEFREEPV so that when the full compile
5434                     is finished perl will clean up the allocated
5435                     minlens when it's all done. This way we don't
5436                     have to worry about freeing them when we know
5437                     they wont be used, which would be a pain.
5438                  */
5439                 SSize_t *minnextp;
5440                 Newx( minnextp, 1, SSize_t );
5441                 SAVEFREEPV(minnextp);
5442
5443                 if (data) {
5444                     StructCopy(data, &data_fake, scan_data_t);
5445                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5446                         f |= SCF_DO_SUBSTR;
5447                         if (scan->flags)
5448                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5449                         data_fake.last_found=newSVsv(data->last_found);
5450                     }
5451                 }
5452                 else
5453                     data_fake.last_closep = &fake;
5454                 data_fake.flags = 0;
5455                 data_fake.pos_delta = delta;
5456                 if (is_inf)
5457                     data_fake.flags |= SF_IS_INF;
5458                 if ( flags & SCF_DO_STCLASS && !scan->flags
5459                      && OP(scan) == IFMATCH ) { /* Lookahead */
5460                     ssc_init(pRExC_state, &intrnl);
5461                     data_fake.start_class = &intrnl;
5462                     f |= SCF_DO_STCLASS_AND;
5463                 }
5464                 if (flags & SCF_WHILEM_VISITED_POS)
5465                     f |= SCF_WHILEM_VISITED_POS;
5466                 next = regnext(scan);
5467                 nscan = NEXTOPER(NEXTOPER(scan));
5468
5469                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5470                                         &deltanext, last, &data_fake,
5471                                         stopparen, recursed_depth, NULL,
5472                                         f,depth+1);
5473                 if (scan->flags) {
5474                     if (deltanext) {
5475                         FAIL("Variable length lookbehind not implemented");
5476                     }
5477                     else if (*minnextp > (I32)U8_MAX) {
5478                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5479                               (UV)U8_MAX);
5480                     }
5481                     scan->flags = (U8)*minnextp;
5482                 }
5483
5484                 *minnextp += min;
5485
5486                 if (f & SCF_DO_STCLASS_AND) {
5487                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5488                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5489                 }
5490                 if (data) {
5491                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5492                         pars++;
5493                     if (data_fake.flags & SF_HAS_EVAL)
5494                         data->flags |= SF_HAS_EVAL;
5495                     data->whilem_c = data_fake.whilem_c;
5496                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5497                         if (RExC_rx->minlen<*minnextp)
5498                             RExC_rx->minlen=*minnextp;
5499                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5500                         SvREFCNT_dec_NN(data_fake.last_found);
5501
5502                         if ( data_fake.minlen_fixed != minlenp )
5503                         {
5504                             data->offset_fixed= data_fake.offset_fixed;
5505                             data->minlen_fixed= data_fake.minlen_fixed;
5506                             data->lookbehind_fixed+= scan->flags;
5507                         }
5508                         if ( data_fake.minlen_float != minlenp )
5509                         {
5510                             data->minlen_float= data_fake.minlen_float;
5511                             data->offset_float_min=data_fake.offset_float_min;
5512                             data->offset_float_max=data_fake.offset_float_max;
5513                             data->lookbehind_float+= scan->flags;
5514                         }
5515                     }
5516                 }
5517             }
5518 #endif
5519         }
5520         else if (OP(scan) == OPEN) {
5521             if (stopparen != (I32)ARG(scan))
5522                 pars++;
5523         }
5524         else if (OP(scan) == CLOSE) {
5525             if (stopparen == (I32)ARG(scan)) {
5526                 break;
5527             }
5528             if ((I32)ARG(scan) == is_par) {
5529                 next = regnext(scan);
5530
5531                 if ( next && (OP(next) != WHILEM) && next < last)
5532                     is_par = 0;         /* Disable optimization */
5533             }
5534             if (data)
5535                 *(data->last_closep) = ARG(scan);
5536         }
5537         else if (OP(scan) == EVAL) {
5538                 if (data)
5539                     data->flags |= SF_HAS_EVAL;
5540         }
5541         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5542             if (flags & SCF_DO_SUBSTR) {
5543                 scan_commit(pRExC_state, data, minlenp, is_inf);
5544                 flags &= ~SCF_DO_SUBSTR;
5545             }
5546             if (data && OP(scan)==ACCEPT) {
5547                 data->flags |= SCF_SEEN_ACCEPT;
5548                 if (stopmin > min)
5549                     stopmin = min;
5550             }
5551         }
5552         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5553         {
5554                 if (flags & SCF_DO_SUBSTR) {
5555                     scan_commit(pRExC_state, data, minlenp, is_inf);
5556                     data->longest = &(data->longest_float);
5557                 }
5558                 is_inf = is_inf_internal = 1;
5559                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5560                     ssc_anything(data->start_class);
5561                 flags &= ~SCF_DO_STCLASS;
5562         }
5563         else if (OP(scan) == GPOS) {
5564             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5565                 !(delta || is_inf || (data && data->pos_delta)))
5566             {
5567                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5568                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5569                 if (RExC_rx->gofs < (STRLEN)min)
5570                     RExC_rx->gofs = min;
5571             } else {
5572                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5573                 RExC_rx->gofs = 0;
5574             }
5575         }
5576 #ifdef TRIE_STUDY_OPT
5577 #ifdef FULL_TRIE_STUDY
5578         else if (PL_regkind[OP(scan)] == TRIE) {
5579             /* NOTE - There is similar code to this block above for handling
5580                BRANCH nodes on the initial study.  If you change stuff here
5581                check there too. */
5582             regnode *trie_node= scan;
5583             regnode *tail= regnext(scan);
5584             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5585             SSize_t max1 = 0, min1 = SSize_t_MAX;
5586             regnode_ssc accum;
5587
5588             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5589                 /* Cannot merge strings after this. */
5590                 scan_commit(pRExC_state, data, minlenp, is_inf);
5591             }
5592             if (flags & SCF_DO_STCLASS)
5593                 ssc_init_zero(pRExC_state, &accum);
5594
5595             if (!trie->jump) {
5596                 min1= trie->minlen;
5597                 max1= trie->maxlen;
5598             } else {
5599                 const regnode *nextbranch= NULL;
5600                 U32 word;
5601
5602                 for ( word=1 ; word <= trie->wordcount ; word++)
5603                 {
5604                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5605                     regnode_ssc this_class;
5606
5607                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5608                     if (data) {
5609                         data_fake.whilem_c = data->whilem_c;
5610                         data_fake.last_closep = data->last_closep;
5611                     }
5612                     else
5613                         data_fake.last_closep = &fake;
5614                     data_fake.pos_delta = delta;
5615                     if (flags & SCF_DO_STCLASS) {
5616                         ssc_init(pRExC_state, &this_class);
5617                         data_fake.start_class = &this_class;
5618                         f = SCF_DO_STCLASS_AND;
5619                     }
5620                     if (flags & SCF_WHILEM_VISITED_POS)
5621                         f |= SCF_WHILEM_VISITED_POS;
5622
5623                     if (trie->jump[word]) {
5624                         if (!nextbranch)
5625                             nextbranch = trie_node + trie->jump[0];
5626                         scan= trie_node + trie->jump[word];
5627                         /* We go from the jump point to the branch that follows
5628                            it. Note this means we need the vestigal unused
5629                            branches even though they arent otherwise used. */
5630                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5631                             &deltanext, (regnode *)nextbranch, &data_fake,
5632                             stopparen, recursed_depth, NULL, f,depth+1);
5633                     }
5634                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5635                         nextbranch= regnext((regnode*)nextbranch);
5636
5637                     if (min1 > (SSize_t)(minnext + trie->minlen))
5638                         min1 = minnext + trie->minlen;
5639                     if (deltanext == SSize_t_MAX) {
5640                         is_inf = is_inf_internal = 1;
5641                         max1 = SSize_t_MAX;
5642                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5643                         max1 = minnext + deltanext + trie->maxlen;
5644
5645                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5646                         pars++;
5647                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5648                         if ( stopmin > min + min1)
5649                             stopmin = min + min1;
5650                         flags &= ~SCF_DO_SUBSTR;
5651                         if (data)
5652                             data->flags |= SCF_SEEN_ACCEPT;
5653                     }
5654                     if (data) {
5655                         if (data_fake.flags & SF_HAS_EVAL)
5656                             data->flags |= SF_HAS_EVAL;
5657                         data->whilem_c = data_fake.whilem_c;
5658                     }
5659                     if (flags & SCF_DO_STCLASS)
5660                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5661                 }
5662             }
5663             if (flags & SCF_DO_SUBSTR) {
5664                 data->pos_min += min1;
5665                 data->pos_delta += max1 - min1;
5666                 if (max1 != min1 || is_inf)
5667                     data->longest = &(data->longest_float);
5668             }
5669             min += min1;
5670             if (delta != SSize_t_MAX)
5671                 delta += max1 - min1;
5672             if (flags & SCF_DO_STCLASS_OR) {
5673                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5674                 if (min1) {
5675                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5676                     flags &= ~SCF_DO_STCLASS;
5677                 }
5678             }
5679             else if (flags & SCF_DO_STCLASS_AND) {
5680                 if (min1) {
5681                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5682                     flags &= ~SCF_DO_STCLASS;
5683                 }
5684                 else {
5685                     /* Switch to OR mode: cache the old value of
5686                      * data->start_class */
5687                     INIT_AND_WITHP;
5688                     StructCopy(data->start_class, and_withp, regnode_ssc);
5689                     flags &= ~SCF_DO_STCLASS_AND;
5690                     StructCopy(&accum, data->start_class, regnode_ssc);
5691                     flags |= SCF_DO_STCLASS_OR;
5692                 }
5693             }
5694             scan= tail;
5695             continue;
5696         }
5697 #else
5698         else if (PL_regkind[OP(scan)] == TRIE) {
5699             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5700             U8*bang=NULL;
5701
5702             min += trie->minlen;
5703             delta += (trie->maxlen - trie->minlen);
5704             flags &= ~SCF_DO_STCLASS; /* xxx */
5705             if (flags & SCF_DO_SUBSTR) {
5706                 /* Cannot expect anything... */
5707                 scan_commit(pRExC_state, data, minlenp, is_inf);
5708                 data->pos_min += trie->minlen;
5709                 data->pos_delta += (trie->maxlen - trie->minlen);
5710                 if (trie->maxlen != trie->minlen)
5711                     data->longest = &(data->longest_float);
5712             }
5713             if (trie->jump) /* no more substrings -- for now /grr*/
5714                flags &= ~SCF_DO_SUBSTR;
5715         }
5716 #endif /* old or new */
5717 #endif /* TRIE_STUDY_OPT */
5718
5719         /* Else: zero-length, ignore. */
5720         scan = regnext(scan);
5721     }
5722     /* If we are exiting a recursion we can unset its recursed bit
5723      * and allow ourselves to enter it again - no danger of an
5724      * infinite loop there.
5725     if (stopparen > -1 && recursed) {
5726         DEBUG_STUDYDATA("unset:", data,depth);
5727         PAREN_UNSET( recursed, stopparen);
5728     }
5729     */
5730     if (frame) {
5731         depth = depth - 1;
5732
5733         DEBUG_STUDYDATA("frame-end:",data,depth);
5734         DEBUG_PEEP("fend", scan, depth);
5735
5736         /* restore previous context */
5737         last = frame->last_regnode;
5738         scan = frame->next_regnode;
5739         stopparen = frame->stopparen;
5740         recursed_depth = frame->prev_recursed_depth;
5741
5742         RExC_frame_last = frame->prev_frame;
5743         frame = frame->this_prev_frame;
5744         goto fake_study_recurse;
5745     }
5746
5747   finish:
5748     assert(!frame);
5749     DEBUG_STUDYDATA("pre-fin:",data,depth);
5750
5751     *scanp = scan;
5752     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5753
5754     if (flags & SCF_DO_SUBSTR && is_inf)
5755         data->pos_delta = SSize_t_MAX - data->pos_min;
5756     if (is_par > (I32)U8_MAX)
5757         is_par = 0;
5758     if (is_par && pars==1 && data) {
5759         data->flags |= SF_IN_PAR;
5760         data->flags &= ~SF_HAS_PAR;
5761     }
5762     else if (pars && data) {
5763         data->flags |= SF_HAS_PAR;
5764         data->flags &= ~SF_IN_PAR;
5765     }
5766     if (flags & SCF_DO_STCLASS_OR)
5767         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5768     if (flags & SCF_TRIE_RESTUDY)
5769         data->flags |=  SCF_TRIE_RESTUDY;
5770
5771     DEBUG_STUDYDATA("post-fin:",data,depth);
5772
5773     {
5774         SSize_t final_minlen= min < stopmin ? min : stopmin;
5775
5776         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5777             if (final_minlen > SSize_t_MAX - delta)
5778                 RExC_maxlen = SSize_t_MAX;
5779             else if (RExC_maxlen < final_minlen + delta)
5780                 RExC_maxlen = final_minlen + delta;
5781         }
5782         return final_minlen;
5783     }
5784     NOT_REACHED; /* NOTREACHED */
5785 }
5786
5787 STATIC U32
5788 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5789 {
5790     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5791
5792     PERL_ARGS_ASSERT_ADD_DATA;
5793
5794     Renewc(RExC_rxi->data,
5795            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5796            char, struct reg_data);
5797     if(count)
5798         Renew(RExC_rxi->data->what, count + n, U8);
5799     else
5800         Newx(RExC_rxi->data->what, n, U8);
5801     RExC_rxi->data->count = count + n;
5802     Copy(s, RExC_rxi->data->what + count, n, U8);
5803     return count;
5804 }
5805
5806 /*XXX: todo make this not included in a non debugging perl, but appears to be
5807  * used anyway there, in 'use re' */
5808 #ifndef PERL_IN_XSUB_RE
5809 void
5810 Perl_reginitcolors(pTHX)
5811 {
5812     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5813     if (s) {
5814         char *t = savepv(s);
5815         int i = 0;
5816         PL_colors[0] = t;
5817         while (++i < 6) {
5818             t = strchr(t, '\t');
5819             if (t) {
5820                 *t = '\0';
5821                 PL_colors[i] = ++t;
5822             }
5823             else
5824                 PL_colors[i] = t = (char *)"";
5825         }
5826     } else {
5827         int i = 0;
5828         while (i < 6)
5829             PL_colors[i++] = (char *)"";
5830     }
5831     PL_colorset = 1;
5832 }
5833 #endif
5834
5835
5836 #ifdef TRIE_STUDY_OPT
5837 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5838     STMT_START {                                            \
5839         if (                                                \
5840               (data.flags & SCF_TRIE_RESTUDY)               \
5841               && ! restudied++                              \
5842         ) {                                                 \
5843             dOsomething;                                    \
5844             goto reStudy;                                   \
5845         }                                                   \
5846     } STMT_END
5847 #else
5848 #define CHECK_RESTUDY_GOTO_butfirst
5849 #endif
5850
5851 /*
5852  * pregcomp - compile a regular expression into internal code
5853  *
5854  * Decides which engine's compiler to call based on the hint currently in
5855  * scope
5856  */
5857
5858 #ifndef PERL_IN_XSUB_RE
5859
5860 /* return the currently in-scope regex engine (or the default if none)  */
5861
5862 regexp_engine const *
5863 Perl_current_re_engine(pTHX)
5864 {
5865     if (IN_PERL_COMPILETIME) {
5866         HV * const table = GvHV(PL_hintgv);
5867         SV **ptr;
5868
5869         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5870             return &PL_core_reg_engine;
5871         ptr = hv_fetchs(table, "regcomp", FALSE);
5872         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5873             return &PL_core_reg_engine;
5874         return INT2PTR(regexp_engine*,SvIV(*ptr));
5875     }
5876     else {
5877         SV *ptr;
5878         if (!PL_curcop->cop_hints_hash)
5879             return &PL_core_reg_engine;
5880         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5881         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5882             return &PL_core_reg_engine;
5883         return INT2PTR(regexp_engine*,SvIV(ptr));
5884     }
5885 }
5886
5887
5888 REGEXP *
5889 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5890 {
5891     regexp_engine const *eng = current_re_engine();
5892     GET_RE_DEBUG_FLAGS_DECL;
5893
5894     PERL_ARGS_ASSERT_PREGCOMP;
5895
5896     /* Dispatch a request to compile a regexp to correct regexp engine. */
5897     DEBUG_COMPILE_r({
5898         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5899                         PTR2UV(eng));
5900     });
5901     return CALLREGCOMP_ENG(eng, pattern, flags);
5902 }
5903 #endif
5904
5905 /* public(ish) entry point for the perl core's own regex compiling code.
5906  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5907  * pattern rather than a list of OPs, and uses the internal engine rather
5908  * than the current one */
5909
5910 REGEXP *
5911 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5912 {
5913     SV *pat = pattern; /* defeat constness! */
5914     PERL_ARGS_ASSERT_RE_COMPILE;
5915     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5916 #ifdef PERL_IN_XSUB_RE
5917                                 &my_reg_engine,
5918 #else
5919                                 &PL_core_reg_engine,
5920 #endif
5921                                 NULL, NULL, rx_flags, 0);
5922 }
5923
5924
5925 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5926  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5927  * point to the realloced string and length.
5928  *
5929  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5930  * stuff added */
5931
5932 static void
5933 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5934                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5935 {
5936     U8 *const src = (U8*)*pat_p;
5937     U8 *dst, *d;
5938     int n=0;
5939     STRLEN s = 0;
5940     bool do_end = 0;
5941     GET_RE_DEBUG_FLAGS_DECL;
5942
5943     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5944         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5945
5946     Newx(dst, *plen_p * 2 + 1, U8);
5947     d = dst;
5948
5949     while (s < *plen_p) {
5950         append_utf8_from_native_byte(src[s], &d);
5951         if (n < num_code_blocks) {
5952             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5953                 pRExC_state->code_blocks[n].start = d - dst - 1;
5954                 assert(*(d - 1) == '(');
5955                 do_end = 1;
5956             }
5957             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5958                 pRExC_state->code_blocks[n].end = d - dst - 1;
5959                 assert(*(d - 1) == ')');
5960                 do_end = 0;
5961                 n++;
5962             }
5963         }
5964         s++;
5965     }
5966     *d = '\0';
5967     *plen_p = d - dst;
5968     *pat_p = (char*) dst;
5969     SAVEFREEPV(*pat_p);
5970     RExC_orig_utf8 = RExC_utf8 = 1;
5971 }
5972
5973
5974
5975 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5976  * while recording any code block indices, and handling overloading,
5977  * nested qr// objects etc.  If pat is null, it will allocate a new
5978  * string, or just return the first arg, if there's only one.
5979  *
5980  * Returns the malloced/updated pat.
5981  * patternp and pat_count is the array of SVs to be concatted;
5982  * oplist is the optional list of ops that generated the SVs;
5983  * recompile_p is a pointer to a boolean that will be set if
5984  *   the regex will need to be recompiled.
5985  * delim, if non-null is an SV that will be inserted between each element
5986  */
5987
5988 static SV*
5989 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5990                 SV *pat, SV ** const patternp, int pat_count,
5991                 OP *oplist, bool *recompile_p, SV *delim)
5992 {
5993     SV **svp;
5994     int n = 0;
5995     bool use_delim = FALSE;
5996     bool alloced = FALSE;
5997
5998     /* if we know we have at least two args, create an empty string,
5999      * then concatenate args to that. For no args, return an empty string */
6000     if (!pat && pat_count != 1) {
6001         pat = newSVpvs("");
6002         SAVEFREESV(pat);
6003         alloced = TRUE;
6004     }
6005
6006     for (svp = patternp; svp < patternp + pat_count; svp++) {
6007         SV *sv;
6008         SV *rx  = NULL;
6009         STRLEN orig_patlen = 0;
6010         bool code = 0;
6011         SV *msv = use_delim ? delim : *svp;
6012         if (!msv) msv = &PL_sv_undef;
6013
6014         /* if we've got a delimiter, we go round the loop twice for each
6015          * svp slot (except the last), using the delimiter the second
6016          * time round */
6017         if (use_delim) {
6018             svp--;
6019             use_delim = FALSE;
6020         }
6021         else if (delim)
6022             use_delim = TRUE;
6023
6024         if (SvTYPE(msv) == SVt_PVAV) {
6025             /* we've encountered an interpolated array within
6026              * the pattern, e.g. /...@a..../. Expand the list of elements,
6027              * then recursively append elements.
6028              * The code in this block is based on S_pushav() */
6029
6030             AV *const av = (AV*)msv;
6031             const SSize_t maxarg = AvFILL(av) + 1;
6032             SV **array;
6033
6034             if (oplist) {
6035                 assert(oplist->op_type == OP_PADAV
6036                     || oplist->op_type == OP_RV2AV);
6037                 oplist = OpSIBLING(oplist);
6038             }
6039
6040             if (SvRMAGICAL(av)) {
6041                 SSize_t i;
6042
6043                 Newx(array, maxarg, SV*);
6044                 SAVEFREEPV(array);
6045                 for (i=0; i < maxarg; i++) {
6046                     SV ** const svp = av_fetch(av, i, FALSE);
6047                     array[i] = svp ? *svp : &PL_sv_undef;
6048                 }
6049             }
6050             else
6051                 array = AvARRAY(av);
6052
6053             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6054                                 array, maxarg, NULL, recompile_p,
6055                                 /* $" */
6056                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6057
6058             continue;
6059         }
6060
6061
6062         /* we make the assumption here that each op in the list of
6063          * op_siblings maps to one SV pushed onto the stack,
6064          * except for code blocks, with have both an OP_NULL and
6065          * and OP_CONST.
6066          * This allows us to match up the list of SVs against the
6067          * list of OPs to find the next code block.
6068          *
6069          * Note that       PUSHMARK PADSV PADSV ..
6070          * is optimised to
6071          *                 PADRANGE PADSV  PADSV  ..
6072          * so the alignment still works. */
6073
6074         if (oplist) {
6075             if (oplist->op_type == OP_NULL
6076                 && (oplist->op_flags & OPf_SPECIAL))
6077             {
6078                 assert(n < pRExC_state->num_code_blocks);
6079                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6080                 pRExC_state->code_blocks[n].block = oplist;
6081                 pRExC_state->code_blocks[n].src_regex = NULL;
6082                 n++;
6083                 code = 1;
6084                 oplist = OpSIBLING(oplist); /* skip CONST */
6085                 assert(oplist);
6086             }
6087             oplist = OpSIBLING(oplist);;
6088         }
6089
6090         /* apply magic and QR overloading to arg */
6091
6092         SvGETMAGIC(msv);
6093         if (SvROK(msv) && SvAMAGIC(msv)) {
6094             SV *sv = AMG_CALLunary(msv, regexp_amg);
6095             if (sv) {
6096                 if (SvROK(sv))
6097                     sv = SvRV(sv);
6098                 if (SvTYPE(sv) != SVt_REGEXP)
6099                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6100                 msv = sv;
6101             }
6102         }
6103
6104         /* try concatenation overload ... */
6105         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6106                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6107         {
6108             sv_setsv(pat, sv);
6109             /* overloading involved: all bets are off over literal
6110              * code. Pretend we haven't seen it */
6111             pRExC_state->num_code_blocks -= n;
6112             n = 0;
6113         }
6114         else  {
6115             /* ... or failing that, try "" overload */
6116             while (SvAMAGIC(msv)
6117                     && (sv = AMG_CALLunary(msv, string_amg))
6118                     && sv != msv
6119                     &&  !(   SvROK(msv)
6120                           && SvROK(sv)
6121                           && SvRV(msv) == SvRV(sv))
6122             ) {
6123                 msv = sv;
6124                 SvGETMAGIC(msv);
6125             }
6126             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6127                 msv = SvRV(msv);
6128
6129             if (pat) {
6130                 /* this is a partially unrolled
6131                  *     sv_catsv_nomg(pat, msv);
6132                  * that allows us to adjust code block indices if
6133                  * needed */
6134                 STRLEN dlen;
6135                 char *dst = SvPV_force_nomg(pat, dlen);
6136                 orig_patlen = dlen;
6137                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6138                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6139                     sv_setpvn(pat, dst, dlen);
6140                     SvUTF8_on(pat);
6141                 }
6142                 sv_catsv_nomg(pat, msv);
6143                 rx = msv;
6144             }
6145             else
6146                 pat = msv;
6147
6148             if (code)
6149                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6150         }
6151
6152         /* extract any code blocks within any embedded qr//'s */
6153         if (rx && SvTYPE(rx) == SVt_REGEXP
6154             && RX_ENGINE((REGEXP*)rx)->op_comp)
6155         {
6156
6157             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6158             if (ri->num_code_blocks) {
6159                 int i;
6160                 /* the presence of an embedded qr// with code means
6161                  * we should always recompile: the text of the
6162                  * qr// may not have changed, but it may be a
6163                  * different closure than last time */
6164                 *recompile_p = 1;
6165                 Renew(pRExC_state->code_blocks,
6166                     pRExC_state->num_code_blocks + ri->num_code_blocks,
6167                     struct reg_code_block);
6168                 pRExC_state->num_code_blocks += ri->num_code_blocks;
6169
6170                 for (i=0; i < ri->num_code_blocks; i++) {
6171                     struct reg_code_block *src, *dst;
6172                     STRLEN offset =  orig_patlen
6173                         + ReANY((REGEXP *)rx)->pre_prefix;
6174                     assert(n < pRExC_state->num_code_blocks);
6175                     src = &ri->code_blocks[i];
6176                     dst = &pRExC_state->code_blocks[n];
6177                     dst->start      = src->start + offset;
6178                     dst->end        = src->end   + offset;
6179                     dst->block      = src->block;
6180                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6181                                             src->src_regex
6182                                                 ? src->src_regex
6183                                                 : (REGEXP*)rx);
6184                     n++;
6185                 }
6186             }
6187         }
6188     }
6189     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6190     if (alloced)
6191         SvSETMAGIC(pat);
6192
6193     return pat;
6194 }
6195
6196
6197
6198 /* see if there are any run-time code blocks in the pattern.
6199  * False positives are allowed */
6200
6201 static bool
6202 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6203                     char *pat, STRLEN plen)
6204 {
6205     int n = 0;
6206     STRLEN s;
6207
6208     PERL_UNUSED_CONTEXT;
6209
6210     for (s = 0; s < plen; s++) {
6211         if (n < pRExC_state->num_code_blocks
6212             && s == pRExC_state->code_blocks[n].start)
6213         {
6214             s = pRExC_state->code_blocks[n].end;
6215             n++;
6216             continue;
6217         }
6218         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6219          * positives here */
6220         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6221             (pat[s+2] == '{'
6222                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6223         )
6224             return 1;
6225     }
6226     return 0;
6227 }
6228
6229 /* Handle run-time code blocks. We will already have compiled any direct
6230  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6231  * copy of it, but with any literal code blocks blanked out and
6232  * appropriate chars escaped; then feed it into
6233  *
6234  *    eval "qr'modified_pattern'"
6235  *
6236  * For example,
6237  *
6238  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6239  *
6240  * becomes
6241  *
6242  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6243  *
6244  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6245  * and merge them with any code blocks of the original regexp.
6246  *
6247  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6248  * instead, just save the qr and return FALSE; this tells our caller that
6249  * the original pattern needs upgrading to utf8.
6250  */
6251
6252 static bool
6253 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6254     char *pat, STRLEN plen)
6255 {
6256     SV *qr;
6257
6258     GET_RE_DEBUG_FLAGS_DECL;
6259
6260     if (pRExC_state->runtime_code_qr) {
6261         /* this is the second time we've been called; this should
6262          * only happen if the main pattern got upgraded to utf8
6263          * during compilation; re-use the qr we compiled first time
6264          * round (which should be utf8 too)
6265          */
6266         qr = pRExC_state->runtime_code_qr;
6267         pRExC_state->runtime_code_qr = NULL;
6268         assert(RExC_utf8 && SvUTF8(qr));
6269     }
6270     else {
6271         int n = 0;
6272         STRLEN s;
6273         char *p, *newpat;
6274         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6275         SV *sv, *qr_ref;
6276         dSP;
6277
6278         /* determine how many extra chars we need for ' and \ escaping */
6279         for (s = 0; s < plen; s++) {
6280             if (pat[s] == '\'' || pat[s] == '\\')
6281                 newlen++;
6282         }
6283
6284         Newx(newpat, newlen, char);
6285         p = newpat;
6286         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6287
6288         for (s = 0; s < plen; s++) {
6289             if (n < pRExC_state->num_code_blocks
6290                 && s == pRExC_state->code_blocks[n].start)
6291             {
6292                 /* blank out literal code block */
6293                 assert(pat[s] == '(');
6294                 while (s <= pRExC_state->code_blocks[n].end) {
6295                     *p++ = '_';
6296                     s++;
6297                 }
6298                 s--;
6299                 n++;
6300                 continue;
6301             }
6302             if (pat[s] == '\'' || pat[s] == '\\')
6303                 *p++ = '\\';
6304             *p++ = pat[s];
6305         }
6306         *p++ = '\'';
6307         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6308             *p++ = 'x';
6309         *p++ = '\0';
6310         DEBUG_COMPILE_r({
6311             PerlIO_printf(Perl_debug_log,
6312                 "%sre-parsing pattern for runtime code:%s %s\n",
6313                 PL_colors[4],PL_colors[5],newpat);
6314         });
6315
6316         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6317         Safefree(newpat);
6318
6319         ENTER;
6320         SAVETMPS;
6321         save_re_context();
6322         PUSHSTACKi(PERLSI_REQUIRE);
6323         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6324          * parsing qr''; normally only q'' does this. It also alters
6325          * hints handling */
6326         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6327         SvREFCNT_dec_NN(sv);
6328         SPAGAIN;
6329         qr_ref = POPs;
6330         PUTBACK;
6331         {
6332             SV * const errsv = ERRSV;
6333             if (SvTRUE_NN(errsv))
6334             {
6335                 Safefree(pRExC_state->code_blocks);
6336                 /* use croak_sv ? */
6337                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6338             }
6339         }
6340         assert(SvROK(qr_ref));
6341         qr = SvRV(qr_ref);
6342         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6343         /* the leaving below frees the tmp qr_ref.
6344          * Give qr a life of its own */
6345         SvREFCNT_inc(qr);
6346         POPSTACK;
6347         FREETMPS;
6348         LEAVE;
6349
6350     }
6351
6352     if (!RExC_utf8 && SvUTF8(qr)) {
6353         /* first time through; the pattern got upgraded; save the
6354          * qr for the next time through */
6355         assert(!pRExC_state->runtime_code_qr);
6356         pRExC_state->runtime_code_qr = qr;
6357         return 0;
6358     }
6359
6360
6361     /* extract any code blocks within the returned qr//  */
6362
6363
6364     /* merge the main (r1) and run-time (r2) code blocks into one */
6365     {
6366         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6367         struct reg_code_block *new_block, *dst;
6368         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6369         int i1 = 0, i2 = 0;
6370
6371         if (!r2->num_code_blocks) /* we guessed wrong */
6372         {
6373             SvREFCNT_dec_NN(qr);
6374             return 1;
6375         }
6376
6377         Newx(new_block,
6378             r1->num_code_blocks + r2->num_code_blocks,
6379             struct reg_code_block);
6380         dst = new_block;
6381
6382         while (    i1 < r1->num_code_blocks
6383                 || i2 < r2->num_code_blocks)
6384         {
6385             struct reg_code_block *src;
6386             bool is_qr = 0;
6387
6388             if (i1 == r1->num_code_blocks) {
6389                 src = &r2->code_blocks[i2++];
6390                 is_qr = 1;
6391             }
6392             else if (i2 == r2->num_code_blocks)
6393                 src = &r1->code_blocks[i1++];
6394             else if (  r1->code_blocks[i1].start
6395                      < r2->code_blocks[i2].start)
6396             {
6397                 src = &r1->code_blocks[i1++];
6398                 assert(src->end < r2->code_blocks[i2].start);
6399             }
6400             else {
6401                 assert(  r1->code_blocks[i1].start
6402                        > r2->code_blocks[i2].start);
6403                 src = &r2->code_blocks[i2++];
6404                 is_qr = 1;
6405                 assert(src->end < r1->code_blocks[i1].start);
6406             }
6407
6408             assert(pat[src->start] == '(');
6409             assert(pat[src->end]   == ')');
6410             dst->start      = src->start;
6411             dst->end        = src->end;
6412             dst->block      = src->block;
6413             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6414                                     : src->src_regex;
6415             dst++;
6416         }
6417         r1->num_code_blocks += r2->num_code_blocks;
6418         Safefree(r1->code_blocks);
6419         r1->code_blocks = new_block;
6420     }
6421
6422     SvREFCNT_dec_NN(qr);
6423     return 1;
6424 }
6425
6426
6427 STATIC bool
6428 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6429                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6430                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6431                       STRLEN longest_length, bool eol, bool meol)
6432 {
6433     /* This is the common code for setting up the floating and fixed length
6434      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6435      * as to whether succeeded or not */
6436
6437     I32 t;
6438     SSize_t ml;
6439
6440     if (! (longest_length
6441            || (eol /* Can't have SEOL and MULTI */
6442                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6443           )
6444             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6445         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6446     {
6447         return FALSE;
6448     }
6449
6450     /* copy the information about the longest from the reg_scan_data
6451         over to the program. */
6452     if (SvUTF8(sv_longest)) {
6453         *rx_utf8 = sv_longest;
6454         *rx_substr = NULL;
6455     } else {
6456         *rx_substr = sv_longest;
6457         *rx_utf8 = NULL;
6458     }
6459     /* end_shift is how many chars that must be matched that
6460         follow this item. We calculate it ahead of time as once the
6461         lookbehind offset is added in we lose the ability to correctly
6462         calculate it.*/
6463     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6464     *rx_end_shift = ml - offset
6465         - longest_length + (SvTAIL(sv_longest) != 0)
6466         + lookbehind;
6467
6468     t = (eol/* Can't have SEOL and MULTI */
6469          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6470     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6471
6472     return TRUE;
6473 }
6474
6475 /*
6476  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6477  * regular expression into internal code.
6478  * The pattern may be passed either as:
6479  *    a list of SVs (patternp plus pat_count)
6480  *    a list of OPs (expr)
6481  * If both are passed, the SV list is used, but the OP list indicates
6482  * which SVs are actually pre-compiled code blocks
6483  *
6484  * The SVs in the list have magic and qr overloading applied to them (and
6485  * the list may be modified in-place with replacement SVs in the latter
6486  * case).
6487  *
6488  * If the pattern hasn't changed from old_re, then old_re will be
6489  * returned.
6490  *
6491  * eng is the current engine. If that engine has an op_comp method, then
6492  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6493  * do the initial concatenation of arguments and pass on to the external
6494  * engine.
6495  *
6496  * If is_bare_re is not null, set it to a boolean indicating whether the
6497  * arg list reduced (after overloading) to a single bare regex which has
6498  * been returned (i.e. /$qr/).
6499  *
6500  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6501  *
6502  * pm_flags contains the PMf_* flags, typically based on those from the
6503  * pm_flags field of the related PMOP. Currently we're only interested in
6504  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6505  *
6506  * We can't allocate space until we know how big the compiled form will be,
6507  * but we can't compile it (and thus know how big it is) until we've got a
6508  * place to put the code.  So we cheat:  we compile it twice, once with code
6509  * generation turned off and size counting turned on, and once "for real".
6510  * This also means that we don't allocate space until we are sure that the
6511  * thing really will compile successfully, and we never have to move the
6512  * code and thus invalidate pointers into it.  (Note that it has to be in
6513  * one piece because free() must be able to free it all.) [NB: not true in perl]
6514  *
6515  * Beware that the optimization-preparation code in here knows about some
6516  * of the structure of the compiled regexp.  [I'll say.]
6517  */
6518
6519 REGEXP *
6520 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6521                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6522                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6523 {
6524     REGEXP *rx;
6525     struct regexp *r;
6526     regexp_internal *ri;
6527     STRLEN plen;
6528     char *exp;
6529     regnode *scan;
6530     I32 flags;
6531     SSize_t minlen = 0;
6532     U32 rx_flags;
6533     SV *pat;
6534     SV *code_blocksv = NULL;
6535     SV** new_patternp = patternp;
6536
6537     /* these are all flags - maybe they should be turned
6538      * into a single int with different bit masks */
6539     I32 sawlookahead = 0;
6540     I32 sawplus = 0;
6541     I32 sawopen = 0;
6542     I32 sawminmod = 0;
6543
6544     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6545     bool recompile = 0;
6546     bool runtime_code = 0;
6547     scan_data_t data;
6548     RExC_state_t RExC_state;
6549     RExC_state_t * const pRExC_state = &RExC_state;
6550 #ifdef TRIE_STUDY_OPT
6551     int restudied = 0;
6552     RExC_state_t copyRExC_state;
6553 #endif
6554     GET_RE_DEBUG_FLAGS_DECL;
6555
6556     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6557
6558     DEBUG_r(if (!PL_colorset) reginitcolors());
6559
6560     /* Initialize these here instead of as-needed, as is quick and avoids
6561      * having to test them each time otherwise */
6562     if (! PL_AboveLatin1) {
6563         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6564         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6565         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6566         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6567         PL_HasMultiCharFold =
6568                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6569
6570         /* This is calculated here, because the Perl program that generates the
6571          * static global ones doesn't currently have access to
6572          * NUM_ANYOF_CODE_POINTS */
6573         PL_InBitmap = _new_invlist(2);
6574         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6575                                                     NUM_ANYOF_CODE_POINTS - 1);
6576     }
6577
6578     pRExC_state->code_blocks = NULL;
6579     pRExC_state->num_code_blocks = 0;
6580
6581     if (is_bare_re)
6582         *is_bare_re = FALSE;
6583
6584     if (expr && (expr->op_type == OP_LIST ||
6585                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6586         /* allocate code_blocks if needed */
6587         OP *o;
6588         int ncode = 0;
6589
6590         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6591             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6592                 ncode++; /* count of DO blocks */
6593         if (ncode) {
6594             pRExC_state->num_code_blocks = ncode;
6595             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6596         }
6597     }
6598
6599     if (!pat_count) {
6600         /* compile-time pattern with just OP_CONSTs and DO blocks */
6601
6602         int n;
6603         OP *o;
6604
6605         /* find how many CONSTs there are */
6606         assert(expr);
6607         n = 0;
6608         if (expr->op_type == OP_CONST)
6609             n = 1;
6610         else
6611             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6612                 if (o->op_type == OP_CONST)
6613                     n++;
6614             }
6615
6616         /* fake up an SV array */
6617
6618         assert(!new_patternp);
6619         Newx(new_patternp, n, SV*);
6620         SAVEFREEPV(new_patternp);
6621         pat_count = n;
6622
6623         n = 0;
6624         if (expr->op_type == OP_CONST)
6625             new_patternp[n] = cSVOPx_sv(expr);
6626         else
6627             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6628                 if (o->op_type == OP_CONST)
6629                     new_patternp[n++] = cSVOPo_sv;
6630             }
6631
6632     }
6633
6634     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6635         "Assembling pattern from %d elements%s\n", pat_count,
6636             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6637
6638     /* set expr to the first arg op */
6639
6640     if (pRExC_state->num_code_blocks
6641          && expr->op_type != OP_CONST)
6642     {
6643             expr = cLISTOPx(expr)->op_first;
6644             assert(   expr->op_type == OP_PUSHMARK
6645                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6646                    || expr->op_type == OP_PADRANGE);
6647             expr = OpSIBLING(expr);
6648     }
6649
6650     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6651                         expr, &recompile, NULL);
6652
6653     /* handle bare (possibly after overloading) regex: foo =~ $re */
6654     {
6655         SV *re = pat;
6656         if (SvROK(re))
6657             re = SvRV(re);
6658         if (SvTYPE(re) == SVt_REGEXP) {
6659             if (is_bare_re)
6660                 *is_bare_re = TRUE;
6661             SvREFCNT_inc(re);
6662             Safefree(pRExC_state->code_blocks);
6663             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6664                 "Precompiled pattern%s\n",
6665                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6666
6667             return (REGEXP*)re;
6668         }
6669     }
6670
6671     exp = SvPV_nomg(pat, plen);
6672
6673     if (!eng->op_comp) {
6674         if ((SvUTF8(pat) && IN_BYTES)
6675                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6676         {
6677             /* make a temporary copy; either to convert to bytes,
6678              * or to avoid repeating get-magic / overloaded stringify */
6679             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6680                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6681         }
6682         Safefree(pRExC_state->code_blocks);
6683         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6684     }
6685
6686     /* ignore the utf8ness if the pattern is 0 length */
6687     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6688
6689     RExC_uni_semantics = 0;
6690     RExC_seen_unfolded_sharp_s = 0;
6691     RExC_contains_locale = 0;
6692     RExC_contains_i = 0;
6693     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6694     pRExC_state->runtime_code_qr = NULL;
6695     RExC_frame_head= NULL;
6696     RExC_frame_last= NULL;
6697     RExC_frame_count= 0;
6698
6699     DEBUG_r({
6700         RExC_mysv1= sv_newmortal();
6701         RExC_mysv2= sv_newmortal();
6702     });
6703     DEBUG_COMPILE_r({
6704             SV *dsv= sv_newmortal();
6705             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6706             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6707                           PL_colors[4],PL_colors[5],s);
6708         });
6709
6710   redo_first_pass:
6711     /* we jump here if we have to recompile, e.g., from upgrading the pattern
6712      * to utf8 */
6713
6714     if ((pm_flags & PMf_USE_RE_EVAL)
6715                 /* this second condition covers the non-regex literal case,
6716                  * i.e.  $foo =~ '(?{})'. */
6717                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6718     )
6719         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6720
6721     /* return old regex if pattern hasn't changed */
6722     /* XXX: note in the below we have to check the flags as well as the
6723      * pattern.
6724      *
6725      * Things get a touch tricky as we have to compare the utf8 flag
6726      * independently from the compile flags.  */
6727
6728     if (   old_re
6729         && !recompile
6730         && !!RX_UTF8(old_re) == !!RExC_utf8
6731         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6732         && RX_PRECOMP(old_re)
6733         && RX_PRELEN(old_re) == plen
6734         && memEQ(RX_PRECOMP(old_re), exp, plen)
6735         && !runtime_code /* with runtime code, always recompile */ )
6736     {
6737         Safefree(pRExC_state->code_blocks);
6738         return old_re;
6739     }
6740
6741     rx_flags = orig_rx_flags;
6742
6743     if (rx_flags & PMf_FOLD) {
6744         RExC_contains_i = 1;
6745     }
6746     if (   initial_charset == REGEX_DEPENDS_CHARSET
6747         && (RExC_utf8 ||RExC_uni_semantics))
6748     {
6749
6750         /* Set to use unicode semantics if the pattern is in utf8 and has the
6751          * 'depends' charset specified, as it means unicode when utf8  */
6752         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6753     }
6754
6755     RExC_precomp = exp;
6756     RExC_precomp_adj = 0;
6757     RExC_flags = rx_flags;
6758     RExC_pm_flags = pm_flags;
6759
6760     if (runtime_code) {
6761         assert(TAINTING_get || !TAINT_get);
6762         if (TAINT_get)
6763             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6764
6765         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6766             /* whoops, we have a non-utf8 pattern, whilst run-time code
6767              * got compiled as utf8. Try again with a utf8 pattern */
6768             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6769                                     pRExC_state->num_code_blocks);
6770             goto redo_first_pass;
6771         }
6772     }
6773     assert(!pRExC_state->runtime_code_qr);
6774
6775     RExC_sawback = 0;
6776
6777     RExC_seen = 0;
6778     RExC_maxlen = 0;
6779     RExC_in_lookbehind = 0;
6780     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6781     RExC_extralen = 0;
6782     RExC_override_recoding = 0;
6783 #ifdef EBCDIC
6784     RExC_recode_x_to_native = 0;
6785 #endif
6786     RExC_in_multi_char_class = 0;
6787
6788     /* First pass: determine size, legality. */
6789     RExC_parse = exp;
6790     RExC_start = RExC_adjusted_start = exp;
6791     RExC_end = exp + plen;
6792     RExC_precomp_end = RExC_end;
6793     RExC_naughty = 0;
6794     RExC_npar = 1;
6795     RExC_nestroot = 0;
6796     RExC_size = 0L;
6797     RExC_emit = (regnode *) &RExC_emit_dummy;
6798     RExC_whilem_seen = 0;
6799     RExC_open_parens = NULL;
6800     RExC_close_parens = NULL;
6801     RExC_opend = NULL;
6802     RExC_paren_names = NULL;
6803 #ifdef DEBUGGING
6804     RExC_paren_name_list = NULL;
6805 #endif
6806     RExC_recurse = NULL;
6807     RExC_study_chunk_recursed = NULL;
6808     RExC_study_chunk_recursed_bytes= 0;
6809     RExC_recurse_count = 0;
6810     pRExC_state->code_index = 0;
6811
6812     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
6813      * code makes sure the final byte is an uncounted NUL.  But should this
6814      * ever not be the case, lots of things could read beyond the end of the
6815      * buffer: loops like
6816      *      while(isFOO(*RExC_parse)) RExC_parse++;
6817      *      strchr(RExC_parse, "foo");
6818      * etc.  So it is worth noting. */
6819     assert(*RExC_end == '\0');
6820
6821     DEBUG_PARSE_r(
6822         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6823         RExC_lastnum=0;
6824         RExC_lastparse=NULL;
6825     );
6826     /* reg may croak on us, not giving us a chance to free
6827        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6828        need it to survive as long as the regexp (qr/(?{})/).
6829        We must check that code_blocksv is not already set, because we may
6830        have jumped back to restart the sizing pass. */
6831     if (pRExC_state->code_blocks && !code_blocksv) {
6832         code_blocksv = newSV_type(SVt_PV);
6833         SAVEFREESV(code_blocksv);
6834         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6835         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6836     }
6837     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6838         /* It's possible to write a regexp in ascii that represents Unicode
6839         codepoints outside of the byte range, such as via \x{100}. If we
6840         detect such a sequence we have to convert the entire pattern to utf8
6841         and then recompile, as our sizing calculation will have been based
6842         on 1 byte == 1 character, but we will need to use utf8 to encode
6843         at least some part of the pattern, and therefore must convert the whole
6844         thing.
6845         -- dmq */
6846         if (flags & RESTART_PASS1) {
6847             if (flags & NEED_UTF8) {
6848                 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6849                                     pRExC_state->num_code_blocks);
6850             }
6851             else {
6852                 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6853                 "Need to redo pass 1\n"));
6854             }
6855
6856             goto redo_first_pass;
6857         }
6858         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6859     }
6860     if (code_blocksv)
6861         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6862
6863     DEBUG_PARSE_r({
6864         PerlIO_printf(Perl_debug_log,
6865             "Required size %"IVdf" nodes\n"
6866             "Starting second pass (creation)\n",
6867             (IV)RExC_size);
6868         RExC_lastnum=0;
6869         RExC_lastparse=NULL;
6870     });
6871
6872     /* The first pass could have found things that force Unicode semantics */
6873     if ((RExC_utf8 || RExC_uni_semantics)
6874          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6875     {
6876         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6877     }
6878
6879     /* Small enough for pointer-storage convention?
6880        If extralen==0, this means that we will not need long jumps. */
6881     if (RExC_size >= 0x10000L && RExC_extralen)
6882         RExC_size += RExC_extralen;
6883     else
6884         RExC_extralen = 0;
6885     if (RExC_whilem_seen > 15)
6886         RExC_whilem_seen = 15;
6887
6888     /* Allocate space and zero-initialize. Note, the two step process
6889        of zeroing when in debug mode, thus anything assigned has to
6890        happen after that */
6891     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6892     r = ReANY(rx);
6893     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6894          char, regexp_internal);
6895     if ( r == NULL || ri == NULL )
6896         FAIL("Regexp out of space");
6897 #ifdef DEBUGGING
6898     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6899     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6900          char);
6901 #else
6902     /* bulk initialize base fields with 0. */
6903     Zero(ri, sizeof(regexp_internal), char);
6904 #endif
6905
6906     /* non-zero initialization begins here */
6907     RXi_SET( r, ri );
6908     r->engine= eng;
6909     r->extflags = rx_flags;
6910     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6911
6912     if (pm_flags & PMf_IS_QR) {
6913         ri->code_blocks = pRExC_state->code_blocks;
6914         ri->num_code_blocks = pRExC_state->num_code_blocks;
6915     }
6916     else
6917     {
6918         int n;
6919         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6920             if (pRExC_state->code_blocks[n].src_regex)
6921                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6922         if(pRExC_state->code_blocks)
6923             SAVEFREEPV(pRExC_state->code_blocks); /* often null */
6924     }
6925
6926     {
6927         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6928         bool has_charset = (get_regex_charset(r->extflags)
6929                                                     != REGEX_DEPENDS_CHARSET);
6930
6931         /* The caret is output if there are any defaults: if not all the STD
6932          * flags are set, or if no character set specifier is needed */
6933         bool has_default =
6934                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6935                     || ! has_charset);
6936         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6937                                                    == REG_RUN_ON_COMMENT_SEEN);
6938         U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
6939                             >> RXf_PMf_STD_PMMOD_SHIFT);
6940         const char *fptr = STD_PAT_MODS;        /*"msixn"*/
6941         char *p;
6942
6943         /* We output all the necessary flags; we never output a minus, as all
6944          * those are defaults, so are
6945          * covered by the caret */
6946         const STRLEN wraplen = plen + has_p + has_runon
6947             + has_default       /* If needs a caret */
6948             + PL_bitcount[reganch] /* 1 char for each set standard flag */
6949
6950                 /* If needs a character set specifier */
6951             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6952             + (sizeof("(?:)") - 1);
6953
6954         /* make sure PL_bitcount bounds not exceeded */
6955         assert(sizeof(STD_PAT_MODS) <= 8);
6956
6957         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6958         r->xpv_len_u.xpvlenu_pv = p;
6959         if (RExC_utf8)
6960             SvFLAGS(rx) |= SVf_UTF8;
6961         *p++='('; *p++='?';
6962
6963         /* If a default, cover it using the caret */
6964         if (has_default) {
6965             *p++= DEFAULT_PAT_MOD;
6966         }
6967         if (has_charset) {
6968             STRLEN len;
6969             const char* const name = get_regex_charset_name(r->extflags, &len);
6970             Copy(name, p, len, char);
6971             p += len;
6972         }
6973         if (has_p)
6974             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6975         {
6976             char ch;
6977             while((ch = *fptr++)) {
6978                 if(reganch & 1)
6979                     *p++ = ch;
6980                 reganch >>= 1;
6981             }
6982         }
6983
6984         *p++ = ':';
6985         Copy(RExC_precomp, p, plen, char);
6986         assert ((RX_WRAPPED(rx) - p) < 16);
6987         r->pre_prefix = p - RX_WRAPPED(rx);
6988         p += plen;
6989         if (has_runon)
6990             *p++ = '\n';
6991         *p++ = ')';
6992         *p = 0;
6993         SvCUR_set(rx, p - RX_WRAPPED(rx));
6994     }
6995
6996     r->intflags = 0;
6997     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6998
6999     /* setup various meta data about recursion, this all requires
7000      * RExC_npar to be correctly set, and a bit later on we clear it */
7001     if (RExC_seen & REG_RECURSE_SEEN) {
7002         Newxz(RExC_open_parens, RExC_npar,regnode *);
7003         SAVEFREEPV(RExC_open_parens);
7004         Newxz(RExC_close_parens,RExC_npar,regnode *);
7005         SAVEFREEPV(RExC_close_parens);
7006     }
7007     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
7008         /* Note, RExC_npar is 1 + the number of parens in a pattern.
7009          * So its 1 if there are no parens. */
7010         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
7011                                          ((RExC_npar & 0x07) != 0);
7012         Newx(RExC_study_chunk_recursed,
7013              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7014         SAVEFREEPV(RExC_study_chunk_recursed);
7015     }
7016
7017     /* Useful during FAIL. */
7018 #ifdef RE_TRACK_PATTERN_OFFSETS
7019     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
7020     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
7021                           "%s %"UVuf" bytes for offset annotations.\n",
7022                           ri->u.offsets ? "Got" : "Couldn't get",
7023                           (UV)((2*RExC_size+1) * sizeof(U32))));
7024 #endif
7025     SetProgLen(ri,RExC_size);
7026     RExC_rx_sv = rx;
7027     RExC_rx = r;
7028     RExC_rxi = ri;
7029
7030     /* Second pass: emit code. */
7031     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7032     RExC_pm_flags = pm_flags;
7033     RExC_parse = exp;
7034     RExC_end = exp + plen;
7035     RExC_naughty = 0;
7036     RExC_npar = 1;
7037     RExC_emit_start = ri->program;
7038     RExC_emit = ri->program;
7039     RExC_emit_bound = ri->program + RExC_size + 1;
7040     pRExC_state->code_index = 0;
7041
7042     *((char*) RExC_emit++) = (char) REG_MAGIC;
7043     if (reg(pRExC_state, 0, &flags,1) == NULL) {
7044         ReREFCNT_dec(rx);
7045         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
7046     }
7047     /* XXXX To minimize changes to RE engine we always allocate
7048        3-units-long substrs field. */
7049     Newx(r->substrs, 1, struct reg_substr_data);
7050     if (RExC_recurse_count) {
7051         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
7052         SAVEFREEPV(RExC_recurse);
7053     }
7054
7055   reStudy:
7056     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7057     DEBUG_r(
7058         RExC_study_chunk_recursed_count= 0;
7059     );
7060     Zero(r->substrs, 1, struct reg_substr_data);
7061     if (RExC_study_chunk_recursed) {
7062         Zero(RExC_study_chunk_recursed,
7063              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7064     }
7065
7066
7067 #ifdef TRIE_STUDY_OPT
7068     if (!restudied) {
7069         StructCopy(&zero_scan_data, &data, scan_data_t);
7070         copyRExC_state = RExC_state;
7071     } else {
7072         U32 seen=RExC_seen;
7073         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
7074
7075         RExC_state = copyRExC_state;
7076         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7077             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7078         else
7079             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7080         StructCopy(&zero_scan_data, &data, scan_data_t);
7081     }
7082 #else
7083     StructCopy(&zero_scan_data, &data, scan_data_t);
7084 #endif
7085
7086     /* Dig out information for optimizations. */
7087     r->extflags = RExC_flags; /* was pm_op */
7088     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7089
7090     if (UTF)
7091         SvUTF8_on(rx);  /* Unicode in it? */
7092     ri->regstclass = NULL;
7093     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
7094         r->intflags |= PREGf_NAUGHTY;
7095     scan = ri->program + 1;             /* First BRANCH. */
7096
7097     /* testing for BRANCH here tells us whether there is "must appear"
7098        data in the pattern. If there is then we can use it for optimisations */
7099     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7100                                                   */
7101         SSize_t fake;
7102         STRLEN longest_float_length, longest_fixed_length;
7103         regnode_ssc ch_class; /* pointed to by data */
7104         int stclass_flag;
7105         SSize_t last_close = 0; /* pointed to by data */
7106         regnode *first= scan;
7107         regnode *first_next= regnext(first);
7108         /*
7109          * Skip introductions and multiplicators >= 1
7110          * so that we can extract the 'meat' of the pattern that must
7111          * match in the large if() sequence following.
7112          * NOTE that EXACT is NOT covered here, as it is normally
7113          * picked up by the optimiser separately.
7114          *
7115          * This is unfortunate as the optimiser isnt handling lookahead
7116          * properly currently.
7117          *
7118          */
7119         while ((OP(first) == OPEN && (sawopen = 1)) ||
7120                /* An OR of *one* alternative - should not happen now. */
7121             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7122             /* for now we can't handle lookbehind IFMATCH*/
7123             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7124             (OP(first) == PLUS) ||
7125             (OP(first) == MINMOD) ||
7126                /* An {n,m} with n>0 */
7127             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7128             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7129         {
7130                 /*
7131                  * the only op that could be a regnode is PLUS, all the rest
7132                  * will be regnode_1 or regnode_2.
7133                  *
7134                  * (yves doesn't think this is true)
7135                  */
7136                 if (OP(first) == PLUS)
7137                     sawplus = 1;
7138                 else {
7139                     if (OP(first) == MINMOD)
7140                         sawminmod = 1;
7141                     first += regarglen[OP(first)];
7142                 }
7143                 first = NEXTOPER(first);
7144                 first_next= regnext(first);
7145         }
7146
7147         /* Starting-point info. */
7148       again:
7149         DEBUG_PEEP("first:",first,0);
7150         /* Ignore EXACT as we deal with it later. */
7151         if (PL_regkind[OP(first)] == EXACT) {
7152             if (OP(first) == EXACT || OP(first) == EXACTL)
7153                 NOOP;   /* Empty, get anchored substr later. */
7154             else
7155                 ri->regstclass = first;
7156         }
7157 #ifdef TRIE_STCLASS
7158         else if (PL_regkind[OP(first)] == TRIE &&
7159                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7160         {
7161             /* this can happen only on restudy */
7162             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7163         }
7164 #endif
7165         else if (REGNODE_SIMPLE(OP(first)))
7166             ri->regstclass = first;
7167         else if (PL_regkind[OP(first)] == BOUND ||
7168                  PL_regkind[OP(first)] == NBOUND)
7169             ri->regstclass = first;
7170         else if (PL_regkind[OP(first)] == BOL) {
7171             r->intflags |= (OP(first) == MBOL
7172                            ? PREGf_ANCH_MBOL
7173                            : PREGf_ANCH_SBOL);
7174             first = NEXTOPER(first);
7175             goto again;
7176         }
7177         else if (OP(first) == GPOS) {
7178             r->intflags |= PREGf_ANCH_GPOS;
7179             first = NEXTOPER(first);
7180             goto again;
7181         }
7182         else if ((!sawopen || !RExC_sawback) &&
7183             !sawlookahead &&
7184             (OP(first) == STAR &&
7185             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7186             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7187         {
7188             /* turn .* into ^.* with an implied $*=1 */
7189             const int type =
7190                 (OP(NEXTOPER(first)) == REG_ANY)
7191                     ? PREGf_ANCH_MBOL
7192                     : PREGf_ANCH_SBOL;
7193             r->intflags |= (type | PREGf_IMPLICIT);
7194             first = NEXTOPER(first);
7195             goto again;
7196         }
7197         if (sawplus && !sawminmod && !sawlookahead
7198             && (!sawopen || !RExC_sawback)
7199             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7200             /* x+ must match at the 1st pos of run of x's */
7201             r->intflags |= PREGf_SKIP;
7202
7203         /* Scan is after the zeroth branch, first is atomic matcher. */
7204 #ifdef TRIE_STUDY_OPT
7205         DEBUG_PARSE_r(
7206             if (!restudied)
7207                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7208                               (IV)(first - scan + 1))
7209         );
7210 #else
7211         DEBUG_PARSE_r(
7212             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7213                 (IV)(first - scan + 1))
7214         );
7215 #endif
7216
7217
7218         /*
7219         * If there's something expensive in the r.e., find the
7220         * longest literal string that must appear and make it the
7221         * regmust.  Resolve ties in favor of later strings, since
7222         * the regstart check works with the beginning of the r.e.
7223         * and avoiding duplication strengthens checking.  Not a
7224         * strong reason, but sufficient in the absence of others.
7225         * [Now we resolve ties in favor of the earlier string if
7226         * it happens that c_offset_min has been invalidated, since the
7227         * earlier string may buy us something the later one won't.]
7228         */
7229
7230         data.longest_fixed = newSVpvs("");
7231         data.longest_float = newSVpvs("");
7232         data.last_found = newSVpvs("");
7233         data.longest = &(data.longest_fixed);
7234         ENTER_with_name("study_chunk");
7235         SAVEFREESV(data.longest_fixed);
7236         SAVEFREESV(data.longest_float);
7237         SAVEFREESV(data.last_found);
7238         first = scan;
7239         if (!ri->regstclass) {
7240             ssc_init(pRExC_state, &ch_class);
7241             data.start_class = &ch_class;
7242             stclass_flag = SCF_DO_STCLASS_AND;
7243         } else                          /* XXXX Check for BOUND? */
7244             stclass_flag = 0;
7245         data.last_closep = &last_close;
7246
7247         DEBUG_RExC_seen();
7248         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7249                              scan + RExC_size, /* Up to end */
7250             &data, -1, 0, NULL,
7251             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7252                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7253             0);
7254
7255
7256         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7257
7258
7259         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7260              && data.last_start_min == 0 && data.last_end > 0
7261              && !RExC_seen_zerolen
7262              && !(RExC_seen & REG_VERBARG_SEEN)
7263              && !(RExC_seen & REG_GPOS_SEEN)
7264         ){
7265             r->extflags |= RXf_CHECK_ALL;
7266         }
7267         scan_commit(pRExC_state, &data,&minlen,0);
7268
7269         longest_float_length = CHR_SVLEN(data.longest_float);
7270
7271         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7272                    && data.offset_fixed == data.offset_float_min
7273                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7274             && S_setup_longest (aTHX_ pRExC_state,
7275                                     data.longest_float,
7276                                     &(r->float_utf8),
7277                                     &(r->float_substr),
7278                                     &(r->float_end_shift),
7279                                     data.lookbehind_float,
7280                                     data.offset_float_min,
7281                                     data.minlen_float,
7282                                     longest_float_length,
7283                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
7284                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7285         {
7286             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7287             r->float_max_offset = data.offset_float_max;
7288             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7289                 r->float_max_offset -= data.lookbehind_float;
7290             SvREFCNT_inc_simple_void_NN(data.longest_float);
7291         }
7292         else {
7293             r->float_substr = r->float_utf8 = NULL;
7294             longest_float_length = 0;
7295         }
7296
7297         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7298
7299         if (S_setup_longest (aTHX_ pRExC_state,
7300                                 data.longest_fixed,
7301                                 &(r->anchored_utf8),
7302                                 &(r->anchored_substr),
7303                                 &(r->anchored_end_shift),
7304                                 data.lookbehind_fixed,
7305                                 data.offset_fixed,
7306                                 data.minlen_fixed,
7307                                 longest_fixed_length,
7308                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7309                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7310         {
7311             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7312             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7313         }
7314         else {
7315             r->anchored_substr = r->anchored_utf8 = NULL;
7316             longest_fixed_length = 0;
7317         }
7318         LEAVE_with_name("study_chunk");
7319
7320         if (ri->regstclass
7321             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7322             ri->regstclass = NULL;
7323
7324         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7325             && stclass_flag
7326             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7327             && is_ssc_worth_it(pRExC_state, data.start_class))
7328         {
7329             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7330
7331             ssc_finalize(pRExC_state, data.start_class);
7332
7333             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7334             StructCopy(data.start_class,
7335                        (regnode_ssc*)RExC_rxi->data->data[n],
7336                        regnode_ssc);
7337             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7338             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7339             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7340                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7341                       PerlIO_printf(Perl_debug_log,
7342                                     "synthetic stclass \"%s\".\n",
7343                                     SvPVX_const(sv));});
7344             data.start_class = NULL;
7345         }
7346
7347         /* A temporary algorithm prefers floated substr to fixed one to dig
7348          * more info. */
7349         if (longest_fixed_length > longest_float_length) {
7350             r->substrs->check_ix = 0;
7351             r->check_end_shift = r->anchored_end_shift;
7352             r->check_substr = r->anchored_substr;
7353             r->check_utf8 = r->anchored_utf8;
7354             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7355             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7356                 r->intflags |= PREGf_NOSCAN;
7357         }
7358         else {
7359             r->substrs->check_ix = 1;
7360             r->check_end_shift = r->float_end_shift;
7361             r->check_substr = r->float_substr;
7362             r->check_utf8 = r->float_utf8;
7363             r->check_offset_min = r->float_min_offset;
7364             r->check_offset_max = r->float_max_offset;
7365         }
7366         if ((r->check_substr || r->check_utf8) ) {
7367             r->extflags |= RXf_USE_INTUIT;
7368             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7369                 r->extflags |= RXf_INTUIT_TAIL;
7370         }
7371         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7372
7373         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7374         if ( (STRLEN)minlen < longest_float_length )
7375             minlen= longest_float_length;
7376         if ( (STRLEN)minlen < longest_fixed_length )
7377             minlen= longest_fixed_length;
7378         */
7379     }
7380     else {
7381         /* Several toplevels. Best we can is to set minlen. */
7382         SSize_t fake;
7383         regnode_ssc ch_class;
7384         SSize_t last_close = 0;
7385
7386         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7387
7388         scan = ri->program + 1;
7389         ssc_init(pRExC_state, &ch_class);
7390         data.start_class = &ch_class;
7391         data.last_closep = &last_close;
7392
7393         DEBUG_RExC_seen();
7394         minlen = study_chunk(pRExC_state,
7395             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7396             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7397                                                       ? SCF_TRIE_DOING_RESTUDY
7398                                                       : 0),
7399             0);
7400
7401         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7402
7403         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7404                 = r->float_substr = r->float_utf8 = NULL;
7405
7406         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7407             && is_ssc_worth_it(pRExC_state, data.start_class))
7408         {
7409             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7410
7411             ssc_finalize(pRExC_state, data.start_class);
7412
7413             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7414             StructCopy(data.start_class,
7415                        (regnode_ssc*)RExC_rxi->data->data[n],
7416                        regnode_ssc);
7417             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7418             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7419             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7420                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7421                       PerlIO_printf(Perl_debug_log,
7422                                     "synthetic stclass \"%s\".\n",
7423                                     SvPVX_const(sv));});
7424             data.start_class = NULL;
7425         }
7426     }
7427
7428     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7429         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7430         r->maxlen = REG_INFTY;
7431     }
7432     else {
7433         r->maxlen = RExC_maxlen;
7434     }
7435
7436     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7437        the "real" pattern. */
7438     DEBUG_OPTIMISE_r({
7439         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7440                       (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7441     });
7442     r->minlenret = minlen;
7443     if (r->minlen < minlen)
7444         r->minlen = minlen;
7445
7446     if (RExC_seen & REG_GPOS_SEEN)
7447         r->intflags |= PREGf_GPOS_SEEN;
7448     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7449         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7450                                                 lookbehind */
7451     if (pRExC_state->num_code_blocks)
7452         r->extflags |= RXf_EVAL_SEEN;
7453     if (RExC_seen & REG_VERBARG_SEEN)
7454     {
7455         r->intflags |= PREGf_VERBARG_SEEN;
7456         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7457     }
7458     if (RExC_seen & REG_CUTGROUP_SEEN)
7459         r->intflags |= PREGf_CUTGROUP_SEEN;
7460     if (pm_flags & PMf_USE_RE_EVAL)
7461         r->intflags |= PREGf_USE_RE_EVAL;
7462     if (RExC_paren_names)
7463         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7464     else
7465         RXp_PAREN_NAMES(r) = NULL;
7466
7467     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7468      * so it can be used in pp.c */
7469     if (r->intflags & PREGf_ANCH)
7470         r->extflags |= RXf_IS_ANCHORED;
7471
7472
7473     {
7474         /* this is used to identify "special" patterns that might result
7475          * in Perl NOT calling the regex engine and instead doing the match "itself",
7476          * particularly special cases in split//. By having the regex compiler
7477          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7478          * we avoid weird issues with equivalent patterns resulting in different behavior,
7479          * AND we allow non Perl engines to get the same optimizations by the setting the
7480          * flags appropriately - Yves */
7481         regnode *first = ri->program + 1;
7482         U8 fop = OP(first);
7483         regnode *next = regnext(first);
7484         U8 nop = OP(next);
7485
7486         if (PL_regkind[fop] == NOTHING && nop == END)
7487             r->extflags |= RXf_NULL;
7488         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7489             /* when fop is SBOL first->flags will be true only when it was
7490              * produced by parsing /\A/, and not when parsing /^/. This is
7491              * very important for the split code as there we want to
7492              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7493              * See rt #122761 for more details. -- Yves */
7494             r->extflags |= RXf_START_ONLY;
7495         else if (fop == PLUS
7496                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7497                  && nop == END)
7498             r->extflags |= RXf_WHITE;
7499         else if ( r->extflags & RXf_SPLIT
7500                   && (fop == EXACT || fop == EXACTL)
7501                   && STR_LEN(first) == 1
7502                   && *(STRING(first)) == ' '
7503                   && nop == END )
7504             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7505
7506     }
7507
7508     if (RExC_contains_locale) {
7509         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7510     }
7511
7512 #ifdef DEBUGGING
7513     if (RExC_paren_names) {
7514         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7515         ri->data->data[ri->name_list_idx]
7516                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7517     } else
7518 #endif
7519         ri->name_list_idx = 0;
7520
7521     if (RExC_recurse_count) {
7522         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7523             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7524             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7525         }
7526     }
7527     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7528     /* assume we don't need to swap parens around before we match */
7529     DEBUG_TEST_r({
7530         PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7531             (unsigned long)RExC_study_chunk_recursed_count);
7532     });
7533     DEBUG_DUMP_r({
7534         DEBUG_RExC_seen();
7535         PerlIO_printf(Perl_debug_log,"Final program:\n");
7536         regdump(r);
7537     });
7538 #ifdef RE_TRACK_PATTERN_OFFSETS
7539     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7540         const STRLEN len = ri->u.offsets[0];
7541         STRLEN i;
7542         GET_RE_DEBUG_FLAGS_DECL;
7543         PerlIO_printf(Perl_debug_log,
7544                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7545         for (i = 1; i <= len; i++) {
7546             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7547                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7548                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7549             }
7550         PerlIO_printf(Perl_debug_log, "\n");
7551     });
7552 #endif
7553
7554 #ifdef USE_ITHREADS
7555     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7556      * by setting the regexp SV to readonly-only instead. If the
7557      * pattern's been recompiled, the USEDness should remain. */
7558     if (old_re && SvREADONLY(old_re))
7559         SvREADONLY_on(rx);
7560 #endif
7561     return rx;
7562 }
7563
7564
7565 SV*
7566 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7567                     const U32 flags)
7568 {
7569     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7570
7571     PERL_UNUSED_ARG(value);
7572
7573     if (flags & RXapif_FETCH) {
7574         return reg_named_buff_fetch(rx, key, flags);
7575     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7576         Perl_croak_no_modify();
7577         return NULL;
7578     } else if (flags & RXapif_EXISTS) {
7579         return reg_named_buff_exists(rx, key, flags)
7580             ? &PL_sv_yes
7581             : &PL_sv_no;
7582     } else if (flags & RXapif_REGNAMES) {
7583         return reg_named_buff_all(rx, flags);
7584     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7585         return reg_named_buff_scalar(rx, flags);
7586     } else {
7587         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7588         return NULL;
7589     }
7590 }
7591
7592 SV*
7593 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7594                          const U32 flags)
7595 {
7596     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7597     PERL_UNUSED_ARG(lastkey);
7598
7599     if (flags & RXapif_FIRSTKEY)
7600         return reg_named_buff_firstkey(rx, flags);
7601     else if (flags & RXapif_NEXTKEY)
7602         return reg_named_buff_nextkey(rx, flags);
7603     else {
7604         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7605                                             (int)flags);
7606         return NULL;
7607     }
7608 }
7609
7610 SV*
7611 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7612                           const U32 flags)
7613 {
7614     AV *retarray = NULL;
7615     SV *ret;
7616     struct regexp *const rx = ReANY(r);
7617
7618     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7619
7620     if (flags & RXapif_ALL)
7621         retarray=newAV();
7622
7623     if (rx && RXp_PAREN_NAMES(rx)) {
7624         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7625         if (he_str) {
7626             IV i;
7627             SV* sv_dat=HeVAL(he_str);
7628             I32 *nums=(I32*)SvPVX(sv_dat);
7629             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7630                 if ((I32)(rx->nparens) >= nums[i]
7631                     && rx->offs[nums[i]].start != -1
7632                     && rx->offs[nums[i]].end != -1)
7633                 {
7634                     ret = newSVpvs("");
7635                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7636                     if (!retarray)
7637                         return ret;
7638                 } else {
7639                     if (retarray)
7640                         ret = newSVsv(&PL_sv_undef);
7641                 }
7642                 if (retarray)
7643                     av_push(retarray, ret);
7644             }
7645             if (retarray)
7646                 return newRV_noinc(MUTABLE_SV(retarray));
7647         }
7648     }
7649     return NULL;
7650 }
7651
7652 bool
7653 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7654                            const U32 flags)
7655 {
7656     struct regexp *const rx = ReANY(r);
7657
7658     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7659
7660     if (rx && RXp_PAREN_NAMES(rx)) {
7661         if (flags & RXapif_ALL) {
7662             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7663         } else {
7664             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7665             if (sv) {
7666                 SvREFCNT_dec_NN(sv);
7667                 return TRUE;
7668             } else {
7669                 return FALSE;
7670             }
7671         }
7672     } else {
7673         return FALSE;
7674     }
7675 }
7676
7677 SV*
7678 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7679 {
7680     struct regexp *const rx = ReANY(r);
7681
7682     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7683
7684     if ( rx && RXp_PAREN_NAMES(rx) ) {
7685         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7686
7687         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7688     } else {
7689         return FALSE;
7690     }
7691 }
7692
7693 SV*
7694 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7695 {
7696     struct regexp *const rx = ReANY(r);
7697     GET_RE_DEBUG_FLAGS_DECL;
7698
7699     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7700
7701     if (rx && RXp_PAREN_NAMES(rx)) {
7702         HV *hv = RXp_PAREN_NAMES(rx);
7703         HE *temphe;
7704         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7705             IV i;
7706             IV parno = 0;
7707             SV* sv_dat = HeVAL(temphe);
7708             I32 *nums = (I32*)SvPVX(sv_dat);
7709             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7710                 if ((I32)(rx->lastparen) >= nums[i] &&
7711                     rx->offs[nums[i]].start != -1 &&
7712                     rx->offs[nums[i]].end != -1)
7713                 {
7714                     parno = nums[i];
7715                     break;
7716                 }
7717             }
7718             if (parno || flags & RXapif_ALL) {
7719                 return newSVhek(HeKEY_hek(temphe));
7720             }
7721         }
7722     }
7723     return NULL;
7724 }
7725
7726 SV*
7727 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7728 {
7729     SV *ret;
7730     AV *av;
7731     SSize_t length;
7732     struct regexp *const rx = ReANY(r);
7733
7734     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7735
7736     if (rx && RXp_PAREN_NAMES(rx)) {
7737         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7738             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7739         } else if (flags & RXapif_ONE) {
7740             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7741             av = MUTABLE_AV(SvRV(ret));
7742             length = av_tindex(av);
7743             SvREFCNT_dec_NN(ret);
7744             return newSViv(length + 1);
7745         } else {
7746             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7747                                                 (int)flags);
7748             return NULL;
7749         }
7750     }
7751     return &PL_sv_undef;
7752 }
7753
7754 SV*
7755 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7756 {
7757     struct regexp *const rx = ReANY(r);
7758     AV *av = newAV();
7759
7760     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7761
7762     if (rx && RXp_PAREN_NAMES(rx)) {
7763         HV *hv= RXp_PAREN_NAMES(rx);
7764         HE *temphe;
7765         (void)hv_iterinit(hv);
7766         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7767             IV i;
7768             IV parno = 0;
7769             SV* sv_dat = HeVAL(temphe);
7770             I32 *nums = (I32*)SvPVX(sv_dat);
7771             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7772                 if ((I32)(rx->lastparen) >= nums[i] &&
7773                     rx->offs[nums[i]].start != -1 &&
7774                     rx->offs[nums[i]].end != -1)
7775                 {
7776                     parno = nums[i];
7777                     break;
7778                 }
7779             }
7780             if (parno || flags & RXapif_ALL) {
7781                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7782             }
7783         }
7784     }
7785
7786     return newRV_noinc(MUTABLE_SV(av));
7787 }
7788
7789 void
7790 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7791                              SV * const sv)
7792 {
7793     struct regexp *const rx = ReANY(r);
7794     char *s = NULL;
7795     SSize_t i = 0;
7796     SSize_t s1, t1;
7797     I32 n = paren;
7798
7799     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7800
7801     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7802            || n == RX_BUFF_IDX_CARET_FULLMATCH
7803            || n == RX_BUFF_IDX_CARET_POSTMATCH
7804        )
7805     {
7806         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7807         if (!keepcopy) {
7808             /* on something like
7809              *    $r = qr/.../;
7810              *    /$qr/p;
7811              * the KEEPCOPY is set on the PMOP rather than the regex */
7812             if (PL_curpm && r == PM_GETRE(PL_curpm))
7813                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7814         }
7815         if (!keepcopy)
7816             goto ret_undef;
7817     }
7818
7819     if (!rx->subbeg)
7820         goto ret_undef;
7821
7822     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7823         /* no need to distinguish between them any more */
7824         n = RX_BUFF_IDX_FULLMATCH;
7825
7826     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7827         && rx->offs[0].start != -1)
7828     {
7829         /* $`, ${^PREMATCH} */
7830         i = rx->offs[0].start;
7831         s = rx->subbeg;
7832     }
7833     else
7834     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7835         && rx->offs[0].end != -1)
7836     {
7837         /* $', ${^POSTMATCH} */
7838         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7839         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7840     }
7841     else
7842     if ( 0 <= n && n <= (I32)rx->nparens &&
7843         (s1 = rx->offs[n].start) != -1 &&
7844         (t1 = rx->offs[n].end) != -1)
7845     {
7846         /* $&, ${^MATCH},  $1 ... */
7847         i = t1 - s1;
7848         s = rx->subbeg + s1 - rx->suboffset;
7849     } else {
7850         goto ret_undef;
7851     }
7852
7853     assert(s >= rx->subbeg);
7854     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7855     if (i >= 0) {
7856 #ifdef NO_TAINT_SUPPORT
7857         sv_setpvn(sv, s, i);
7858 #else
7859         const int oldtainted = TAINT_get;
7860         TAINT_NOT;
7861         sv_setpvn(sv, s, i);
7862         TAINT_set(oldtainted);
7863 #endif
7864         if (RXp_MATCH_UTF8(rx))
7865             SvUTF8_on(sv);
7866         else
7867             SvUTF8_off(sv);
7868         if (TAINTING_get) {
7869             if (RXp_MATCH_TAINTED(rx)) {
7870                 if (SvTYPE(sv) >= SVt_PVMG) {
7871                     MAGIC* const mg = SvMAGIC(sv);
7872                     MAGIC* mgt;
7873                     TAINT;
7874                     SvMAGIC_set(sv, mg->mg_moremagic);
7875                     SvTAINT(sv);
7876                     if ((mgt = SvMAGIC(sv))) {
7877                         mg->mg_moremagic = mgt;
7878                         SvMAGIC_set(sv, mg);
7879                     }
7880                 } else {
7881                     TAINT;
7882                     SvTAINT(sv);
7883                 }
7884             } else
7885                 SvTAINTED_off(sv);
7886         }
7887     } else {
7888       ret_undef:
7889         sv_setsv(sv,&PL_sv_undef);
7890         return;
7891     }
7892 }
7893
7894 void
7895 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7896                                                          SV const * const value)
7897 {
7898     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7899
7900     PERL_UNUSED_ARG(rx);
7901     PERL_UNUSED_ARG(paren);
7902     PERL_UNUSED_ARG(value);
7903
7904     if (!PL_localizing)
7905         Perl_croak_no_modify();
7906 }
7907
7908 I32
7909 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7910                               const I32 paren)
7911 {
7912     struct regexp *const rx = ReANY(r);
7913     I32 i;
7914     I32 s1, t1;
7915
7916     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7917
7918     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7919         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7920         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7921     )
7922     {
7923         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7924         if (!keepcopy) {
7925             /* on something like
7926              *    $r = qr/.../;
7927              *    /$qr/p;
7928              * the KEEPCOPY is set on the PMOP rather than the regex */
7929             if (PL_curpm && r == PM_GETRE(PL_curpm))
7930                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7931         }
7932         if (!keepcopy)
7933             goto warn_undef;
7934     }
7935
7936     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7937     switch (paren) {
7938       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7939       case RX_BUFF_IDX_PREMATCH:       /* $` */
7940         if (rx->offs[0].start != -1) {
7941                         i = rx->offs[0].start;
7942                         if (i > 0) {
7943                                 s1 = 0;
7944                                 t1 = i;
7945                                 goto getlen;
7946                         }
7947             }
7948         return 0;
7949
7950       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7951       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7952             if (rx->offs[0].end != -1) {
7953                         i = rx->sublen - rx->offs[0].end;
7954                         if (i > 0) {
7955                                 s1 = rx->offs[0].end;
7956                                 t1 = rx->sublen;
7957                                 goto getlen;
7958                         }
7959             }
7960         return 0;
7961
7962       default: /* $& / ${^MATCH}, $1, $2, ... */
7963             if (paren <= (I32)rx->nparens &&
7964             (s1 = rx->offs[paren].start) != -1 &&
7965             (t1 = rx->offs[paren].end) != -1)
7966             {
7967             i = t1 - s1;
7968             goto getlen;
7969         } else {
7970           warn_undef:
7971             if (ckWARN(WARN_UNINITIALIZED))
7972                 report_uninit((const SV *)sv);
7973             return 0;
7974         }
7975     }
7976   getlen:
7977     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7978         const char * const s = rx->subbeg - rx->suboffset + s1;
7979         const U8 *ep;
7980         STRLEN el;
7981
7982         i = t1 - s1;
7983         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7984                         i = el;
7985     }
7986     return i;
7987 }
7988
7989 SV*
7990 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7991 {
7992     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7993         PERL_UNUSED_ARG(rx);
7994         if (0)
7995             return NULL;
7996         else
7997             return newSVpvs("Regexp");
7998 }
7999
8000 /* Scans the name of a named buffer from the pattern.
8001  * If flags is REG_RSN_RETURN_NULL returns null.
8002  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8003  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8004  * to the parsed name as looked up in the RExC_paren_names hash.
8005  * If there is an error throws a vFAIL().. type exception.
8006  */
8007
8008 #define REG_RSN_RETURN_NULL    0
8009 #define REG_RSN_RETURN_NAME    1
8010 #define REG_RSN_RETURN_DATA    2
8011
8012 STATIC SV*
8013 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8014 {
8015     char *name_start = RExC_parse;
8016
8017     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8018
8019     assert (RExC_parse <= RExC_end);
8020     if (RExC_parse == RExC_end) NOOP;
8021     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
8022          /* skip IDFIRST by using do...while */
8023         if (UTF)
8024             do {
8025                 RExC_parse += UTF8SKIP(RExC_parse);
8026             } while (isWORDCHAR_utf8((U8*)RExC_parse));
8027         else
8028             do {
8029                 RExC_parse++;
8030             } while (isWORDCHAR(*RExC_parse));
8031     } else {
8032         RExC_parse++; /* so the <- from the vFAIL is after the offending
8033                          character */
8034         vFAIL("Group name must start with a non-digit word character");
8035     }
8036     if ( flags ) {
8037         SV* sv_name
8038             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8039                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8040         if ( flags == REG_RSN_RETURN_NAME)
8041             return sv_name;
8042         else if (flags==REG_RSN_RETURN_DATA) {
8043             HE *he_str = NULL;
8044             SV *sv_dat = NULL;
8045             if ( ! sv_name )      /* should not happen*/
8046                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8047             if (RExC_paren_names)
8048                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8049             if ( he_str )
8050                 sv_dat = HeVAL(he_str);
8051             if ( ! sv_dat )
8052                 vFAIL("Reference to nonexistent named group");
8053             return sv_dat;
8054         }
8055         else {
8056             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8057                        (unsigned long) flags);
8058         }
8059         NOT_REACHED; /* NOTREACHED */
8060     }
8061     return NULL;
8062 }
8063
8064 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8065     int num;                                                    \
8066     if (RExC_lastparse!=RExC_parse) {                           \
8067         PerlIO_printf(Perl_debug_log, "%s",                     \
8068             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8069                 RExC_end - RExC_parse, 16,                      \
8070                 "", "",                                         \
8071                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8072                 PERL_PV_PRETTY_ELLIPSES   |                     \
8073                 PERL_PV_PRETTY_LTGT       |                     \
8074                 PERL_PV_ESCAPE_RE         |                     \
8075                 PERL_PV_PRETTY_EXACTSIZE                        \
8076             )                                                   \
8077         );                                                      \
8078     } else                                                      \
8079         PerlIO_printf(Perl_debug_log,"%16s","");                \
8080                                                                 \
8081     if (SIZE_ONLY)                                              \
8082        num = RExC_size + 1;                                     \
8083     else                                                        \
8084        num=REG_NODE_NUM(RExC_emit);                             \
8085     if (RExC_lastnum!=num)                                      \
8086        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
8087     else                                                        \
8088        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
8089     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
8090         (int)((depth*2)), "",                                   \
8091         (funcname)                                              \
8092     );                                                          \
8093     RExC_lastnum=num;                                           \
8094     RExC_lastparse=RExC_parse;                                  \
8095 })
8096
8097
8098
8099 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8100     DEBUG_PARSE_MSG((funcname));                            \
8101     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
8102 })
8103 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
8104     DEBUG_PARSE_MSG((funcname));                            \
8105     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
8106 })
8107
8108 /* This section of code defines the inversion list object and its methods.  The
8109  * interfaces are highly subject to change, so as much as possible is static to
8110  * this file.  An inversion list is here implemented as a malloc'd C UV array
8111  * as an SVt_INVLIST scalar.
8112  *
8113  * An inversion list for Unicode is an array of code points, sorted by ordinal
8114  * number.  The zeroth element is the first code point in the list.  The 1th
8115  * element is the first element beyond that not in the list.  In other words,
8116  * the first range is
8117  *  invlist[0]..(invlist[1]-1)
8118  * The other ranges follow.  Thus every element whose index is divisible by two
8119  * marks the beginning of a range that is in the list, and every element not
8120  * divisible by two marks the beginning of a range not in the list.  A single
8121  * element inversion list that contains the single code point N generally
8122  * consists of two elements
8123  *  invlist[0] == N
8124  *  invlist[1] == N+1
8125  * (The exception is when N is the highest representable value on the
8126  * machine, in which case the list containing just it would be a single
8127  * element, itself.  By extension, if the last range in the list extends to
8128  * infinity, then the first element of that range will be in the inversion list
8129  * at a position that is divisible by two, and is the final element in the
8130  * list.)
8131  * Taking the complement (inverting) an inversion list is quite simple, if the
8132  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8133  * This implementation reserves an element at the beginning of each inversion
8134  * list to always contain 0; there is an additional flag in the header which
8135  * indicates if the list begins at the 0, or is offset to begin at the next
8136  * element.
8137  *
8138  * More about inversion lists can be found in "Unicode Demystified"
8139  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8140  * More will be coming when functionality is added later.
8141  *
8142  * The inversion list data structure is currently implemented as an SV pointing
8143  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8144  * array of UV whose memory management is automatically handled by the existing
8145  * facilities for SV's.
8146  *
8147  * Some of the methods should always be private to the implementation, and some
8148  * should eventually be made public */
8149
8150 /* The header definitions are in F<invlist_inline.h> */
8151
8152 PERL_STATIC_INLINE UV*
8153 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8154 {
8155     /* Returns a pointer to the first element in the inversion list's array.
8156      * This is called upon initialization of an inversion list.  Where the
8157      * array begins depends on whether the list has the code point U+0000 in it
8158      * or not.  The other parameter tells it whether the code that follows this
8159      * call is about to put a 0 in the inversion list or not.  The first
8160      * element is either the element reserved for 0, if TRUE, or the element
8161      * after it, if FALSE */
8162
8163     bool* offset = get_invlist_offset_addr(invlist);
8164     UV* zero_addr = (UV *) SvPVX(invlist);
8165
8166     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8167
8168     /* Must be empty */
8169     assert(! _invlist_len(invlist));
8170
8171     *zero_addr = 0;
8172
8173     /* 1^1 = 0; 1^0 = 1 */
8174     *offset = 1 ^ will_have_0;
8175     return zero_addr + *offset;
8176 }
8177
8178 PERL_STATIC_INLINE void
8179 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8180 {
8181     /* Sets the current number of elements stored in the inversion list.
8182      * Updates SvCUR correspondingly */
8183     PERL_UNUSED_CONTEXT;
8184     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8185
8186     assert(SvTYPE(invlist) == SVt_INVLIST);
8187
8188     SvCUR_set(invlist,
8189               (len == 0)
8190                ? 0
8191                : TO_INTERNAL_SIZE(len + offset));
8192     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8193 }
8194
8195 #ifndef PERL_IN_XSUB_RE
8196
8197 PERL_STATIC_INLINE IV*
8198 S_get_invlist_previous_index_addr(SV* invlist)
8199 {
8200     /* Return the address of the IV that is reserved to hold the cached index
8201      * */
8202     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8203
8204     assert(SvTYPE(invlist) == SVt_INVLIST);
8205
8206     return &(((XINVLIST*) SvANY(invlist))->prev_index);
8207 }
8208
8209 PERL_STATIC_INLINE IV
8210 S_invlist_previous_index(SV* const invlist)
8211 {
8212     /* Returns cached index of previous search */
8213
8214     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8215
8216     return *get_invlist_previous_index_addr(invlist);
8217 }
8218
8219 PERL_STATIC_INLINE void
8220 S_invlist_set_previous_index(SV* const invlist, const IV index)
8221 {
8222     /* Caches <index> for later retrieval */
8223
8224     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8225
8226     assert(index == 0 || index < (int) _invlist_len(invlist));
8227
8228     *get_invlist_previous_index_addr(invlist) = index;
8229 }
8230
8231 PERL_STATIC_INLINE void
8232 S_invlist_trim(SV* const invlist)
8233 {
8234     PERL_ARGS_ASSERT_INVLIST_TRIM;
8235
8236     assert(SvTYPE(invlist) == SVt_INVLIST);
8237
8238     /* Change the length of the inversion list to how many entries it currently
8239      * has */
8240     SvPV_shrink_to_cur((SV *) invlist);
8241 }
8242
8243 PERL_STATIC_INLINE bool
8244 S_invlist_is_iterating(SV* const invlist)
8245 {
8246     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8247
8248     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8249 }
8250
8251 #endif /* ifndef PERL_IN_XSUB_RE */
8252
8253 PERL_STATIC_INLINE UV
8254 S_invlist_max(SV* const invlist)
8255 {
8256     /* Returns the maximum number of elements storable in the inversion list's
8257      * array, without having to realloc() */
8258
8259     PERL_ARGS_ASSERT_INVLIST_MAX;
8260
8261     assert(SvTYPE(invlist) == SVt_INVLIST);
8262
8263     /* Assumes worst case, in which the 0 element is not counted in the
8264      * inversion list, so subtracts 1 for that */
8265     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8266            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8267            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8268 }
8269
8270 #ifndef PERL_IN_XSUB_RE
8271 SV*
8272 Perl__new_invlist(pTHX_ IV initial_size)
8273 {
8274
8275     /* Return a pointer to a newly constructed inversion list, with enough
8276      * space to store 'initial_size' elements.  If that number is negative, a
8277      * system default is used instead */
8278
8279     SV* new_list;
8280
8281     if (initial_size < 0) {
8282         initial_size = 10;
8283     }
8284
8285     /* Allocate the initial space */
8286     new_list = newSV_type(SVt_INVLIST);
8287
8288     /* First 1 is in case the zero element isn't in the list; second 1 is for
8289      * trailing NUL */
8290     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8291     invlist_set_len(new_list, 0, 0);
8292
8293     /* Force iterinit() to be used to get iteration to work */
8294     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8295
8296     *get_invlist_previous_index_addr(new_list) = 0;
8297
8298     return new_list;
8299 }
8300
8301 SV*
8302 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8303 {
8304     /* Return a pointer to a newly constructed inversion list, initialized to
8305      * point to <list>, which has to be in the exact correct inversion list
8306      * form, including internal fields.  Thus this is a dangerous routine that
8307      * should not be used in the wrong hands.  The passed in 'list' contains
8308      * several header fields at the beginning that are not part of the
8309      * inversion list body proper */
8310
8311     const STRLEN length = (STRLEN) list[0];
8312     const UV version_id =          list[1];
8313     const bool offset   =    cBOOL(list[2]);
8314 #define HEADER_LENGTH 3
8315     /* If any of the above changes in any way, you must change HEADER_LENGTH
8316      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8317      *      perl -E 'say int(rand 2**31-1)'
8318      */
8319 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8320                                         data structure type, so that one being
8321                                         passed in can be validated to be an
8322                                         inversion list of the correct vintage.
8323                                        */
8324
8325     SV* invlist = newSV_type(SVt_INVLIST);
8326
8327     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8328
8329     if (version_id != INVLIST_VERSION_ID) {
8330         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8331     }
8332
8333     /* The generated array passed in includes header elements that aren't part
8334      * of the list proper, so start it just after them */
8335     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8336
8337     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8338                                shouldn't touch it */
8339
8340     *(get_invlist_offset_addr(invlist)) = offset;
8341
8342     /* The 'length' passed to us is the physical number of elements in the
8343      * inversion list.  But if there is an offset the logical number is one
8344      * less than that */
8345     invlist_set_len(invlist, length  - offset, offset);
8346
8347     invlist_set_previous_index(invlist, 0);
8348
8349     /* Initialize the iteration pointer. */
8350     invlist_iterfinish(invlist);
8351
8352     SvREADONLY_on(invlist);
8353
8354     return invlist;
8355 }
8356 #endif /* ifndef PERL_IN_XSUB_RE */
8357
8358 STATIC void
8359 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8360 {
8361     /* Grow the maximum size of an inversion list */
8362
8363     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8364
8365     assert(SvTYPE(invlist) == SVt_INVLIST);
8366
8367     /* Add one to account for the zero element at the beginning which may not
8368      * be counted by the calling parameters */
8369     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8370 }
8371
8372 STATIC void
8373 S__append_range_to_invlist(pTHX_ SV* const invlist,
8374                                  const UV start, const UV end)
8375 {
8376    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8377     * the end of the inversion list.  The range must be above any existing
8378     * ones. */
8379
8380     UV* array;
8381     UV max = invlist_max(invlist);
8382     UV len = _invlist_len(invlist);
8383     bool offset;
8384
8385     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8386
8387     if (len == 0) { /* Empty lists must be initialized */
8388         offset = start != 0;
8389         array = _invlist_array_init(invlist, ! offset);
8390     }
8391     else {
8392         /* Here, the existing list is non-empty. The current max entry in the
8393          * list is generally the first value not in the set, except when the
8394          * set extends to the end of permissible values, in which case it is
8395          * the first entry in that final set, and so this call is an attempt to
8396          * append out-of-order */
8397
8398         UV final_element = len - 1;
8399         array = invlist_array(invlist);
8400         if (array[final_element] > start
8401             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8402         {
8403             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8404                      array[final_element], start,
8405                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8406         }
8407
8408         /* Here, it is a legal append.  If the new range begins with the first
8409          * value not in the set, it is extending the set, so the new first
8410          * value not in the set is one greater than the newly extended range.
8411          * */
8412         offset = *get_invlist_offset_addr(invlist);
8413         if (array[final_element] == start) {
8414             if (end != UV_MAX) {
8415                 array[final_element] = end + 1;
8416             }
8417             else {
8418                 /* But if the end is the maximum representable on the machine,
8419                  * just let the range that this would extend to have no end */
8420                 invlist_set_len(invlist, len - 1, offset);
8421             }
8422             return;
8423         }
8424     }
8425
8426     /* Here the new range doesn't extend any existing set.  Add it */
8427
8428     len += 2;   /* Includes an element each for the start and end of range */
8429
8430     /* If wll overflow the existing space, extend, which may cause the array to
8431      * be moved */
8432     if (max < len) {
8433         invlist_extend(invlist, len);
8434
8435         /* Have to set len here to avoid assert failure in invlist_array() */
8436         invlist_set_len(invlist, len, offset);
8437
8438         array = invlist_array(invlist);
8439     }
8440     else {
8441         invlist_set_len(invlist, len, offset);
8442     }
8443
8444     /* The next item on the list starts the range, the one after that is
8445      * one past the new range.  */
8446     array[len - 2] = start;
8447     if (end != UV_MAX) {
8448         array[len - 1] = end + 1;
8449     }
8450     else {
8451         /* But if the end is the maximum representable on the machine, just let
8452          * the range have no end */
8453         invlist_set_len(invlist, len - 1, offset);
8454     }
8455 }
8456
8457 #ifndef PERL_IN_XSUB_RE
8458
8459 IV
8460 Perl__invlist_search(SV* const invlist, const UV cp)
8461 {
8462     /* Searches the inversion list for the entry that contains the input code
8463      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8464      * return value is the index into the list's array of the range that
8465      * contains <cp> */
8466
8467     IV low = 0;
8468     IV mid;
8469     IV high = _invlist_len(invlist);
8470     const IV highest_element = high - 1;
8471     const UV* array;
8472
8473     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8474
8475     /* If list is empty, return failure. */
8476     if (high == 0) {
8477         return -1;
8478     }
8479
8480     /* (We can't get the array unless we know the list is non-empty) */
8481     array = invlist_array(invlist);
8482
8483     mid = invlist_previous_index(invlist);
8484     assert(mid >=0 && mid <= highest_element);
8485
8486     /* <mid> contains the cache of the result of the previous call to this
8487      * function (0 the first time).  See if this call is for the same result,
8488      * or if it is for mid-1.  This is under the theory that calls to this
8489      * function will often be for related code points that are near each other.
8490      * And benchmarks show that caching gives better results.  We also test
8491      * here if the code point is within the bounds of the list.  These tests
8492      * replace others that would have had to be made anyway to make sure that
8493      * the array bounds were not exceeded, and these give us extra information
8494      * at the same time */
8495     if (cp >= array[mid]) {
8496         if (cp >= array[highest_element]) {
8497             return highest_element;
8498         }
8499
8500         /* Here, array[mid] <= cp < array[highest_element].  This means that
8501          * the final element is not the answer, so can exclude it; it also
8502          * means that <mid> is not the final element, so can refer to 'mid + 1'
8503          * safely */
8504         if (cp < array[mid + 1]) {
8505             return mid;
8506         }
8507         high--;
8508         low = mid + 1;
8509     }
8510     else { /* cp < aray[mid] */
8511         if (cp < array[0]) { /* Fail if outside the array */
8512             return -1;
8513         }
8514         high = mid;
8515         if (cp >= array[mid - 1]) {
8516             goto found_entry;
8517         }
8518     }
8519
8520     /* Binary search.  What we are looking for is <i> such that
8521      *  array[i] <= cp < array[i+1]
8522      * The loop below converges on the i+1.  Note that there may not be an
8523      * (i+1)th element in the array, and things work nonetheless */
8524     while (low < high) {
8525         mid = (low + high) / 2;
8526         assert(mid <= highest_element);
8527         if (array[mid] <= cp) { /* cp >= array[mid] */
8528             low = mid + 1;
8529
8530             /* We could do this extra test to exit the loop early.
8531             if (cp < array[low]) {
8532                 return mid;
8533             }
8534             */
8535         }
8536         else { /* cp < array[mid] */
8537             high = mid;
8538         }
8539     }
8540
8541   found_entry:
8542     high--;
8543     invlist_set_previous_index(invlist, high);
8544     return high;
8545 }
8546
8547 void
8548 Perl__invlist_populate_swatch(SV* const invlist,
8549                               const UV start, const UV end, U8* swatch)
8550 {
8551     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8552      * but is used when the swash has an inversion list.  This makes this much
8553      * faster, as it uses a binary search instead of a linear one.  This is
8554      * intimately tied to that function, and perhaps should be in utf8.c,
8555      * except it is intimately tied to inversion lists as well.  It assumes
8556      * that <swatch> is all 0's on input */
8557
8558     UV current = start;
8559     const IV len = _invlist_len(invlist);
8560     IV i;
8561     const UV * array;
8562
8563     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8564
8565     if (len == 0) { /* Empty inversion list */
8566         return;
8567     }
8568
8569     array = invlist_array(invlist);
8570
8571     /* Find which element it is */
8572     i = _invlist_search(invlist, start);
8573
8574     /* We populate from <start> to <end> */
8575     while (current < end) {
8576         UV upper;
8577
8578         /* The inversion list gives the results for every possible code point
8579          * after the first one in the list.  Only those ranges whose index is
8580          * even are ones that the inversion list matches.  For the odd ones,
8581          * and if the initial code point is not in the list, we have to skip
8582          * forward to the next element */
8583         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8584             i++;
8585             if (i >= len) { /* Finished if beyond the end of the array */
8586                 return;
8587             }
8588             current = array[i];
8589             if (current >= end) {   /* Finished if beyond the end of what we
8590                                        are populating */
8591                 if (LIKELY(end < UV_MAX)) {
8592                     return;
8593                 }
8594
8595                 /* We get here when the upper bound is the maximum
8596                  * representable on the machine, and we are looking for just
8597                  * that code point.  Have to special case it */
8598                 i = len;
8599                 goto join_end_of_list;
8600             }
8601         }
8602         assert(current >= start);
8603
8604         /* The current range ends one below the next one, except don't go past
8605          * <end> */
8606         i++;
8607         upper = (i < len && array[i] < end) ? array[i] : end;
8608
8609         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8610          * for each code point in it */
8611         for (; current < upper; current++) {
8612             const STRLEN offset = (STRLEN)(current - start);
8613             swatch[offset >> 3] |= 1 << (offset & 7);
8614         }
8615
8616       join_end_of_list:
8617
8618         /* Quit if at the end of the list */
8619         if (i >= len) {
8620
8621             /* But first, have to deal with the highest possible code point on
8622              * the platform.  The previous code assumes that <end> is one
8623              * beyond where we want to populate, but that is impossible at the
8624              * platform's infinity, so have to handle it specially */
8625             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8626             {
8627                 const STRLEN offset = (STRLEN)(end - start);
8628                 swatch[offset >> 3] |= 1 << (offset & 7);
8629             }
8630             return;
8631         }
8632
8633         /* Advance to the next range, which will be for code points not in the
8634          * inversion list */
8635         current = array[i];
8636     }
8637
8638     return;
8639 }
8640
8641 void
8642 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8643                                          const bool complement_b, SV** output)
8644 {
8645     /* Take the union of two inversion lists and point <output> to it.  *output
8646      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8647      * the reference count to that list will be decremented if not already a
8648      * temporary (mortal); otherwise *output will be made correspondingly
8649      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8650      * second list is returned.  If <complement_b> is TRUE, the union is taken
8651      * of the complement (inversion) of <b> instead of b itself.
8652      *
8653      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8654      * Richard Gillam, published by Addison-Wesley, and explained at some
8655      * length there.  The preface says to incorporate its examples into your
8656      * code at your own risk.
8657      *
8658      * The algorithm is like a merge sort.
8659      *
8660      * XXX A potential performance improvement is to keep track as we go along
8661      * if only one of the inputs contributes to the result, meaning the other
8662      * is a subset of that one.  In that case, we can skip the final copy and
8663      * return the larger of the input lists, but then outside code might need
8664      * to keep track of whether to free the input list or not */
8665
8666     const UV* array_a;    /* a's array */
8667     const UV* array_b;
8668     UV len_a;       /* length of a's array */
8669     UV len_b;
8670
8671     SV* u;                      /* the resulting union */
8672     UV* array_u;
8673     UV len_u;
8674
8675     UV i_a = 0;             /* current index into a's array */
8676     UV i_b = 0;
8677     UV i_u = 0;
8678
8679     /* running count, as explained in the algorithm source book; items are
8680      * stopped accumulating and are output when the count changes to/from 0.
8681      * The count is incremented when we start a range that's in the set, and
8682      * decremented when we start a range that's not in the set.  So its range
8683      * is 0 to 2.  Only when the count is zero is something not in the set.
8684      */
8685     UV count = 0;
8686
8687     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8688     assert(a != b);
8689
8690     /* If either one is empty, the union is the other one */
8691     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8692         bool make_temp = FALSE; /* Should we mortalize the result? */
8693
8694         if (*output == a) {
8695             if (a != NULL) {
8696                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8697                     SvREFCNT_dec_NN(a);
8698                 }
8699             }
8700         }
8701         if (*output != b) {
8702             *output = invlist_clone(b);
8703             if (complement_b) {
8704                 _invlist_invert(*output);
8705             }
8706         } /* else *output already = b; */
8707
8708         if (make_temp) {
8709             sv_2mortal(*output);
8710         }
8711         return;
8712     }
8713     else if ((len_b = _invlist_len(b)) == 0) {
8714         bool make_temp = FALSE;
8715         if (*output == b) {
8716             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8717                 SvREFCNT_dec_NN(b);
8718             }
8719         }
8720
8721         /* The complement of an empty list is a list that has everything in it,
8722          * so the union with <a> includes everything too */
8723         if (complement_b) {
8724             if (a == *output) {
8725                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8726                     SvREFCNT_dec_NN(a);
8727                 }
8728             }
8729             *output = _new_invlist(1);
8730             _append_range_to_invlist(*output, 0, UV_MAX);
8731         }
8732         else if (*output != a) {
8733             *output = invlist_clone(a);
8734         }
8735         /* else *output already = a; */
8736
8737         if (make_temp) {
8738             sv_2mortal(*output);
8739         }
8740         return;
8741     }
8742
8743     /* Here both lists exist and are non-empty */
8744     array_a = invlist_array(a);
8745     array_b = invlist_array(b);
8746
8747     /* If are to take the union of 'a' with the complement of b, set it
8748      * up so are looking at b's complement. */
8749     if (complement_b) {
8750
8751         /* To complement, we invert: if the first element is 0, remove it.  To
8752          * do this, we just pretend the array starts one later */
8753         if (array_b[0] == 0) {
8754             array_b++;
8755             len_b--;
8756         }
8757         else {
8758
8759             /* But if the first element is not zero, we pretend the list starts
8760              * at the 0 that is always stored immediately before the array. */
8761             array_b--;
8762             len_b++;
8763         }
8764     }
8765
8766     /* Size the union for the worst case: that the sets are completely
8767      * disjoint */
8768     u = _new_invlist(len_a + len_b);
8769
8770     /* Will contain U+0000 if either component does */
8771     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8772                                       || (len_b > 0 && array_b[0] == 0));
8773
8774     /* Go through each list item by item, stopping when exhausted one of
8775      * them */
8776     while (i_a < len_a && i_b < len_b) {
8777         UV cp;      /* The element to potentially add to the union's array */
8778         bool cp_in_set;   /* is it in the the input list's set or not */
8779
8780         /* We need to take one or the other of the two inputs for the union.
8781          * Since we are merging two sorted lists, we take the smaller of the
8782          * next items.  In case of a tie, we take the one that is in its set
8783          * first.  If we took one not in the set first, it would decrement the
8784          * count, possibly to 0 which would cause it to be output as ending the
8785          * range, and the next time through we would take the same number, and
8786          * output it again as beginning the next range.  By doing it the
8787          * opposite way, there is no possibility that the count will be
8788          * momentarily decremented to 0, and thus the two adjoining ranges will
8789          * be seamlessly merged.  (In a tie and both are in the set or both not
8790          * in the set, it doesn't matter which we take first.) */
8791         if (array_a[i_a] < array_b[i_b]
8792             || (array_a[i_a] == array_b[i_b]
8793                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8794         {
8795             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8796             cp= array_a[i_a++];
8797         }
8798         else {
8799             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8800             cp = array_b[i_b++];
8801         }
8802
8803         /* Here, have chosen which of the two inputs to look at.  Only output
8804          * if the running count changes to/from 0, which marks the
8805          * beginning/end of a range in that's in the set */
8806         if (cp_in_set) {
8807             if (count == 0) {
8808                 array_u[i_u++] = cp;
8809             }
8810             count++;
8811         }
8812         else {
8813             count--;
8814             if (count == 0) {
8815                 array_u[i_u++] = cp;
8816             }
8817         }
8818     }
8819
8820     /* Here, we are finished going through at least one of the lists, which
8821      * means there is something remaining in at most one.  We check if the list
8822      * that hasn't been exhausted is positioned such that we are in the middle
8823      * of a range in its set or not.  (i_a and i_b point to the element beyond
8824      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8825      * is potentially more to output.
8826      * There are four cases:
8827      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8828      *     in the union is entirely from the non-exhausted set.
8829      *  2) Both were in their sets, count is 2.  Nothing further should
8830      *     be output, as everything that remains will be in the exhausted
8831      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8832      *     that
8833      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8834      *     Nothing further should be output because the union includes
8835      *     everything from the exhausted set.  Not decrementing ensures that.
8836      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8837      *     decrementing to 0 insures that we look at the remainder of the
8838      *     non-exhausted set */
8839     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8840         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8841     {
8842         count--;
8843     }
8844
8845     /* The final length is what we've output so far, plus what else is about to
8846      * be output.  (If 'count' is non-zero, then the input list we exhausted
8847      * has everything remaining up to the machine's limit in its set, and hence
8848      * in the union, so there will be no further output. */
8849     len_u = i_u;
8850     if (count == 0) {
8851         /* At most one of the subexpressions will be non-zero */
8852         len_u += (len_a - i_a) + (len_b - i_b);
8853     }
8854
8855     /* Set result to final length, which can change the pointer to array_u, so
8856      * re-find it */
8857     if (len_u != _invlist_len(u)) {
8858         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8859         invlist_trim(u);
8860         array_u = invlist_array(u);
8861     }
8862
8863     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8864      * the other) ended with everything above it not in its set.  That means
8865      * that the remaining part of the union is precisely the same as the
8866      * non-exhausted list, so can just copy it unchanged.  (If both list were
8867      * exhausted at the same time, then the operations below will be both 0.)
8868      */
8869     if (count == 0) {
8870         IV copy_count; /* At most one will have a non-zero copy count */
8871         if ((copy_count = len_a - i_a) > 0) {
8872             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8873         }
8874         else if ((copy_count = len_b - i_b) > 0) {
8875             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8876         }
8877     }
8878
8879     /*  We may be removing a reference to one of the inputs.  If so, the output
8880      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8881      *  count decremented) */
8882     if (a == *output || b == *output) {
8883         assert(! invlist_is_iterating(*output));
8884         if ((SvTEMP(*output))) {
8885             sv_2mortal(u);
8886         }
8887         else {
8888             SvREFCNT_dec_NN(*output);
8889         }
8890     }
8891
8892     *output = u;
8893
8894     return;
8895 }
8896
8897 void
8898 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8899                                                const bool complement_b, SV** i)
8900 {
8901     /* Take the intersection of two inversion lists and point <i> to it.  *i
8902      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8903      * the reference count to that list will be decremented if not already a
8904      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8905      * The first list, <a>, may be NULL, in which case an empty list is
8906      * returned.  If <complement_b> is TRUE, the result will be the
8907      * intersection of <a> and the complement (or inversion) of <b> instead of
8908      * <b> directly.
8909      *
8910      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8911      * Richard Gillam, published by Addison-Wesley, and explained at some
8912      * length there.  The preface says to incorporate its examples into your
8913      * code at your own risk.  In fact, it had bugs
8914      *
8915      * The algorithm is like a merge sort, and is essentially the same as the
8916      * union above
8917      */
8918
8919     const UV* array_a;          /* a's array */
8920     const UV* array_b;
8921     UV len_a;   /* length of a's array */
8922     UV len_b;
8923
8924     SV* r;                   /* the resulting intersection */
8925     UV* array_r;
8926     UV len_r;
8927
8928     UV i_a = 0;             /* current index into a's array */
8929     UV i_b = 0;
8930     UV i_r = 0;
8931
8932     /* running count, as explained in the algorithm source book; items are
8933      * stopped accumulating and are output when the count changes to/from 2.
8934      * The count is incremented when we start a range that's in the set, and
8935      * decremented when we start a range that's not in the set.  So its range
8936      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8937      */
8938     UV count = 0;
8939
8940     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8941     assert(a != b);
8942
8943     /* Special case if either one is empty */
8944     len_a = (a == NULL) ? 0 : _invlist_len(a);
8945     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8946         bool make_temp = FALSE;
8947
8948         if (len_a != 0 && complement_b) {
8949
8950             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8951              * be empty.  Here, also we are using 'b's complement, which hence
8952              * must be every possible code point.  Thus the intersection is
8953              * simply 'a'. */
8954             if (*i != a) {
8955                 if (*i == b) {
8956                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8957                         SvREFCNT_dec_NN(b);
8958                     }
8959                 }
8960
8961                 *i = invlist_clone(a);
8962             }
8963             /* else *i is already 'a' */
8964
8965             if (make_temp) {
8966                 sv_2mortal(*i);
8967             }
8968             return;
8969         }
8970
8971         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8972          * intersection must be empty */
8973         if (*i == a) {
8974             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8975                 SvREFCNT_dec_NN(a);
8976             }
8977         }
8978         else if (*i == b) {
8979             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8980                 SvREFCNT_dec_NN(b);
8981             }
8982         }
8983         *i = _new_invlist(0);
8984         if (make_temp) {
8985             sv_2mortal(*i);
8986         }
8987
8988         return;
8989     }
8990
8991     /* Here both lists exist and are non-empty */
8992     array_a = invlist_array(a);
8993     array_b = invlist_array(b);
8994
8995     /* If are to take the intersection of 'a' with the complement of b, set it
8996      * up so are looking at b's complement. */
8997     if (complement_b) {
8998
8999         /* To complement, we invert: if the first element is 0, remove it.  To
9000          * do this, we just pretend the array starts one later */
9001         if (array_b[0] == 0) {
9002             array_b++;
9003             len_b--;
9004         }
9005         else {
9006
9007             /* But if the first element is not zero, we pretend the list starts
9008              * at the 0 that is always stored immediately before the array. */
9009             array_b--;
9010             len_b++;
9011         }
9012     }
9013
9014     /* Size the intersection for the worst case: that the intersection ends up
9015      * fragmenting everything to be completely disjoint */
9016     r= _new_invlist(len_a + len_b);
9017
9018     /* Will contain U+0000 iff both components do */
9019     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9020                                      && len_b > 0 && array_b[0] == 0);
9021
9022     /* Go through each list item by item, stopping when exhausted one of
9023      * them */
9024     while (i_a < len_a && i_b < len_b) {
9025         UV cp;      /* The element to potentially add to the intersection's
9026                        array */
9027         bool cp_in_set; /* Is it in the input list's set or not */
9028
9029         /* We need to take one or the other of the two inputs for the
9030          * intersection.  Since we are merging two sorted lists, we take the
9031          * smaller of the next items.  In case of a tie, we take the one that
9032          * is not in its set first (a difference from the union algorithm).  If
9033          * we took one in the set first, it would increment the count, possibly
9034          * to 2 which would cause it to be output as starting a range in the
9035          * intersection, and the next time through we would take that same
9036          * number, and output it again as ending the set.  By doing it the
9037          * opposite of this, there is no possibility that the count will be
9038          * momentarily incremented to 2.  (In a tie and both are in the set or
9039          * both not in the set, it doesn't matter which we take first.) */
9040         if (array_a[i_a] < array_b[i_b]
9041             || (array_a[i_a] == array_b[i_b]
9042                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9043         {
9044             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9045             cp= array_a[i_a++];
9046         }
9047         else {
9048             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9049             cp= array_b[i_b++];
9050         }
9051
9052         /* Here, have chosen which of the two inputs to look at.  Only output
9053          * if the running count changes to/from 2, which marks the
9054          * beginning/end of a range that's in the intersection */
9055         if (cp_in_set) {
9056             count++;
9057             if (count == 2) {
9058                 array_r[i_r++] = cp;
9059             }
9060         }
9061         else {
9062             if (count == 2) {
9063                 array_r[i_r++] = cp;
9064             }
9065             count--;
9066         }
9067     }
9068
9069     /* Here, we are finished going through at least one of the lists, which
9070      * means there is something remaining in at most one.  We check if the list
9071      * that has been exhausted is positioned such that we are in the middle
9072      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
9073      * the ones we care about.)  There are four cases:
9074      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
9075      *     nothing left in the intersection.
9076      *  2) Both were in their sets, count is 2 and perhaps is incremented to
9077      *     above 2.  What should be output is exactly that which is in the
9078      *     non-exhausted set, as everything it has is also in the intersection
9079      *     set, and everything it doesn't have can't be in the intersection
9080      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
9081      *     gets incremented to 2.  Like the previous case, the intersection is
9082      *     everything that remains in the non-exhausted set.
9083      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
9084      *     remains 1.  And the intersection has nothing more. */
9085     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9086         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9087     {
9088         count++;
9089     }
9090
9091     /* The final length is what we've output so far plus what else is in the
9092      * intersection.  At most one of the subexpressions below will be non-zero
9093      * */
9094     len_r = i_r;
9095     if (count >= 2) {
9096         len_r += (len_a - i_a) + (len_b - i_b);
9097     }
9098
9099     /* Set result to final length, which can change the pointer to array_r, so
9100      * re-find it */
9101     if (len_r != _invlist_len(r)) {
9102         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9103         invlist_trim(r);
9104         array_r = invlist_array(r);
9105     }
9106
9107     /* Finish outputting any remaining */
9108     if (count >= 2) { /* At most one will have a non-zero copy count */
9109         IV copy_count;
9110         if ((copy_count = len_a - i_a) > 0) {
9111             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9112         }
9113         else if ((copy_count = len_b - i_b) > 0) {
9114             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9115         }
9116     }
9117
9118     /*  We may be removing a reference to one of the inputs.  If so, the output
9119      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
9120      *  count decremented) */
9121     if (a == *i || b == *i) {
9122         assert(! invlist_is_iterating(*i));
9123         if (SvTEMP(*i)) {
9124             sv_2mortal(r);
9125         }
9126         else {
9127             SvREFCNT_dec_NN(*i);
9128         }
9129     }
9130
9131     *i = r;
9132
9133     return;
9134 }
9135
9136 SV*
9137 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9138 {
9139     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9140      * set.  A pointer to the inversion list is returned.  This may actually be
9141      * a new list, in which case the passed in one has been destroyed.  The
9142      * passed-in inversion list can be NULL, in which case a new one is created
9143      * with just the one range in it */
9144
9145     SV* range_invlist;
9146     UV len;
9147
9148     if (invlist == NULL) {
9149         invlist = _new_invlist(2);
9150         len = 0;
9151     }
9152     else {
9153         len = _invlist_len(invlist);
9154     }
9155
9156     /* If comes after the final entry actually in the list, can just append it
9157      * to the end, */
9158     if (len == 0
9159         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9160             && start >= invlist_array(invlist)[len - 1]))
9161     {
9162         _append_range_to_invlist(invlist, start, end);
9163         return invlist;
9164     }
9165
9166     /* Here, can't just append things, create and return a new inversion list
9167      * which is the union of this range and the existing inversion list.  (If
9168      * the new range is well-behaved wrt to the old one, we could just insert
9169      * it, doing a Move() down on the tail of the old one (potentially growing
9170      * it first).  But to determine that means we would have the extra
9171      * (possibly throw-away) work of first finding where the new one goes and
9172      * whether it disrupts (splits) an existing range, so it doesn't appear to
9173      * me (khw) that it's worth it) */
9174     range_invlist = _new_invlist(2);
9175     _append_range_to_invlist(range_invlist, start, end);
9176
9177     _invlist_union(invlist, range_invlist, &invlist);
9178
9179     /* The temporary can be freed */
9180     SvREFCNT_dec_NN(range_invlist);
9181
9182     return invlist;
9183 }
9184
9185 SV*
9186 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9187                                  UV** other_elements_ptr)
9188 {
9189     /* Create and return an inversion list whose contents are to be populated
9190      * by the caller.  The caller gives the number of elements (in 'size') and
9191      * the very first element ('element0').  This function will set
9192      * '*other_elements_ptr' to an array of UVs, where the remaining elements
9193      * are to be placed.
9194      *
9195      * Obviously there is some trust involved that the caller will properly
9196      * fill in the other elements of the array.
9197      *
9198      * (The first element needs to be passed in, as the underlying code does
9199      * things differently depending on whether it is zero or non-zero) */
9200
9201     SV* invlist = _new_invlist(size);
9202     bool offset;
9203
9204     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9205
9206     _append_range_to_invlist(invlist, element0, element0);
9207     offset = *get_invlist_offset_addr(invlist);
9208
9209     invlist_set_len(invlist, size, offset);
9210     *other_elements_ptr = invlist_array(invlist) + 1;
9211     return invlist;
9212 }
9213
9214 #endif
9215
9216 PERL_STATIC_INLINE SV*
9217 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9218     return _add_range_to_invlist(invlist, cp, cp);
9219 }
9220
9221 #ifndef PERL_IN_XSUB_RE
9222 void
9223 Perl__invlist_invert(pTHX_ SV* const invlist)
9224 {
9225     /* Complement the input inversion list.  This adds a 0 if the list didn't
9226      * have a zero; removes it otherwise.  As described above, the data
9227      * structure is set up so that this is very efficient */
9228
9229     PERL_ARGS_ASSERT__INVLIST_INVERT;
9230
9231     assert(! invlist_is_iterating(invlist));
9232
9233     /* The inverse of matching nothing is matching everything */
9234     if (_invlist_len(invlist) == 0) {
9235         _append_range_to_invlist(invlist, 0, UV_MAX);
9236         return;
9237     }
9238
9239     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9240 }
9241
9242 #endif
9243
9244 PERL_STATIC_INLINE SV*
9245 S_invlist_clone(pTHX_ SV* const invlist)
9246 {
9247
9248     /* Return a new inversion list that is a copy of the input one, which is
9249      * unchanged.  The new list will not be mortal even if the old one was. */
9250
9251     /* Need to allocate extra space to accommodate Perl's addition of a
9252      * trailing NUL to SvPV's, since it thinks they are always strings */
9253     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9254     STRLEN physical_length = SvCUR(invlist);
9255     bool offset = *(get_invlist_offset_addr(invlist));
9256
9257     PERL_ARGS_ASSERT_INVLIST_CLONE;
9258
9259     *(get_invlist_offset_addr(new_invlist)) = offset;
9260     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9261     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9262
9263     return new_invlist;
9264 }
9265
9266 PERL_STATIC_INLINE STRLEN*
9267 S_get_invlist_iter_addr(SV* invlist)
9268 {
9269     /* Return the address of the UV that contains the current iteration
9270      * position */
9271
9272     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9273
9274     assert(SvTYPE(invlist) == SVt_INVLIST);
9275
9276     return &(((XINVLIST*) SvANY(invlist))->iterator);
9277 }
9278
9279 PERL_STATIC_INLINE void
9280 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9281 {
9282     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9283
9284     *get_invlist_iter_addr(invlist) = 0;
9285 }
9286
9287 PERL_STATIC_INLINE void
9288 S_invlist_iterfinish(SV* invlist)
9289 {
9290     /* Terminate iterator for invlist.  This is to catch development errors.
9291      * Any iteration that is interrupted before completed should call this
9292      * function.  Functions that add code points anywhere else but to the end
9293      * of an inversion list assert that they are not in the middle of an
9294      * iteration.  If they were, the addition would make the iteration
9295      * problematical: if the iteration hadn't reached the place where things
9296      * were being added, it would be ok */
9297
9298     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9299
9300     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9301 }
9302
9303 STATIC bool
9304 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9305 {
9306     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9307      * This call sets in <*start> and <*end>, the next range in <invlist>.
9308      * Returns <TRUE> if successful and the next call will return the next
9309      * range; <FALSE> if was already at the end of the list.  If the latter,
9310      * <*start> and <*end> are unchanged, and the next call to this function
9311      * will start over at the beginning of the list */
9312
9313     STRLEN* pos = get_invlist_iter_addr(invlist);
9314     UV len = _invlist_len(invlist);
9315     UV *array;
9316
9317     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9318
9319     if (*pos >= len) {
9320         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9321         return FALSE;
9322     }
9323
9324     array = invlist_array(invlist);
9325
9326     *start = array[(*pos)++];
9327
9328     if (*pos >= len) {
9329         *end = UV_MAX;
9330     }
9331     else {
9332         *end = array[(*pos)++] - 1;
9333     }
9334
9335     return TRUE;
9336 }
9337
9338 PERL_STATIC_INLINE UV
9339 S_invlist_highest(SV* const invlist)
9340 {
9341     /* Returns the highest code point that matches an inversion list.  This API
9342      * has an ambiguity, as it returns 0 under either the highest is actually
9343      * 0, or if the list is empty.  If this distinction matters to you, check
9344      * for emptiness before calling this function */
9345
9346     UV len = _invlist_len(invlist);
9347     UV *array;
9348
9349     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9350
9351     if (len == 0) {
9352         return 0;
9353     }
9354
9355     array = invlist_array(invlist);
9356
9357     /* The last element in the array in the inversion list always starts a
9358      * range that goes to infinity.  That range may be for code points that are
9359      * matched in the inversion list, or it may be for ones that aren't
9360      * matched.  In the latter case, the highest code point in the set is one
9361      * less than the beginning of this range; otherwise it is the final element
9362      * of this range: infinity */
9363     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9364            ? UV_MAX
9365            : array[len - 1] - 1;
9366 }
9367
9368 #ifndef PERL_IN_XSUB_RE
9369 SV *
9370 Perl__invlist_contents(pTHX_ SV* const invlist)
9371 {
9372     /* Get the contents of an inversion list into a string SV so that they can
9373      * be printed out.  It uses the format traditionally done for debug tracing
9374      */
9375
9376     UV start, end;
9377     SV* output = newSVpvs("\n");
9378
9379     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9380
9381     assert(! invlist_is_iterating(invlist));
9382
9383     invlist_iterinit(invlist);
9384     while (invlist_iternext(invlist, &start, &end)) {
9385         if (end == UV_MAX) {
9386             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9387         }
9388         else if (end != start) {
9389             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9390                     start,       end);
9391         }
9392         else {
9393             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9394         }
9395     }
9396
9397     return output;
9398 }
9399 #endif
9400
9401 #ifndef PERL_IN_XSUB_RE
9402 void
9403 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9404                          const char * const indent, SV* const invlist)
9405 {
9406     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9407      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9408      * the string 'indent'.  The output looks like this:
9409          [0] 0x000A .. 0x000D
9410          [2] 0x0085
9411          [4] 0x2028 .. 0x2029
9412          [6] 0x3104 .. INFINITY
9413      * This means that the first range of code points matched by the list are
9414      * 0xA through 0xD; the second range contains only the single code point
9415      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9416      * are used to define each range (except if the final range extends to
9417      * infinity, only a single element is needed).  The array index of the
9418      * first element for the corresponding range is given in brackets. */
9419
9420     UV start, end;
9421     STRLEN count = 0;
9422
9423     PERL_ARGS_ASSERT__INVLIST_DUMP;
9424
9425     if (invlist_is_iterating(invlist)) {
9426         Perl_dump_indent(aTHX_ level, file,
9427              "%sCan't dump inversion list because is in middle of iterating\n",
9428              indent);
9429         return;
9430     }
9431
9432     invlist_iterinit(invlist);
9433     while (invlist_iternext(invlist, &start, &end)) {
9434         if (end == UV_MAX) {
9435             Perl_dump_indent(aTHX_ level, file,
9436                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9437                                    indent, (UV)count, start);
9438         }
9439         else if (end != start) {
9440             Perl_dump_indent(aTHX_ level, file,
9441                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9442                                 indent, (UV)count, start,         end);
9443         }
9444         else {
9445             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9446                                             indent, (UV)count, start);
9447         }
9448         count += 2;
9449     }
9450 }
9451
9452 void
9453 Perl__load_PL_utf8_foldclosures (pTHX)
9454 {
9455     assert(! PL_utf8_foldclosures);
9456
9457     /* If the folds haven't been read in, call a fold function
9458      * to force that */
9459     if (! PL_utf8_tofold) {
9460         U8 dummy[UTF8_MAXBYTES_CASE+1];
9461
9462         /* This string is just a short named one above \xff */
9463         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9464         assert(PL_utf8_tofold); /* Verify that worked */
9465     }
9466     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9467 }
9468 #endif
9469
9470 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
9471 bool
9472 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9473 {
9474     /* Return a boolean as to if the two passed in inversion lists are
9475      * identical.  The final argument, if TRUE, says to take the complement of
9476      * the second inversion list before doing the comparison */
9477
9478     const UV* array_a = invlist_array(a);
9479     const UV* array_b = invlist_array(b);
9480     UV len_a = _invlist_len(a);
9481     UV len_b = _invlist_len(b);
9482
9483     UV i = 0;               /* current index into the arrays */
9484     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9485
9486     PERL_ARGS_ASSERT__INVLISTEQ;
9487
9488     /* If are to compare 'a' with the complement of b, set it
9489      * up so are looking at b's complement. */
9490     if (complement_b) {
9491
9492         /* The complement of nothing is everything, so <a> would have to have
9493          * just one element, starting at zero (ending at infinity) */
9494         if (len_b == 0) {
9495             return (len_a == 1 && array_a[0] == 0);
9496         }
9497         else if (array_b[0] == 0) {
9498
9499             /* Otherwise, to complement, we invert.  Here, the first element is
9500              * 0, just remove it.  To do this, we just pretend the array starts
9501              * one later */
9502
9503             array_b++;
9504             len_b--;
9505         }
9506         else {
9507
9508             /* But if the first element is not zero, we pretend the list starts
9509              * at the 0 that is always stored immediately before the array. */
9510             array_b--;
9511             len_b++;
9512         }
9513     }
9514
9515     /* Make sure that the lengths are the same, as well as the final element
9516      * before looping through the remainder.  (Thus we test the length, final,
9517      * and first elements right off the bat) */
9518     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9519         retval = FALSE;
9520     }
9521     else for (i = 0; i < len_a - 1; i++) {
9522         if (array_a[i] != array_b[i]) {
9523             retval = FALSE;
9524             break;
9525         }
9526     }
9527
9528     return retval;
9529 }
9530 #endif
9531
9532 /*
9533  * As best we can, determine the characters that can match the start of
9534  * the given EXACTF-ish node.
9535  *
9536  * Returns the invlist as a new SV*; it is the caller's responsibility to
9537  * call SvREFCNT_dec() when done with it.
9538  */
9539 STATIC SV*
9540 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9541 {
9542     const U8 * s = (U8*)STRING(node);
9543     SSize_t bytelen = STR_LEN(node);
9544     UV uc;
9545     /* Start out big enough for 2 separate code points */
9546     SV* invlist = _new_invlist(4);
9547
9548     PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9549
9550     if (! UTF) {
9551         uc = *s;
9552
9553         /* We punt and assume can match anything if the node begins
9554          * with a multi-character fold.  Things are complicated.  For
9555          * example, /ffi/i could match any of:
9556          *  "\N{LATIN SMALL LIGATURE FFI}"
9557          *  "\N{LATIN SMALL LIGATURE FF}I"
9558          *  "F\N{LATIN SMALL LIGATURE FI}"
9559          *  plus several other things; and making sure we have all the
9560          *  possibilities is hard. */
9561         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9562             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9563         }
9564         else {
9565             /* Any Latin1 range character can potentially match any
9566              * other depending on the locale */
9567             if (OP(node) == EXACTFL) {
9568                 _invlist_union(invlist, PL_Latin1, &invlist);
9569             }
9570             else {
9571                 /* But otherwise, it matches at least itself.  We can
9572                  * quickly tell if it has a distinct fold, and if so,
9573                  * it matches that as well */
9574                 invlist = add_cp_to_invlist(invlist, uc);
9575                 if (IS_IN_SOME_FOLD_L1(uc))
9576                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9577             }
9578
9579             /* Some characters match above-Latin1 ones under /i.  This
9580              * is true of EXACTFL ones when the locale is UTF-8 */
9581             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9582                 && (! isASCII(uc) || (OP(node) != EXACTFA
9583                                     && OP(node) != EXACTFA_NO_TRIE)))
9584             {
9585                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9586             }
9587         }
9588     }
9589     else {  /* Pattern is UTF-8 */
9590         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9591         STRLEN foldlen = UTF8SKIP(s);
9592         const U8* e = s + bytelen;
9593         SV** listp;
9594
9595         uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9596
9597         /* The only code points that aren't folded in a UTF EXACTFish
9598          * node are are the problematic ones in EXACTFL nodes */
9599         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9600             /* We need to check for the possibility that this EXACTFL
9601              * node begins with a multi-char fold.  Therefore we fold
9602              * the first few characters of it so that we can make that
9603              * check */
9604             U8 *d = folded;
9605             int i;
9606
9607             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9608                 if (isASCII(*s)) {
9609                     *(d++) = (U8) toFOLD(*s);
9610                     s++;
9611                 }
9612                 else {
9613                     STRLEN len;
9614                     to_utf8_fold(s, d, &len);
9615                     d += len;
9616                     s += UTF8SKIP(s);
9617                 }
9618             }
9619
9620             /* And set up so the code below that looks in this folded
9621              * buffer instead of the node's string */
9622             e = d;
9623             foldlen = UTF8SKIP(folded);
9624             s = folded;
9625         }
9626
9627         /* When we reach here 's' points to the fold of the first
9628          * character(s) of the node; and 'e' points to far enough along
9629          * the folded string to be just past any possible multi-char
9630          * fold. 'foldlen' is the length in bytes of the first
9631          * character in 's'
9632          *
9633          * Unlike the non-UTF-8 case, the macro for determining if a
9634          * string is a multi-char fold requires all the characters to
9635          * already be folded.  This is because of all the complications
9636          * if not.  Note that they are folded anyway, except in EXACTFL
9637          * nodes.  Like the non-UTF case above, we punt if the node
9638          * begins with a multi-char fold  */
9639
9640         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9641             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9642         }
9643         else {  /* Single char fold */
9644
9645             /* It matches all the things that fold to it, which are
9646              * found in PL_utf8_foldclosures (including itself) */
9647             invlist = add_cp_to_invlist(invlist, uc);
9648             if (! PL_utf8_foldclosures)
9649                 _load_PL_utf8_foldclosures();
9650             if ((listp = hv_fetch(PL_utf8_foldclosures,
9651                                 (char *) s, foldlen, FALSE)))
9652             {
9653                 AV* list = (AV*) *listp;
9654                 IV k;
9655                 for (k = 0; k <= av_tindex(list); k++) {
9656                     SV** c_p = av_fetch(list, k, FALSE);
9657                     UV c;
9658                     assert(c_p);
9659
9660                     c = SvUV(*c_p);
9661
9662                     /* /aa doesn't allow folds between ASCII and non- */
9663                     if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9664                         && isASCII(c) != isASCII(uc))
9665                     {
9666                         continue;
9667                     }
9668
9669                     invlist = add_cp_to_invlist(invlist, c);
9670                 }
9671             }
9672         }
9673     }
9674
9675     return invlist;
9676 }
9677
9678 #undef HEADER_LENGTH
9679 #undef TO_INTERNAL_SIZE
9680 #undef FROM_INTERNAL_SIZE
9681 #undef INVLIST_VERSION_ID
9682
9683 /* End of inversion list object */
9684
9685 STATIC void
9686 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9687 {
9688     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9689      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9690      * should point to the first flag; it is updated on output to point to the
9691      * final ')' or ':'.  There needs to be at least one flag, or this will
9692      * abort */
9693
9694     /* for (?g), (?gc), and (?o) warnings; warning
9695        about (?c) will warn about (?g) -- japhy    */
9696
9697 #define WASTED_O  0x01
9698 #define WASTED_G  0x02
9699 #define WASTED_C  0x04
9700 #define WASTED_GC (WASTED_G|WASTED_C)
9701     I32 wastedflags = 0x00;
9702     U32 posflags = 0, negflags = 0;
9703     U32 *flagsp = &posflags;
9704     char has_charset_modifier = '\0';
9705     regex_charset cs;
9706     bool has_use_defaults = FALSE;
9707     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9708     int x_mod_count = 0;
9709
9710     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9711
9712     /* '^' as an initial flag sets certain defaults */
9713     if (UCHARAT(RExC_parse) == '^') {
9714         RExC_parse++;
9715         has_use_defaults = TRUE;
9716         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9717         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9718                                         ? REGEX_UNICODE_CHARSET
9719                                         : REGEX_DEPENDS_CHARSET);
9720     }
9721
9722     cs = get_regex_charset(RExC_flags);
9723     if (cs == REGEX_DEPENDS_CHARSET
9724         && (RExC_utf8 || RExC_uni_semantics))
9725     {
9726         cs = REGEX_UNICODE_CHARSET;
9727     }
9728
9729     while (*RExC_parse) {
9730         /* && strchr("iogcmsx", *RExC_parse) */
9731         /* (?g), (?gc) and (?o) are useless here
9732            and must be globally applied -- japhy */
9733         switch (*RExC_parse) {
9734
9735             /* Code for the imsxn flags */
9736             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9737
9738             case LOCALE_PAT_MOD:
9739                 if (has_charset_modifier) {
9740                     goto excess_modifier;
9741                 }
9742                 else if (flagsp == &negflags) {
9743                     goto neg_modifier;
9744                 }
9745                 cs = REGEX_LOCALE_CHARSET;
9746                 has_charset_modifier = LOCALE_PAT_MOD;
9747                 break;
9748             case UNICODE_PAT_MOD:
9749                 if (has_charset_modifier) {
9750                     goto excess_modifier;
9751                 }
9752                 else if (flagsp == &negflags) {
9753                     goto neg_modifier;
9754                 }
9755                 cs = REGEX_UNICODE_CHARSET;
9756                 has_charset_modifier = UNICODE_PAT_MOD;
9757                 break;
9758             case ASCII_RESTRICT_PAT_MOD:
9759                 if (flagsp == &negflags) {
9760                     goto neg_modifier;
9761                 }
9762                 if (has_charset_modifier) {
9763                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9764                         goto excess_modifier;
9765                     }
9766                     /* Doubled modifier implies more restricted */
9767                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9768                 }
9769                 else {
9770                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9771                 }
9772                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9773                 break;
9774             case DEPENDS_PAT_MOD:
9775                 if (has_use_defaults) {
9776                     goto fail_modifiers;
9777                 }
9778                 else if (flagsp == &negflags) {
9779                     goto neg_modifier;
9780                 }
9781                 else if (has_charset_modifier) {
9782                     goto excess_modifier;
9783                 }
9784
9785                 /* The dual charset means unicode semantics if the
9786                  * pattern (or target, not known until runtime) are
9787                  * utf8, or something in the pattern indicates unicode
9788                  * semantics */
9789                 cs = (RExC_utf8 || RExC_uni_semantics)
9790                      ? REGEX_UNICODE_CHARSET
9791                      : REGEX_DEPENDS_CHARSET;
9792                 has_charset_modifier = DEPENDS_PAT_MOD;
9793                 break;
9794               excess_modifier:
9795                 RExC_parse++;
9796                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9797                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9798                 }
9799                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9800                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9801                                         *(RExC_parse - 1));
9802                 }
9803                 else {
9804                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9805                 }
9806                 NOT_REACHED; /*NOTREACHED*/
9807               neg_modifier:
9808                 RExC_parse++;
9809                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9810                                     *(RExC_parse - 1));
9811                 NOT_REACHED; /*NOTREACHED*/
9812             case ONCE_PAT_MOD: /* 'o' */
9813             case GLOBAL_PAT_MOD: /* 'g' */
9814                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9815                     const I32 wflagbit = *RExC_parse == 'o'
9816                                          ? WASTED_O
9817                                          : WASTED_G;
9818                     if (! (wastedflags & wflagbit) ) {
9819                         wastedflags |= wflagbit;
9820                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9821                         vWARN5(
9822                             RExC_parse + 1,
9823                             "Useless (%s%c) - %suse /%c modifier",
9824                             flagsp == &negflags ? "?-" : "?",
9825                             *RExC_parse,
9826                             flagsp == &negflags ? "don't " : "",
9827                             *RExC_parse
9828                         );
9829                     }
9830                 }
9831                 break;
9832
9833             case CONTINUE_PAT_MOD: /* 'c' */
9834                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9835                     if (! (wastedflags & WASTED_C) ) {
9836                         wastedflags |= WASTED_GC;
9837                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9838                         vWARN3(
9839                             RExC_parse + 1,
9840                             "Useless (%sc) - %suse /gc modifier",
9841                             flagsp == &negflags ? "?-" : "?",
9842                             flagsp == &negflags ? "don't " : ""
9843                         );
9844                     }
9845                 }
9846                 break;
9847             case KEEPCOPY_PAT_MOD: /* 'p' */
9848                 if (flagsp == &negflags) {
9849                     if (PASS2)
9850                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9851                 } else {
9852                     *flagsp |= RXf_PMf_KEEPCOPY;
9853                 }
9854                 break;
9855             case '-':
9856                 /* A flag is a default iff it is following a minus, so
9857                  * if there is a minus, it means will be trying to
9858                  * re-specify a default which is an error */
9859                 if (has_use_defaults || flagsp == &negflags) {
9860                     goto fail_modifiers;
9861                 }
9862                 flagsp = &negflags;
9863                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9864                 break;
9865             case ':':
9866             case ')':
9867                 RExC_flags |= posflags;
9868                 RExC_flags &= ~negflags;
9869                 set_regex_charset(&RExC_flags, cs);
9870                 if (RExC_flags & RXf_PMf_FOLD) {
9871                     RExC_contains_i = 1;
9872                 }
9873                 if (PASS2) {
9874                     STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9875                 }
9876                 return;
9877                 /*NOTREACHED*/
9878             default:
9879               fail_modifiers:
9880                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
9881                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9882                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9883                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9884                 NOT_REACHED; /*NOTREACHED*/
9885         }
9886
9887         ++RExC_parse;
9888     }
9889
9890     vFAIL("Sequence (?... not terminated");
9891 }
9892
9893 /*
9894  - reg - regular expression, i.e. main body or parenthesized thing
9895  *
9896  * Caller must absorb opening parenthesis.
9897  *
9898  * Combining parenthesis handling with the base level of regular expression
9899  * is a trifle forced, but the need to tie the tails of the branches to what
9900  * follows makes it hard to avoid.
9901  */
9902 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9903 #ifdef DEBUGGING
9904 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9905 #else
9906 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9907 #endif
9908
9909 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9910    flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
9911    needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
9912    upgraded to UTF-8.  Otherwise would only return NULL if regbranch() returns
9913    NULL, which cannot happen.  */
9914 STATIC regnode *
9915 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9916     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9917      * 2 is like 1, but indicates that nextchar() has been called to advance
9918      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9919      * this flag alerts us to the need to check for that */
9920 {
9921     regnode *ret;               /* Will be the head of the group. */
9922     regnode *br;
9923     regnode *lastbr;
9924     regnode *ender = NULL;
9925     I32 parno = 0;
9926     I32 flags;
9927     U32 oregflags = RExC_flags;
9928     bool have_branch = 0;
9929     bool is_open = 0;
9930     I32 freeze_paren = 0;
9931     I32 after_freeze = 0;
9932     I32 num; /* numeric backreferences */
9933
9934     char * parse_start = RExC_parse; /* MJD */
9935     char * const oregcomp_parse = RExC_parse;
9936
9937     GET_RE_DEBUG_FLAGS_DECL;
9938
9939     PERL_ARGS_ASSERT_REG;
9940     DEBUG_PARSE("reg ");
9941
9942     *flagp = 0;                         /* Tentatively. */
9943
9944     /* Having this true makes it feasible to have a lot fewer tests for the
9945      * parse pointer being in scope.  For example, we can write
9946      *      while(isFOO(*RExC_parse)) RExC_parse++;
9947      * instead of
9948      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
9949      */
9950     assert(*RExC_end == '\0');
9951
9952     /* Make an OPEN node, if parenthesized. */
9953     if (paren) {
9954
9955         /* Under /x, space and comments can be gobbled up between the '(' and
9956          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9957          * intervening space, as the sequence is a token, and a token should be
9958          * indivisible */
9959         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9960
9961         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9962             char *start_verb = RExC_parse;
9963             STRLEN verb_len = 0;
9964             char *start_arg = NULL;
9965             unsigned char op = 0;
9966             int arg_required = 0;
9967             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
9968
9969             if (has_intervening_patws) {
9970                 RExC_parse++;
9971                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9972             }
9973             while ( *RExC_parse && *RExC_parse != ')' ) {
9974                 if ( *RExC_parse == ':' ) {
9975                     start_arg = RExC_parse + 1;
9976                     break;
9977                 }
9978                 RExC_parse++;
9979             }
9980             ++start_verb;
9981             verb_len = RExC_parse - start_verb;
9982             if ( start_arg ) {
9983                 RExC_parse++;
9984                 while ( *RExC_parse && *RExC_parse != ')' )
9985                     RExC_parse++;
9986                 if ( *RExC_parse != ')' )
9987                     vFAIL("Unterminated verb pattern argument");
9988                 if ( RExC_parse == start_arg )
9989                     start_arg = NULL;
9990             } else {
9991                 if ( *RExC_parse != ')' )
9992                     vFAIL("Unterminated verb pattern");
9993             }
9994
9995             switch ( *start_verb ) {
9996             case 'A':  /* (*ACCEPT) */
9997                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9998                     op = ACCEPT;
9999                     internal_argval = RExC_nestroot;
10000                 }
10001                 break;
10002             case 'C':  /* (*COMMIT) */
10003                 if ( memEQs(start_verb,verb_len,"COMMIT") )
10004                     op = COMMIT;
10005                 break;
10006             case 'F':  /* (*FAIL) */
10007                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
10008                     op = OPFAIL;
10009                 }
10010                 break;
10011             case ':':  /* (*:NAME) */
10012             case 'M':  /* (*MARK:NAME) */
10013                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
10014                     op = MARKPOINT;
10015                     arg_required = 1;
10016                 }
10017                 break;
10018             case 'P':  /* (*PRUNE) */
10019                 if ( memEQs(start_verb,verb_len,"PRUNE") )
10020                     op = PRUNE;
10021                 break;
10022             case 'S':   /* (*SKIP) */
10023                 if ( memEQs(start_verb,verb_len,"SKIP") )
10024                     op = SKIP;
10025                 break;
10026             case 'T':  /* (*THEN) */
10027                 /* [19:06] <TimToady> :: is then */
10028                 if ( memEQs(start_verb,verb_len,"THEN") ) {
10029                     op = CUTGROUP;
10030                     RExC_seen |= REG_CUTGROUP_SEEN;
10031                 }
10032                 break;
10033             }
10034             if ( ! op ) {
10035                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10036                 vFAIL2utf8f(
10037                     "Unknown verb pattern '%"UTF8f"'",
10038                     UTF8fARG(UTF, verb_len, start_verb));
10039             }
10040             if ( arg_required && !start_arg ) {
10041                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
10042                     verb_len, start_verb);
10043             }
10044             if (internal_argval == -1) {
10045                 ret = reganode(pRExC_state, op, 0);
10046             } else {
10047                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
10048             }
10049             RExC_seen |= REG_VERBARG_SEEN;
10050             if ( ! SIZE_ONLY ) {
10051                 if (start_arg) {
10052                     SV *sv = newSVpvn( start_arg,
10053                                        RExC_parse - start_arg);
10054                     ARG(ret) = add_data( pRExC_state,
10055                                          STR_WITH_LEN("S"));
10056                     RExC_rxi->data->data[ARG(ret)]=(void*)sv;
10057                     ret->flags = 1;
10058                 } else {
10059                     ret->flags = 0;
10060                 }
10061                 if ( internal_argval != -1 )
10062                     ARG2L_SET(ret, internal_argval);
10063             }
10064             nextchar(pRExC_state);
10065             return ret;
10066         }
10067         else if (*RExC_parse == '?') { /* (?...) */
10068             bool is_logical = 0;
10069             const char * const seqstart = RExC_parse;
10070             const char * endptr;
10071             if (has_intervening_patws) {
10072                 RExC_parse++;
10073                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
10074             }
10075
10076             RExC_parse++;
10077             paren = *RExC_parse++;
10078             ret = NULL;                 /* For lookahead/behind. */
10079             switch (paren) {
10080
10081             case 'P':   /* (?P...) variants for those used to PCRE/Python */
10082                 paren = *RExC_parse++;
10083                 if ( paren == '<')         /* (?P<...>) named capture */
10084                     goto named_capture;
10085                 else if (paren == '>') {   /* (?P>name) named recursion */
10086                     goto named_recursion;
10087                 }
10088                 else if (paren == '=') {   /* (?P=...)  named backref */
10089                     /* this pretty much dupes the code for \k<NAME> in
10090                      * regatom(), if you change this make sure you change that
10091                      * */
10092                     char* name_start = RExC_parse;
10093                     U32 num = 0;
10094                     SV *sv_dat = reg_scan_name(pRExC_state,
10095                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10096                     if (RExC_parse == name_start || *RExC_parse != ')')
10097                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
10098                         vFAIL2("Sequence %.3s... not terminated",parse_start);
10099
10100                     if (!SIZE_ONLY) {
10101                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10102                         RExC_rxi->data->data[num]=(void*)sv_dat;
10103                         SvREFCNT_inc_simple_void(sv_dat);
10104                     }
10105                     RExC_sawback = 1;
10106                     ret = reganode(pRExC_state,
10107                                    ((! FOLD)
10108                                      ? NREF
10109                                      : (ASCII_FOLD_RESTRICTED)
10110                                        ? NREFFA
10111                                        : (AT_LEAST_UNI_SEMANTICS)
10112                                          ? NREFFU
10113                                          : (LOC)
10114                                            ? NREFFL
10115                                            : NREFF),
10116                                     num);
10117                     *flagp |= HASWIDTH;
10118
10119                     Set_Node_Offset(ret, parse_start+1);
10120                     Set_Node_Cur_Length(ret, parse_start);
10121
10122                     nextchar(pRExC_state);
10123                     return ret;
10124                 }
10125                 --RExC_parse;
10126                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10127                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10128                 vFAIL3("Sequence (%.*s...) not recognized",
10129                                 RExC_parse-seqstart, seqstart);
10130                 NOT_REACHED; /*NOTREACHED*/
10131             case '<':           /* (?<...) */
10132                 if (*RExC_parse == '!')
10133                     paren = ',';
10134                 else if (*RExC_parse != '=')
10135               named_capture:
10136                 {               /* (?<...>) */
10137                     char *name_start;
10138                     SV *svname;
10139                     paren= '>';
10140             case '\'':          /* (?'...') */
10141                     name_start= RExC_parse;
10142                     svname = reg_scan_name(pRExC_state,
10143                         SIZE_ONLY    /* reverse test from the others */
10144                         ? REG_RSN_RETURN_NAME
10145                         : REG_RSN_RETURN_NULL);
10146                     if (RExC_parse == name_start || *RExC_parse != paren)
10147                         vFAIL2("Sequence (?%c... not terminated",
10148                             paren=='>' ? '<' : paren);
10149                     if (SIZE_ONLY) {
10150                         HE *he_str;
10151                         SV *sv_dat = NULL;
10152                         if (!svname) /* shouldn't happen */
10153                             Perl_croak(aTHX_
10154                                 "panic: reg_scan_name returned NULL");
10155                         if (!RExC_paren_names) {
10156                             RExC_paren_names= newHV();
10157                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
10158 #ifdef DEBUGGING
10159                             RExC_paren_name_list= newAV();
10160                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10161 #endif
10162                         }
10163                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10164                         if ( he_str )
10165                             sv_dat = HeVAL(he_str);
10166                         if ( ! sv_dat ) {
10167                             /* croak baby croak */
10168                             Perl_croak(aTHX_
10169                                 "panic: paren_name hash element allocation failed");
10170                         } else if ( SvPOK(sv_dat) ) {
10171                             /* (?|...) can mean we have dupes so scan to check
10172                                its already been stored. Maybe a flag indicating
10173                                we are inside such a construct would be useful,
10174                                but the arrays are likely to be quite small, so
10175                                for now we punt -- dmq */
10176                             IV count = SvIV(sv_dat);
10177                             I32 *pv = (I32*)SvPVX(sv_dat);
10178                             IV i;
10179                             for ( i = 0 ; i < count ; i++ ) {
10180                                 if ( pv[i] == RExC_npar ) {
10181                                     count = 0;
10182                                     break;
10183                                 }
10184                             }
10185                             if ( count ) {
10186                                 pv = (I32*)SvGROW(sv_dat,
10187                                                 SvCUR(sv_dat) + sizeof(I32)+1);
10188                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10189                                 pv[count] = RExC_npar;
10190                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10191                             }
10192                         } else {
10193                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
10194                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
10195                                                                 sizeof(I32));
10196                             SvIOK_on(sv_dat);
10197                             SvIV_set(sv_dat, 1);
10198                         }
10199 #ifdef DEBUGGING
10200                         /* Yes this does cause a memory leak in debugging Perls
10201                          * */
10202                         if (!av_store(RExC_paren_name_list,
10203                                       RExC_npar, SvREFCNT_inc(svname)))
10204                             SvREFCNT_dec_NN(svname);
10205 #endif
10206
10207                         /*sv_dump(sv_dat);*/
10208                     }
10209                     nextchar(pRExC_state);
10210                     paren = 1;
10211                     goto capturing_parens;
10212                 }
10213                 RExC_seen |= REG_LOOKBEHIND_SEEN;
10214                 RExC_in_lookbehind++;
10215                 RExC_parse++;
10216                 /* FALLTHROUGH */
10217             case '=':           /* (?=...) */
10218                 RExC_seen_zerolen++;
10219                 break;
10220             case '!':           /* (?!...) */
10221                 RExC_seen_zerolen++;
10222                 /* check if we're really just a "FAIL" assertion */
10223                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10224                                         FALSE /* Don't force to /x */ );
10225                 if (*RExC_parse == ')') {
10226                     ret=reganode(pRExC_state, OPFAIL, 0);
10227                     nextchar(pRExC_state);
10228                     return ret;
10229                 }
10230                 break;
10231             case '|':           /* (?|...) */
10232                 /* branch reset, behave like a (?:...) except that
10233                    buffers in alternations share the same numbers */
10234                 paren = ':';
10235                 after_freeze = freeze_paren = RExC_npar;
10236                 break;
10237             case ':':           /* (?:...) */
10238             case '>':           /* (?>...) */
10239                 break;
10240             case '$':           /* (?$...) */
10241             case '@':           /* (?@...) */
10242                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10243                 break;
10244             case '0' :           /* (?0) */
10245             case 'R' :           /* (?R) */
10246                 if (*RExC_parse != ')')
10247                     FAIL("Sequence (?R) not terminated");
10248                 ret = reg_node(pRExC_state, GOSTART);
10249                     RExC_seen |= REG_GOSTART_SEEN;
10250                 *flagp |= POSTPONED;
10251                 nextchar(pRExC_state);
10252                 return ret;
10253                 /*notreached*/
10254             /* named and numeric backreferences */
10255             case '&':            /* (?&NAME) */
10256                 parse_start = RExC_parse - 1;
10257               named_recursion:
10258                 {
10259                     SV *sv_dat = reg_scan_name(pRExC_state,
10260                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10261                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10262                 }
10263                 if (RExC_parse == RExC_end || *RExC_parse != ')')
10264                     vFAIL("Sequence (?&... not terminated");
10265                 goto gen_recurse_regop;
10266                 /* NOTREACHED */
10267             case '+':
10268                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10269                     RExC_parse++;
10270                     vFAIL("Illegal pattern");
10271                 }
10272                 goto parse_recursion;
10273                 /* NOTREACHED*/
10274             case '-': /* (?-1) */
10275                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10276                     RExC_parse--; /* rewind to let it be handled later */
10277                     goto parse_flags;
10278                 }
10279                 /* FALLTHROUGH */
10280             case '1': case '2': case '3': case '4': /* (?1) */
10281             case '5': case '6': case '7': case '8': case '9':
10282                 RExC_parse--;
10283               parse_recursion:
10284                 {
10285                     bool is_neg = FALSE;
10286                     UV unum;
10287                     parse_start = RExC_parse - 1; /* MJD */
10288                     if (*RExC_parse == '-') {
10289                         RExC_parse++;
10290                         is_neg = TRUE;
10291                     }
10292                     if (grok_atoUV(RExC_parse, &unum, &endptr)
10293                         && unum <= I32_MAX
10294                     ) {
10295                         num = (I32)unum;
10296                         RExC_parse = (char*)endptr;
10297                     } else
10298                         num = I32_MAX;
10299                     if (is_neg) {
10300                         /* Some limit for num? */
10301                         num = -num;
10302                     }
10303                 }
10304                 if (*RExC_parse!=')')
10305                     vFAIL("Expecting close bracket");
10306
10307               gen_recurse_regop:
10308                 if ( paren == '-' ) {
10309                     /*
10310                     Diagram of capture buffer numbering.
10311                     Top line is the normal capture buffer numbers
10312                     Bottom line is the negative indexing as from
10313                     the X (the (?-2))
10314
10315                     +   1 2    3 4 5 X          6 7
10316                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10317                     -   5 4    3 2 1 X          x x
10318
10319                     */
10320                     num = RExC_npar + num;
10321                     if (num < 1)  {
10322                         RExC_parse++;
10323                         vFAIL("Reference to nonexistent group");
10324                     }
10325                 } else if ( paren == '+' ) {
10326                     num = RExC_npar + num - 1;
10327                 }
10328
10329                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10330                 if (!SIZE_ONLY) {
10331                     if (num > (I32)RExC_rx->nparens) {
10332                         RExC_parse++;
10333                         vFAIL("Reference to nonexistent group");
10334                     }
10335                     RExC_recurse_count++;
10336                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10337                         "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10338                               22, "|    |", (int)(depth * 2 + 1), "",
10339                               (UV)ARG(ret), (IV)ARG2L(ret)));
10340                 }
10341                 RExC_seen |= REG_RECURSE_SEEN;
10342                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10343                 Set_Node_Offset(ret, parse_start); /* MJD */
10344
10345                 *flagp |= POSTPONED;
10346                 nextchar(pRExC_state);
10347                 return ret;
10348
10349             /* NOTREACHED */
10350
10351             case '?':           /* (??...) */
10352                 is_logical = 1;
10353                 if (*RExC_parse != '{') {
10354                     RExC_parse += SKIP_IF_CHAR(RExC_parse);
10355                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10356                     vFAIL2utf8f(
10357                         "Sequence (%"UTF8f"...) not recognized",
10358                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10359                     NOT_REACHED; /*NOTREACHED*/
10360                 }
10361                 *flagp |= POSTPONED;
10362                 paren = *RExC_parse++;
10363                 /* FALLTHROUGH */
10364             case '{':           /* (?{...}) */
10365             {
10366                 U32 n = 0;
10367                 struct reg_code_block *cb;
10368
10369                 RExC_seen_zerolen++;
10370
10371                 if (   !pRExC_state->num_code_blocks
10372                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
10373                     || pRExC_state->code_blocks[pRExC_state->code_index].start
10374                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10375                             - RExC_start)
10376                 ) {
10377                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
10378                         FAIL("panic: Sequence (?{...}): no code block found\n");
10379                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
10380                 }
10381                 /* this is a pre-compiled code block (?{...}) */
10382                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10383                 RExC_parse = RExC_start + cb->end;
10384                 if (!SIZE_ONLY) {
10385                     OP *o = cb->block;
10386                     if (cb->src_regex) {
10387                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10388                         RExC_rxi->data->data[n] =
10389                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
10390                         RExC_rxi->data->data[n+1] = (void*)o;
10391                     }
10392                     else {
10393                         n = add_data(pRExC_state,
10394                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10395                         RExC_rxi->data->data[n] = (void*)o;
10396                     }
10397                 }
10398                 pRExC_state->code_index++;
10399                 nextchar(pRExC_state);
10400
10401                 if (is_logical) {
10402                     regnode *eval;
10403                     ret = reg_node(pRExC_state, LOGICAL);
10404
10405                     eval = reg2Lanode(pRExC_state, EVAL,
10406                                        n,
10407
10408                                        /* for later propagation into (??{})
10409                                         * return value */
10410                                        RExC_flags & RXf_PMf_COMPILETIME
10411                                       );
10412                     if (!SIZE_ONLY) {
10413                         ret->flags = 2;
10414                     }
10415                     REGTAIL(pRExC_state, ret, eval);
10416                     /* deal with the length of this later - MJD */
10417                     return ret;
10418                 }
10419                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10420                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10421                 Set_Node_Offset(ret, parse_start);
10422                 return ret;
10423             }
10424             case '(':           /* (?(?{...})...) and (?(?=...)...) */
10425             {
10426                 int is_define= 0;
10427                 const int DEFINE_len = sizeof("DEFINE") - 1;
10428                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
10429                     if (
10430                         RExC_parse[1] == '=' ||
10431                         RExC_parse[1] == '!' ||
10432                         RExC_parse[1] == '<' ||
10433                         RExC_parse[1] == '{'
10434                     ) { /* Lookahead or eval. */
10435                         I32 flag;
10436                         regnode *tail;
10437
10438                         ret = reg_node(pRExC_state, LOGICAL);
10439                         if (!SIZE_ONLY)
10440                             ret->flags = 1;
10441
10442                         tail = reg(pRExC_state, 1, &flag, depth+1);
10443                         if (flag & (RESTART_PASS1|NEED_UTF8)) {
10444                             *flagp = flag & (RESTART_PASS1|NEED_UTF8);
10445                             return NULL;
10446                         }
10447                         REGTAIL(pRExC_state, ret, tail);
10448                         goto insert_if;
10449                     }
10450                     /* Fall through to ‘Unknown switch condition’ at the
10451                        end of the if/else chain. */
10452                 }
10453                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
10454                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10455                 {
10456                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
10457                     char *name_start= RExC_parse++;
10458                     U32 num = 0;
10459                     SV *sv_dat=reg_scan_name(pRExC_state,
10460                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10461                     if (RExC_parse == name_start || *RExC_parse != ch)
10462                         vFAIL2("Sequence (?(%c... not terminated",
10463                             (ch == '>' ? '<' : ch));
10464                     RExC_parse++;
10465                     if (!SIZE_ONLY) {
10466                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10467                         RExC_rxi->data->data[num]=(void*)sv_dat;
10468                         SvREFCNT_inc_simple_void(sv_dat);
10469                     }
10470                     ret = reganode(pRExC_state,NGROUPP,num);
10471                     goto insert_if_check_paren;
10472                 }
10473                 else if (RExC_end - RExC_parse >= DEFINE_len
10474                         && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
10475                 {
10476                     ret = reganode(pRExC_state,DEFINEP,0);
10477                     RExC_parse += DEFINE_len;
10478                     is_define = 1;
10479                     goto insert_if_check_paren;
10480                 }
10481                 else if (RExC_parse[0] == 'R') {
10482                     RExC_parse++;
10483                     parno = 0;
10484                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10485                         UV uv;
10486                         if (grok_atoUV(RExC_parse, &uv, &endptr)
10487                             && uv <= I32_MAX
10488                         ) {
10489                             parno = (I32)uv;
10490                             RExC_parse = (char*)endptr;
10491                         }
10492                         /* else "Switch condition not recognized" below */
10493                     } else if (RExC_parse[0] == '&') {
10494                         SV *sv_dat;
10495                         RExC_parse++;
10496                         sv_dat = reg_scan_name(pRExC_state,
10497                             SIZE_ONLY
10498                             ? REG_RSN_RETURN_NULL
10499                             : REG_RSN_RETURN_DATA);
10500                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10501                     }
10502                     ret = reganode(pRExC_state,INSUBP,parno);
10503                     goto insert_if_check_paren;
10504                 }
10505                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10506                     /* (?(1)...) */
10507                     char c;
10508                     UV uv;
10509                     if (grok_atoUV(RExC_parse, &uv, &endptr)
10510                         && uv <= I32_MAX
10511                     ) {
10512                         parno = (I32)uv;
10513                         RExC_parse = (char*)endptr;
10514                     }
10515                     else {
10516                         vFAIL("panic: grok_atoUV returned FALSE");
10517                     }
10518                     ret = reganode(pRExC_state, GROUPP, parno);
10519
10520                  insert_if_check_paren:
10521                     if (UCHARAT(RExC_parse) != ')') {
10522                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10523                         vFAIL("Switch condition not recognized");
10524                     }
10525                     nextchar(pRExC_state);
10526                   insert_if:
10527                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10528                     br = regbranch(pRExC_state, &flags, 1,depth+1);
10529                     if (br == NULL) {
10530                         if (flags & (RESTART_PASS1|NEED_UTF8)) {
10531                             *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10532                             return NULL;
10533                         }
10534                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10535                               (UV) flags);
10536                     } else
10537                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
10538                                                           LONGJMP, 0));
10539                     c = UCHARAT(RExC_parse);
10540                     nextchar(pRExC_state);
10541                     if (flags&HASWIDTH)
10542                         *flagp |= HASWIDTH;
10543                     if (c == '|') {
10544                         if (is_define)
10545                             vFAIL("(?(DEFINE)....) does not allow branches");
10546
10547                         /* Fake one for optimizer.  */
10548                         lastbr = reganode(pRExC_state, IFTHEN, 0);
10549
10550                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10551                             if (flags & (RESTART_PASS1|NEED_UTF8)) {
10552                                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10553                                 return NULL;
10554                             }
10555                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10556                                   (UV) flags);
10557                         }
10558                         REGTAIL(pRExC_state, ret, lastbr);
10559                         if (flags&HASWIDTH)
10560                             *flagp |= HASWIDTH;
10561                         c = UCHARAT(RExC_parse);
10562                         nextchar(pRExC_state);
10563                     }
10564                     else
10565                         lastbr = NULL;
10566                     if (c != ')') {
10567                         if (RExC_parse>RExC_end)
10568                             vFAIL("Switch (?(condition)... not terminated");
10569                         else
10570                             vFAIL("Switch (?(condition)... contains too many branches");
10571                     }
10572                     ender = reg_node(pRExC_state, TAIL);
10573                     REGTAIL(pRExC_state, br, ender);
10574                     if (lastbr) {
10575                         REGTAIL(pRExC_state, lastbr, ender);
10576                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10577                     }
10578                     else
10579                         REGTAIL(pRExC_state, ret, ender);
10580                     RExC_size++; /* XXX WHY do we need this?!!
10581                                     For large programs it seems to be required
10582                                     but I can't figure out why. -- dmq*/
10583                     return ret;
10584                 }
10585                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10586                 vFAIL("Unknown switch condition (?(...))");
10587             }
10588             case '[':           /* (?[ ... ]) */
10589                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10590                                          oregcomp_parse);
10591             case 0:
10592                 RExC_parse--; /* for vFAIL to print correctly */
10593                 vFAIL("Sequence (? incomplete");
10594                 break;
10595             default: /* e.g., (?i) */
10596                 --RExC_parse;
10597               parse_flags:
10598                 parse_lparen_question_flags(pRExC_state);
10599                 if (UCHARAT(RExC_parse) != ':') {
10600                     if (*RExC_parse)
10601                         nextchar(pRExC_state);
10602                     *flagp = TRYAGAIN;
10603                     return NULL;
10604                 }
10605                 paren = ':';
10606                 nextchar(pRExC_state);
10607                 ret = NULL;
10608                 goto parse_rest;
10609             } /* end switch */
10610         }
10611         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
10612           capturing_parens:
10613             parno = RExC_npar;
10614             RExC_npar++;
10615
10616             ret = reganode(pRExC_state, OPEN, parno);
10617             if (!SIZE_ONLY ){
10618                 if (!RExC_nestroot)
10619                     RExC_nestroot = parno;
10620                 if (RExC_seen & REG_RECURSE_SEEN
10621                     && !RExC_open_parens[parno-1])
10622                 {
10623                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10624                         "%*s%*s Setting open paren #%"IVdf" to %d\n",
10625                         22, "|    |", (int)(depth * 2 + 1), "",
10626                         (IV)parno, REG_NODE_NUM(ret)));
10627                     RExC_open_parens[parno-1]= ret;
10628                 }
10629             }
10630             Set_Node_Length(ret, 1); /* MJD */
10631             Set_Node_Offset(ret, RExC_parse); /* MJD */
10632             is_open = 1;
10633         } else {
10634             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
10635             paren = ':';
10636             ret = NULL;
10637         }
10638     }
10639     else                        /* ! paren */
10640         ret = NULL;
10641
10642    parse_rest:
10643     /* Pick up the branches, linking them together. */
10644     parse_start = RExC_parse;   /* MJD */
10645     br = regbranch(pRExC_state, &flags, 1,depth+1);
10646
10647     /*     branch_len = (paren != 0); */
10648
10649     if (br == NULL) {
10650         if (flags & (RESTART_PASS1|NEED_UTF8)) {
10651             *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10652             return NULL;
10653         }
10654         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10655     }
10656     if (*RExC_parse == '|') {
10657         if (!SIZE_ONLY && RExC_extralen) {
10658             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10659         }
10660         else {                  /* MJD */
10661             reginsert(pRExC_state, BRANCH, br, depth+1);
10662             Set_Node_Length(br, paren != 0);
10663             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10664         }
10665         have_branch = 1;
10666         if (SIZE_ONLY)
10667             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10668     }
10669     else if (paren == ':') {
10670         *flagp |= flags&SIMPLE;
10671     }
10672     if (is_open) {                              /* Starts with OPEN. */
10673         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10674     }
10675     else if (paren != '?')              /* Not Conditional */
10676         ret = br;
10677     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10678     lastbr = br;
10679     while (*RExC_parse == '|') {
10680         if (!SIZE_ONLY && RExC_extralen) {
10681             ender = reganode(pRExC_state, LONGJMP,0);
10682
10683             /* Append to the previous. */
10684             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10685         }
10686         if (SIZE_ONLY)
10687             RExC_extralen += 2;         /* Account for LONGJMP. */
10688         nextchar(pRExC_state);
10689         if (freeze_paren) {
10690             if (RExC_npar > after_freeze)
10691                 after_freeze = RExC_npar;
10692             RExC_npar = freeze_paren;
10693         }
10694         br = regbranch(pRExC_state, &flags, 0, depth+1);
10695
10696         if (br == NULL) {
10697             if (flags & (RESTART_PASS1|NEED_UTF8)) {
10698                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10699                 return NULL;
10700             }
10701             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10702         }
10703         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10704         lastbr = br;
10705         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10706     }
10707
10708     if (have_branch || paren != ':') {
10709         /* Make a closing node, and hook it on the end. */
10710         switch (paren) {
10711         case ':':
10712             ender = reg_node(pRExC_state, TAIL);
10713             break;
10714         case 1: case 2:
10715             ender = reganode(pRExC_state, CLOSE, parno);
10716             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10717                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10718                         "%*s%*s Setting close paren #%"IVdf" to %d\n",
10719                         22, "|    |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10720                 RExC_close_parens[parno-1]= ender;
10721                 if (RExC_nestroot == parno)
10722                     RExC_nestroot = 0;
10723             }
10724             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10725             Set_Node_Length(ender,1); /* MJD */
10726             break;
10727         case '<':
10728         case ',':
10729         case '=':
10730         case '!':
10731             *flagp &= ~HASWIDTH;
10732             /* FALLTHROUGH */
10733         case '>':
10734             ender = reg_node(pRExC_state, SUCCEED);
10735             break;
10736         case 0:
10737             ender = reg_node(pRExC_state, END);
10738             if (!SIZE_ONLY) {
10739                 assert(!RExC_opend); /* there can only be one! */
10740                 RExC_opend = ender;
10741             }
10742             break;
10743         }
10744         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10745             DEBUG_PARSE_MSG("lsbr");
10746             regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10747             regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10748             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10749                           SvPV_nolen_const(RExC_mysv1),
10750                           (IV)REG_NODE_NUM(lastbr),
10751                           SvPV_nolen_const(RExC_mysv2),
10752                           (IV)REG_NODE_NUM(ender),
10753                           (IV)(ender - lastbr)
10754             );
10755         });
10756         REGTAIL(pRExC_state, lastbr, ender);
10757
10758         if (have_branch && !SIZE_ONLY) {
10759             char is_nothing= 1;
10760             if (depth==1)
10761                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10762
10763             /* Hook the tails of the branches to the closing node. */
10764             for (br = ret; br; br = regnext(br)) {
10765                 const U8 op = PL_regkind[OP(br)];
10766                 if (op == BRANCH) {
10767                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10768                     if ( OP(NEXTOPER(br)) != NOTHING
10769                          || regnext(NEXTOPER(br)) != ender)
10770                         is_nothing= 0;
10771                 }
10772                 else if (op == BRANCHJ) {
10773                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10774                     /* for now we always disable this optimisation * /
10775                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10776                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10777                     */
10778                         is_nothing= 0;
10779                 }
10780             }
10781             if (is_nothing) {
10782                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10783                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10784                     DEBUG_PARSE_MSG("NADA");
10785                     regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10786                     regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10787                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10788                                   SvPV_nolen_const(RExC_mysv1),
10789                                   (IV)REG_NODE_NUM(ret),
10790                                   SvPV_nolen_const(RExC_mysv2),
10791                                   (IV)REG_NODE_NUM(ender),
10792                                   (IV)(ender - ret)
10793                     );
10794                 });
10795                 OP(br)= NOTHING;
10796                 if (OP(ender) == TAIL) {
10797                     NEXT_OFF(br)= 0;
10798                     RExC_emit= br + 1;
10799                 } else {
10800                     regnode *opt;
10801                     for ( opt= br + 1; opt < ender ; opt++ )
10802                         OP(opt)= OPTIMIZED;
10803                     NEXT_OFF(br)= ender - br;
10804                 }
10805             }
10806         }
10807     }
10808
10809     {
10810         const char *p;
10811         static const char parens[] = "=!<,>";
10812
10813         if (paren && (p = strchr(parens, paren))) {
10814             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10815             int flag = (p - parens) > 1;
10816
10817             if (paren == '>')
10818                 node = SUSPEND, flag = 0;
10819             reginsert(pRExC_state, node,ret, depth+1);
10820             Set_Node_Cur_Length(ret, parse_start);
10821             Set_Node_Offset(ret, parse_start + 1);
10822             ret->flags = flag;
10823             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10824         }
10825     }
10826
10827     /* Check for proper termination. */
10828     if (paren) {
10829         /* restore original flags, but keep (?p) and, if we've changed from /d
10830          * rules to /u, keep the /u */
10831         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10832         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
10833             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
10834         }
10835         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
10836             RExC_parse = oregcomp_parse;
10837             vFAIL("Unmatched (");
10838         }
10839         nextchar(pRExC_state);
10840     }
10841     else if (!paren && RExC_parse < RExC_end) {
10842         if (*RExC_parse == ')') {
10843             RExC_parse++;
10844             vFAIL("Unmatched )");
10845         }
10846         else
10847             FAIL("Junk on end of regexp");      /* "Can't happen". */
10848         NOT_REACHED; /* NOTREACHED */
10849     }
10850
10851     if (RExC_in_lookbehind) {
10852         RExC_in_lookbehind--;
10853     }
10854     if (after_freeze > RExC_npar)
10855         RExC_npar = after_freeze;
10856     return(ret);
10857 }
10858
10859 /*
10860  - regbranch - one alternative of an | operator
10861  *
10862  * Implements the concatenation operator.
10863  *
10864  * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
10865  * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
10866  */
10867 STATIC regnode *
10868 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10869 {
10870     regnode *ret;
10871     regnode *chain = NULL;
10872     regnode *latest;
10873     I32 flags = 0, c = 0;
10874     GET_RE_DEBUG_FLAGS_DECL;
10875
10876     PERL_ARGS_ASSERT_REGBRANCH;
10877
10878     DEBUG_PARSE("brnc");
10879
10880     if (first)
10881         ret = NULL;
10882     else {
10883         if (!SIZE_ONLY && RExC_extralen)
10884             ret = reganode(pRExC_state, BRANCHJ,0);
10885         else {
10886             ret = reg_node(pRExC_state, BRANCH);
10887             Set_Node_Length(ret, 1);
10888         }
10889     }
10890
10891     if (!first && SIZE_ONLY)
10892         RExC_extralen += 1;                     /* BRANCHJ */
10893
10894     *flagp = WORST;                     /* Tentatively. */
10895
10896     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10897                             FALSE /* Don't force to /x */ );
10898     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10899         flags &= ~TRYAGAIN;
10900         latest = regpiece(pRExC_state, &flags,depth+1);
10901         if (latest == NULL) {
10902             if (flags & TRYAGAIN)
10903                 continue;
10904             if (flags & (RESTART_PASS1|NEED_UTF8)) {
10905                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10906                 return NULL;
10907             }
10908             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10909         }
10910         else if (ret == NULL)
10911             ret = latest;
10912         *flagp |= flags&(HASWIDTH|POSTPONED);
10913         if (chain == NULL)      /* First piece. */
10914             *flagp |= flags&SPSTART;
10915         else {
10916             /* FIXME adding one for every branch after the first is probably
10917              * excessive now we have TRIE support. (hv) */
10918             MARK_NAUGHTY(1);
10919             REGTAIL(pRExC_state, chain, latest);
10920         }
10921         chain = latest;
10922         c++;
10923     }
10924     if (chain == NULL) {        /* Loop ran zero times. */
10925         chain = reg_node(pRExC_state, NOTHING);
10926         if (ret == NULL)
10927             ret = chain;
10928     }
10929     if (c == 1) {
10930         *flagp |= flags&SIMPLE;
10931     }
10932
10933     return ret;
10934 }
10935
10936 /*
10937  - regpiece - something followed by possible [*+?]
10938  *
10939  * Note that the branching code sequences used for ? and the general cases
10940  * of * and + are somewhat optimized:  they use the same NOTHING node as
10941  * both the endmarker for their branch list and the body of the last branch.
10942  * It might seem that this node could be dispensed with entirely, but the
10943  * endmarker role is not redundant.
10944  *
10945  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10946  * TRYAGAIN.
10947  * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
10948  * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
10949  */
10950 STATIC regnode *
10951 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10952 {
10953     regnode *ret;
10954     char op;
10955     char *next;
10956     I32 flags;
10957     const char * const origparse = RExC_parse;
10958     I32 min;
10959     I32 max = REG_INFTY;
10960 #ifdef RE_TRACK_PATTERN_OFFSETS
10961     char *parse_start;
10962 #endif
10963     const char *maxpos = NULL;
10964     UV uv;
10965
10966     /* Save the original in case we change the emitted regop to a FAIL. */
10967     regnode * const orig_emit = RExC_emit;
10968
10969     GET_RE_DEBUG_FLAGS_DECL;
10970
10971     PERL_ARGS_ASSERT_REGPIECE;
10972
10973     DEBUG_PARSE("piec");
10974
10975     ret = regatom(pRExC_state, &flags,depth+1);
10976     if (ret == NULL) {
10977         if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
10978             *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
10979         else
10980             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10981         return(NULL);
10982     }
10983
10984     op = *RExC_parse;
10985
10986     if (op == '{' && regcurly(RExC_parse)) {
10987         maxpos = NULL;
10988 #ifdef RE_TRACK_PATTERN_OFFSETS
10989         parse_start = RExC_parse; /* MJD */
10990 #endif
10991         next = RExC_parse + 1;
10992         while (isDIGIT(*next) || *next == ',') {
10993             if (*next == ',') {
10994                 if (maxpos)
10995                     break;
10996                 else
10997                     maxpos = next;
10998             }
10999             next++;
11000         }
11001         if (*next == '}') {             /* got one */
11002             const char* endptr;
11003             if (!maxpos)
11004                 maxpos = next;
11005             RExC_parse++;
11006             if (isDIGIT(*RExC_parse)) {
11007                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
11008                     vFAIL("Invalid quantifier in {,}");
11009                 if (uv >= REG_INFTY)
11010                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11011                 min = (I32)uv;
11012             } else {
11013                 min = 0;
11014             }
11015             if (*maxpos == ',')
11016                 maxpos++;
11017             else
11018                 maxpos = RExC_parse;
11019             if (isDIGIT(*maxpos)) {
11020                 if (!grok_atoUV(maxpos, &uv, &endptr))
11021                     vFAIL("Invalid quantifier in {,}");
11022                 if (uv >= REG_INFTY)
11023                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11024                 max = (I32)uv;
11025             } else {
11026                 max = REG_INFTY;                /* meaning "infinity" */
11027             }
11028             RExC_parse = next;
11029             nextchar(pRExC_state);
11030             if (max < min) {    /* If can't match, warn and optimize to fail
11031                                    unconditionally */
11032                 if (SIZE_ONLY) {
11033
11034                     /* We can't back off the size because we have to reserve
11035                      * enough space for all the things we are about to throw
11036                      * away, but we can shrink it by the ammount we are about
11037                      * to re-use here */
11038                     RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
11039                 }
11040                 else {
11041                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
11042                     RExC_emit = orig_emit;
11043                 }
11044                 ret = reganode(pRExC_state, OPFAIL, 0);
11045                 return ret;
11046             }
11047             else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
11048             {
11049                 if (PASS2) {
11050                     ckWARN2reg(RExC_parse + 1,
11051                                "Useless use of greediness modifier '%c'",
11052                                *RExC_parse);
11053                 }
11054             }
11055
11056           do_curly:
11057             if ((flags&SIMPLE)) {
11058                 if (min == 0 && max == REG_INFTY) {
11059                     reginsert(pRExC_state, STAR, ret, depth+1);
11060                     ret->flags = 0;
11061                     MARK_NAUGHTY(4);
11062                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11063                     goto nest_check;
11064                 }
11065                 if (min == 1 && max == REG_INFTY) {
11066                     reginsert(pRExC_state, PLUS, ret, depth+1);
11067                     ret->flags = 0;
11068                     MARK_NAUGHTY(3);
11069                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11070                     goto nest_check;
11071                 }
11072                 MARK_NAUGHTY_EXP(2, 2);
11073                 reginsert(pRExC_state, CURLY, ret, depth+1);
11074                 Set_Node_Offset(ret, parse_start+1); /* MJD */
11075                 Set_Node_Cur_Length(ret, parse_start);
11076             }
11077             else {
11078                 regnode * const w = reg_node(pRExC_state, WHILEM);
11079
11080                 w->flags = 0;
11081                 REGTAIL(pRExC_state, ret, w);
11082                 if (!SIZE_ONLY && RExC_extralen) {
11083                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
11084                     reginsert(pRExC_state, NOTHING,ret, depth+1);
11085                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
11086                 }
11087                 reginsert(pRExC_state, CURLYX,ret, depth+1);
11088                                 /* MJD hk */
11089                 Set_Node_Offset(ret, parse_start+1);
11090                 Set_Node_Length(ret,
11091                                 op == '{' ? (RExC_parse - parse_start) : 1);
11092
11093                 if (!SIZE_ONLY && RExC_extralen)
11094                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
11095                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
11096                 if (SIZE_ONLY)
11097                     RExC_whilem_seen++, RExC_extralen += 3;
11098                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
11099             }
11100             ret->flags = 0;
11101
11102             if (min > 0)
11103                 *flagp = WORST;
11104             if (max > 0)
11105                 *flagp |= HASWIDTH;
11106             if (!SIZE_ONLY) {
11107                 ARG1_SET(ret, (U16)min);
11108                 ARG2_SET(ret, (U16)max);
11109             }
11110             if (max == REG_INFTY)
11111                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11112
11113             goto nest_check;
11114         }
11115     }
11116
11117     if (!ISMULT1(op)) {
11118         *flagp = flags;
11119         return(ret);
11120     }
11121
11122 #if 0                           /* Now runtime fix should be reliable. */
11123
11124     /* if this is reinstated, don't forget to put this back into perldiag:
11125
11126             =item Regexp *+ operand could be empty at {#} in regex m/%s/
11127
11128            (F) The part of the regexp subject to either the * or + quantifier
11129            could match an empty string. The {#} shows in the regular
11130            expression about where the problem was discovered.
11131
11132     */
11133
11134     if (!(flags&HASWIDTH) && op != '?')
11135       vFAIL("Regexp *+ operand could be empty");
11136 #endif
11137
11138 #ifdef RE_TRACK_PATTERN_OFFSETS
11139     parse_start = RExC_parse;
11140 #endif
11141     nextchar(pRExC_state);
11142
11143     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
11144
11145     if (op == '*') {
11146         min = 0;
11147         goto do_curly;
11148     }
11149     else if (op == '+') {
11150         min = 1;
11151         goto do_curly;
11152     }
11153     else if (op == '?') {
11154         min = 0; max = 1;
11155         goto do_curly;
11156     }
11157   nest_check:
11158     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
11159         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11160         ckWARN2reg(RExC_parse,
11161                    "%"UTF8f" matches null string many times",
11162                    UTF8fARG(UTF, (RExC_parse >= origparse
11163                                  ? RExC_parse - origparse
11164                                  : 0),
11165                    origparse));
11166         (void)ReREFCNT_inc(RExC_rx_sv);
11167     }
11168
11169     if (RExC_parse < RExC_end && *RExC_parse == '?') {
11170         nextchar(pRExC_state);
11171         reginsert(pRExC_state, MINMOD, ret, depth+1);
11172         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11173     }
11174     else
11175     if (RExC_parse < RExC_end && *RExC_parse == '+') {
11176         regnode *ender;
11177         nextchar(pRExC_state);
11178         ender = reg_node(pRExC_state, SUCCEED);
11179         REGTAIL(pRExC_state, ret, ender);
11180         reginsert(pRExC_state, SUSPEND, ret, depth+1);
11181         ret->flags = 0;
11182         ender = reg_node(pRExC_state, TAIL);
11183         REGTAIL(pRExC_state, ret, ender);
11184     }
11185
11186     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
11187         RExC_parse++;
11188         vFAIL("Nested quantifiers");
11189     }
11190
11191     return(ret);
11192 }
11193
11194 STATIC bool
11195 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11196                 regnode ** node_p,
11197                 UV * code_point_p,
11198                 int * cp_count,
11199                 I32 * flagp,
11200                 const U32 depth
11201     )
11202 {
11203  /* This routine teases apart the various meanings of \N and returns
11204   * accordingly.  The input parameters constrain which meaning(s) is/are valid
11205   * in the current context.
11206   *
11207   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11208   *
11209   * If <code_point_p> is not NULL, the context is expecting the result to be a
11210   * single code point.  If this \N instance turns out to a single code point,
11211   * the function returns TRUE and sets *code_point_p to that code point.
11212   *
11213   * If <node_p> is not NULL, the context is expecting the result to be one of
11214   * the things representable by a regnode.  If this \N instance turns out to be
11215   * one such, the function generates the regnode, returns TRUE and sets *node_p
11216   * to point to that regnode.
11217   *
11218   * If this instance of \N isn't legal in any context, this function will
11219   * generate a fatal error and not return.
11220   *
11221   * On input, RExC_parse should point to the first char following the \N at the
11222   * time of the call.  On successful return, RExC_parse will have been updated
11223   * to point to just after the sequence identified by this routine.  Also
11224   * *flagp has been updated as needed.
11225   *
11226   * When there is some problem with the current context and this \N instance,
11227   * the function returns FALSE, without advancing RExC_parse, nor setting
11228   * *node_p, nor *code_point_p, nor *flagp.
11229   *
11230   * If <cp_count> is not NULL, the caller wants to know the length (in code
11231   * points) that this \N sequence matches.  This is set even if the function
11232   * returns FALSE, as detailed below.
11233   *
11234   * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11235   *
11236   * Probably the most common case is for the \N to specify a single code point.
11237   * *cp_count will be set to 1, and *code_point_p will be set to that code
11238   * point.
11239   *
11240   * Another possibility is for the input to be an empty \N{}, which for
11241   * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
11242   * will be set to a generated NOTHING node.
11243   *
11244   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11245   * set to 0. *node_p will be set to a generated REG_ANY node.
11246   *
11247   * The fourth possibility is that \N resolves to a sequence of more than one
11248   * code points.  *cp_count will be set to the number of code points in the
11249   * sequence. *node_p * will be set to a generated node returned by this
11250   * function calling S_reg().
11251   *
11252   * The final possibility is that it is premature to be calling this function;
11253   * that pass1 needs to be restarted.  This can happen when this changes from
11254   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
11255   * latter occurs only when the fourth possibility would otherwise be in
11256   * effect, and is because one of those code points requires the pattern to be
11257   * recompiled as UTF-8.  The function returns FALSE, and sets the
11258   * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate.  When this
11259   * happens, the caller needs to desist from continuing parsing, and return
11260   * this information to its caller.  This is not set for when there is only one
11261   * code point, as this can be called as part of an ANYOF node, and they can
11262   * store above-Latin1 code points without the pattern having to be in UTF-8.
11263   *
11264   * For non-single-quoted regexes, the tokenizer has resolved character and
11265   * sequence names inside \N{...} into their Unicode values, normalizing the
11266   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11267   * hex-represented code points in the sequence.  This is done there because
11268   * the names can vary based on what charnames pragma is in scope at the time,
11269   * so we need a way to take a snapshot of what they resolve to at the time of
11270   * the original parse. [perl #56444].
11271   *
11272   * That parsing is skipped for single-quoted regexes, so we may here get
11273   * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
11274   * parser.  But if the single-quoted regex is something like '\N{U+41}', that
11275   * is legal and handled here.  The code point is Unicode, and has to be
11276   * translated into the native character set for non-ASCII platforms.
11277   */
11278
11279     char * endbrace;    /* points to '}' following the name */
11280     char *endchar;      /* Points to '.' or '}' ending cur char in the input
11281                            stream */
11282     char* p = RExC_parse; /* Temporary */
11283
11284     GET_RE_DEBUG_FLAGS_DECL;
11285
11286     PERL_ARGS_ASSERT_GROK_BSLASH_N;
11287
11288     GET_RE_DEBUG_FLAGS;
11289
11290     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
11291     assert(! (node_p && cp_count));               /* At most 1 should be set */
11292
11293     if (cp_count) {     /* Initialize return for the most common case */
11294         *cp_count = 1;
11295     }
11296
11297     /* The [^\n] meaning of \N ignores spaces and comments under the /x
11298      * modifier.  The other meanings do not, so use a temporary until we find
11299      * out which we are being called with */
11300     skip_to_be_ignored_text(pRExC_state, &p,
11301                             FALSE /* Don't force to /x */ );
11302
11303     /* Disambiguate between \N meaning a named character versus \N meaning
11304      * [^\n].  The latter is assumed when the {...} following the \N is a legal
11305      * quantifier, or there is no '{' at all */
11306     if (*p != '{' || regcurly(p)) {
11307         RExC_parse = p;
11308         if (cp_count) {
11309             *cp_count = -1;
11310         }
11311
11312         if (! node_p) {
11313             return FALSE;
11314         }
11315
11316         *node_p = reg_node(pRExC_state, REG_ANY);
11317         *flagp |= HASWIDTH|SIMPLE;
11318         MARK_NAUGHTY(1);
11319         Set_Node_Length(*node_p, 1); /* MJD */
11320         return TRUE;
11321     }
11322
11323     /* Here, we have decided it should be a named character or sequence */
11324
11325     /* The test above made sure that the next real character is a '{', but
11326      * under the /x modifier, it could be separated by space (or a comment and
11327      * \n) and this is not allowed (for consistency with \x{...} and the
11328      * tokenizer handling of \N{NAME}). */
11329     if (*RExC_parse != '{') {
11330         vFAIL("Missing braces on \\N{}");
11331     }
11332
11333     RExC_parse++;       /* Skip past the '{' */
11334
11335     if (! (endbrace = strchr(RExC_parse, '}'))  /* no trailing brace */
11336         || ! (endbrace == RExC_parse            /* nothing between the {} */
11337               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked... */
11338                   && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11339                                                        error msg) */
11340     {
11341         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
11342         vFAIL("\\N{NAME} must be resolved by the lexer");
11343     }
11344
11345     REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
11346                                         semantics */
11347
11348     if (endbrace == RExC_parse) {   /* empty: \N{} */
11349         if (cp_count) {
11350             *cp_count = 0;
11351         }
11352         nextchar(pRExC_state);
11353         if (! node_p) {
11354             return FALSE;
11355         }
11356
11357         *node_p = reg_node(pRExC_state,NOTHING);
11358         return TRUE;
11359     }
11360
11361     RExC_parse += 2;    /* Skip past the 'U+' */
11362
11363     endchar = RExC_parse + strcspn(RExC_parse, ".}");
11364
11365     /* Code points are separated by dots.  If none, there is only one code
11366      * point, and is terminated by the brace */
11367
11368     if (endchar >= endbrace) {
11369         STRLEN length_of_hex;
11370         I32 grok_hex_flags;
11371
11372         /* Here, exactly one code point.  If that isn't what is wanted, fail */
11373         if (! code_point_p) {
11374             RExC_parse = p;
11375             return FALSE;
11376         }
11377
11378         /* Convert code point from hex */
11379         length_of_hex = (STRLEN)(endchar - RExC_parse);
11380         grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11381                            | PERL_SCAN_DISALLOW_PREFIX
11382
11383                              /* No errors in the first pass (See [perl
11384                               * #122671].)  We let the code below find the
11385                               * errors when there are multiple chars. */
11386                            | ((SIZE_ONLY)
11387                               ? PERL_SCAN_SILENT_ILLDIGIT
11388                               : 0);
11389
11390         /* This routine is the one place where both single- and double-quotish
11391          * \N{U+xxxx} are evaluated.  The value is a Unicode code point which
11392          * must be converted to native. */
11393         *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
11394                                          &length_of_hex,
11395                                          &grok_hex_flags,
11396                                          NULL));
11397
11398         /* The tokenizer should have guaranteed validity, but it's possible to
11399          * bypass it by using single quoting, so check.  Don't do the check
11400          * here when there are multiple chars; we do it below anyway. */
11401         if (length_of_hex == 0
11402             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11403         {
11404             RExC_parse += length_of_hex;        /* Includes all the valid */
11405             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
11406                             ? UTF8SKIP(RExC_parse)
11407                             : 1;
11408             /* Guard against malformed utf8 */
11409             if (RExC_parse >= endchar) {
11410                 RExC_parse = endchar;
11411             }
11412             vFAIL("Invalid hexadecimal number in \\N{U+...}");
11413         }
11414
11415         RExC_parse = endbrace + 1;
11416         return TRUE;
11417     }
11418     else {  /* Is a multiple character sequence */
11419         SV * substitute_parse;
11420         STRLEN len;
11421         char *orig_end = RExC_end;
11422         char *save_start = RExC_start;
11423         I32 flags;
11424
11425         /* Count the code points, if desired, in the sequence */
11426         if (cp_count) {
11427             *cp_count = 0;
11428             while (RExC_parse < endbrace) {
11429                 /* Point to the beginning of the next character in the sequence. */
11430                 RExC_parse = endchar + 1;
11431                 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11432                 (*cp_count)++;
11433             }
11434         }
11435
11436         /* Fail if caller doesn't want to handle a multi-code-point sequence.
11437          * But don't backup up the pointer if the caller want to know how many
11438          * code points there are (they can then handle things) */
11439         if (! node_p) {
11440             if (! cp_count) {
11441                 RExC_parse = p;
11442             }
11443             return FALSE;
11444         }
11445
11446         /* What is done here is to convert this to a sub-pattern of the form
11447          * \x{char1}\x{char2}...  and then call reg recursively to parse it
11448          * (enclosing in "(?: ... )" ).  That way, it retains its atomicness,
11449          * while not having to worry about special handling that some code
11450          * points may have. */
11451
11452         substitute_parse = newSVpvs("?:");
11453
11454         while (RExC_parse < endbrace) {
11455
11456             /* Convert to notation the rest of the code understands */
11457             sv_catpv(substitute_parse, "\\x{");
11458             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
11459             sv_catpv(substitute_parse, "}");
11460
11461             /* Point to the beginning of the next character in the sequence. */
11462             RExC_parse = endchar + 1;
11463             endchar = RExC_parse + strcspn(RExC_parse, ".}");
11464
11465         }
11466         sv_catpv(substitute_parse, ")");
11467
11468         RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
11469                                                              len);
11470
11471         /* Don't allow empty number */
11472         if (len < (STRLEN) 8) {
11473             RExC_parse = endbrace;
11474             vFAIL("Invalid hexadecimal number in \\N{U+...}");
11475         }
11476         RExC_end = RExC_parse + len;
11477
11478         /* The values are Unicode, and therefore not subject to recoding, but
11479          * have to be converted to native on a non-Unicode (meaning non-ASCII)
11480          * platform. */
11481         RExC_override_recoding = 1;
11482 #ifdef EBCDIC
11483         RExC_recode_x_to_native = 1;
11484 #endif
11485
11486         if (node_p) {
11487             if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11488                 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11489                     *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11490                     return FALSE;
11491                 }
11492                 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11493                     (UV) flags);
11494             }
11495             *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11496         }
11497
11498         /* Restore the saved values */
11499         RExC_start = RExC_adjusted_start = save_start;
11500         RExC_parse = endbrace;
11501         RExC_end = orig_end;
11502         RExC_override_recoding = 0;
11503 #ifdef EBCDIC
11504         RExC_recode_x_to_native = 0;
11505 #endif
11506
11507         SvREFCNT_dec_NN(substitute_parse);
11508         nextchar(pRExC_state);
11509
11510         return TRUE;
11511     }
11512 }
11513
11514
11515 /*
11516  * reg_recode
11517  *
11518  * It returns the code point in utf8 for the value in *encp.
11519  *    value: a code value in the source encoding
11520  *    encp:  a pointer to an Encode object
11521  *
11522  * If the result from Encode is not a single character,
11523  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11524  */
11525 STATIC UV
11526 S_reg_recode(pTHX_ const U8 value, SV **encp)
11527 {
11528     STRLEN numlen = 1;
11529     SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
11530     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11531     const STRLEN newlen = SvCUR(sv);
11532     UV uv = UNICODE_REPLACEMENT;
11533
11534     PERL_ARGS_ASSERT_REG_RECODE;
11535
11536     if (newlen)
11537         uv = SvUTF8(sv)
11538              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11539              : *(U8*)s;
11540
11541     if (!newlen || numlen != newlen) {
11542         uv = UNICODE_REPLACEMENT;
11543         *encp = NULL;
11544     }
11545     return uv;
11546 }
11547
11548 PERL_STATIC_INLINE U8
11549 S_compute_EXACTish(RExC_state_t *pRExC_state)
11550 {
11551     U8 op;
11552
11553     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11554
11555     if (! FOLD) {
11556         return (LOC)
11557                 ? EXACTL
11558                 : EXACT;
11559     }
11560
11561     op = get_regex_charset(RExC_flags);
11562     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11563         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11564                  been, so there is no hole */
11565     }
11566
11567     return op + EXACTF;
11568 }
11569
11570 PERL_STATIC_INLINE void
11571 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11572                          regnode *node, I32* flagp, STRLEN len, UV code_point,
11573                          bool downgradable)
11574 {
11575     /* This knows the details about sizing an EXACTish node, setting flags for
11576      * it (by setting <*flagp>, and potentially populating it with a single
11577      * character.
11578      *
11579      * If <len> (the length in bytes) is non-zero, this function assumes that
11580      * the node has already been populated, and just does the sizing.  In this
11581      * case <code_point> should be the final code point that has already been
11582      * placed into the node.  This value will be ignored except that under some
11583      * circumstances <*flagp> is set based on it.
11584      *
11585      * If <len> is zero, the function assumes that the node is to contain only
11586      * the single character given by <code_point> and calculates what <len>
11587      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11588      * additionally will populate the node's STRING with <code_point> or its
11589      * fold if folding.
11590      *
11591      * In both cases <*flagp> is appropriately set
11592      *
11593      * It knows that under FOLD, the Latin Sharp S and UTF characters above
11594      * 255, must be folded (the former only when the rules indicate it can
11595      * match 'ss')
11596      *
11597      * When it does the populating, it looks at the flag 'downgradable'.  If
11598      * true with a node that folds, it checks if the single code point
11599      * participates in a fold, and if not downgrades the node to an EXACT.
11600      * This helps the optimizer */
11601
11602     bool len_passed_in = cBOOL(len != 0);
11603     U8 character[UTF8_MAXBYTES_CASE+1];
11604
11605     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11606
11607     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11608      * sizing difference, and is extra work that is thrown away */
11609     if (downgradable && ! PASS2) {
11610         downgradable = FALSE;
11611     }
11612
11613     if (! len_passed_in) {
11614         if (UTF) {
11615             if (UVCHR_IS_INVARIANT(code_point)) {
11616                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11617                     *character = (U8) code_point;
11618                 }
11619                 else { /* Here is /i and not /l. (toFOLD() is defined on just
11620                           ASCII, which isn't the same thing as INVARIANT on
11621                           EBCDIC, but it works there, as the extra invariants
11622                           fold to themselves) */
11623                     *character = toFOLD((U8) code_point);
11624
11625                     /* We can downgrade to an EXACT node if this character
11626                      * isn't a folding one.  Note that this assumes that
11627                      * nothing above Latin1 folds to some other invariant than
11628                      * one of these alphabetics; otherwise we would also have
11629                      * to check:
11630                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11631                      *      || ASCII_FOLD_RESTRICTED))
11632                      */
11633                     if (downgradable && PL_fold[code_point] == code_point) {
11634                         OP(node) = EXACT;
11635                     }
11636                 }
11637                 len = 1;
11638             }
11639             else if (FOLD && (! LOC
11640                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11641             {   /* Folding, and ok to do so now */
11642                 UV folded = _to_uni_fold_flags(
11643                                    code_point,
11644                                    character,
11645                                    &len,
11646                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11647                                                       ? FOLD_FLAGS_NOMIX_ASCII
11648                                                       : 0));
11649                 if (downgradable
11650                     && folded == code_point /* This quickly rules out many
11651                                                cases, avoiding the
11652                                                _invlist_contains_cp() overhead
11653                                                for those.  */
11654                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11655                 {
11656                     OP(node) = (LOC)
11657                                ? EXACTL
11658                                : EXACT;
11659                 }
11660             }
11661             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11662
11663                 /* Not folding this cp, and can output it directly */
11664                 *character = UTF8_TWO_BYTE_HI(code_point);
11665                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11666                 len = 2;
11667             }
11668             else {
11669                 uvchr_to_utf8( character, code_point);
11670                 len = UTF8SKIP(character);
11671             }
11672         } /* Else pattern isn't UTF8.  */
11673         else if (! FOLD) {
11674             *character = (U8) code_point;
11675             len = 1;
11676         } /* Else is folded non-UTF8 */
11677 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
11678    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
11679                                       || UNICODE_DOT_DOT_VERSION > 0)
11680         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11681 #else
11682         else if (1) {
11683 #endif
11684             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11685              * comments at join_exact()); */
11686             *character = (U8) code_point;
11687             len = 1;
11688
11689             /* Can turn into an EXACT node if we know the fold at compile time,
11690              * and it folds to itself and doesn't particpate in other folds */
11691             if (downgradable
11692                 && ! LOC
11693                 && PL_fold_latin1[code_point] == code_point
11694                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11695                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11696             {
11697                 OP(node) = EXACT;
11698             }
11699         } /* else is Sharp s.  May need to fold it */
11700         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11701             *character = 's';
11702             *(character + 1) = 's';
11703             len = 2;
11704         }
11705         else {
11706             *character = LATIN_SMALL_LETTER_SHARP_S;
11707             len = 1;
11708         }
11709     }
11710
11711     if (SIZE_ONLY) {
11712         RExC_size += STR_SZ(len);
11713     }
11714     else {
11715         RExC_emit += STR_SZ(len);
11716         STR_LEN(node) = len;
11717         if (! len_passed_in) {
11718             Copy((char *) character, STRING(node), len, char);
11719         }
11720     }
11721
11722     *flagp |= HASWIDTH;
11723
11724     /* A single character node is SIMPLE, except for the special-cased SHARP S
11725      * under /di. */
11726     if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
11727 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
11728    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
11729                                       || UNICODE_DOT_DOT_VERSION > 0)
11730         && ( code_point != LATIN_SMALL_LETTER_SHARP_S
11731             || ! FOLD || ! DEPENDS_SEMANTICS)
11732 #endif
11733     ) {
11734         *flagp |= SIMPLE;
11735     }
11736
11737     /* The OP may not be well defined in PASS1 */
11738     if (PASS2 && OP(node) == EXACTFL) {
11739         RExC_contains_locale = 1;
11740     }
11741 }
11742
11743
11744 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11745  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11746
11747 static I32
11748 S_backref_value(char *p)
11749 {
11750     const char* endptr;
11751     UV val;
11752     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
11753         return (I32)val;
11754     return I32_MAX;
11755 }
11756
11757
11758 /*
11759  - regatom - the lowest level
11760
11761    Try to identify anything special at the start of the pattern. If there
11762    is, then handle it as required. This may involve generating a single regop,
11763    such as for an assertion; or it may involve recursing, such as to
11764    handle a () structure.
11765
11766    If the string doesn't start with something special then we gobble up
11767    as much literal text as we can.
11768
11769    Once we have been able to handle whatever type of thing started the
11770    sequence, we return.
11771
11772    Note: we have to be careful with escapes, as they can be both literal
11773    and special, and in the case of \10 and friends, context determines which.
11774
11775    A summary of the code structure is:
11776
11777    switch (first_byte) {
11778         cases for each special:
11779             handle this special;
11780             break;
11781         case '\\':
11782             switch (2nd byte) {
11783                 cases for each unambiguous special:
11784                     handle this special;
11785                     break;
11786                 cases for each ambigous special/literal:
11787                     disambiguate;
11788                     if (special)  handle here
11789                     else goto defchar;
11790                 default: // unambiguously literal:
11791                     goto defchar;
11792             }
11793         default:  // is a literal char
11794             // FALL THROUGH
11795         defchar:
11796             create EXACTish node for literal;
11797             while (more input and node isn't full) {
11798                 switch (input_byte) {
11799                    cases for each special;
11800                        make sure parse pointer is set so that the next call to
11801                            regatom will see this special first
11802                        goto loopdone; // EXACTish node terminated by prev. char
11803                    default:
11804                        append char to EXACTISH node;
11805                 }
11806                 get next input byte;
11807             }
11808         loopdone:
11809    }
11810    return the generated node;
11811
11812    Specifically there are two separate switches for handling
11813    escape sequences, with the one for handling literal escapes requiring
11814    a dummy entry for all of the special escapes that are actually handled
11815    by the other.
11816
11817    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11818    TRYAGAIN.
11819    Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11820    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11821    Otherwise does not return NULL.
11822 */
11823
11824 STATIC regnode *
11825 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11826 {
11827     regnode *ret = NULL;
11828     I32 flags = 0;
11829     char *parse_start;
11830     U8 op;
11831     int invert = 0;
11832     U8 arg;
11833
11834     GET_RE_DEBUG_FLAGS_DECL;
11835
11836     *flagp = WORST;             /* Tentatively. */
11837
11838     DEBUG_PARSE("atom");
11839
11840     PERL_ARGS_ASSERT_REGATOM;
11841
11842   tryagain:
11843     parse_start = RExC_parse;
11844     switch ((U8)*RExC_parse) {
11845     case '^':
11846         RExC_seen_zerolen++;
11847         nextchar(pRExC_state);
11848         if (RExC_flags & RXf_PMf_MULTILINE)
11849             ret = reg_node(pRExC_state, MBOL);
11850         else
11851             ret = reg_node(pRExC_state, SBOL);
11852         Set_Node_Length(ret, 1); /* MJD */
11853         break;
11854     case '$':
11855         nextchar(pRExC_state);
11856         if (*RExC_parse)
11857             RExC_seen_zerolen++;
11858         if (RExC_flags & RXf_PMf_MULTILINE)
11859             ret = reg_node(pRExC_state, MEOL);
11860         else
11861             ret = reg_node(pRExC_state, SEOL);
11862         Set_Node_Length(ret, 1); /* MJD */
11863         break;
11864     case '.':
11865         nextchar(pRExC_state);
11866         if (RExC_flags & RXf_PMf_SINGLELINE)
11867             ret = reg_node(pRExC_state, SANY);
11868         else
11869             ret = reg_node(pRExC_state, REG_ANY);
11870         *flagp |= HASWIDTH|SIMPLE;
11871         MARK_NAUGHTY(1);
11872         Set_Node_Length(ret, 1); /* MJD */
11873         break;
11874     case '[':
11875     {
11876         char * const oregcomp_parse = ++RExC_parse;
11877         ret = regclass(pRExC_state, flagp,depth+1,
11878                        FALSE, /* means parse the whole char class */
11879                        TRUE, /* allow multi-char folds */
11880                        FALSE, /* don't silence non-portable warnings. */
11881                        (bool) RExC_strict,
11882                        TRUE, /* Allow an optimized regnode result */
11883                        NULL);
11884         if (ret == NULL) {
11885             if (*flagp & (RESTART_PASS1|NEED_UTF8))
11886                 return NULL;
11887             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11888                   (UV) *flagp);
11889         }
11890         if (*RExC_parse != ']') {
11891             RExC_parse = oregcomp_parse;
11892             vFAIL("Unmatched [");
11893         }
11894         nextchar(pRExC_state);
11895         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11896         break;
11897     }
11898     case '(':
11899         nextchar(pRExC_state);
11900         ret = reg(pRExC_state, 2, &flags,depth+1);
11901         if (ret == NULL) {
11902                 if (flags & TRYAGAIN) {
11903                     if (RExC_parse == RExC_end) {
11904                          /* Make parent create an empty node if needed. */
11905                         *flagp |= TRYAGAIN;
11906                         return(NULL);
11907                     }
11908                     goto tryagain;
11909                 }
11910                 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11911                     *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11912                     return NULL;
11913                 }
11914                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11915                                                                  (UV) flags);
11916         }
11917         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11918         break;
11919     case '|':
11920     case ')':
11921         if (flags & TRYAGAIN) {
11922             *flagp |= TRYAGAIN;
11923             return NULL;
11924         }
11925         vFAIL("Internal urp");
11926                                 /* Supposed to be caught earlier. */
11927         break;
11928     case '?':
11929     case '+':
11930     case '*':
11931         RExC_parse++;
11932         vFAIL("Quantifier follows nothing");
11933         break;
11934     case '\\':
11935         /* Special Escapes
11936
11937            This switch handles escape sequences that resolve to some kind
11938            of special regop and not to literal text. Escape sequnces that
11939            resolve to literal text are handled below in the switch marked
11940            "Literal Escapes".
11941
11942            Every entry in this switch *must* have a corresponding entry
11943            in the literal escape switch. However, the opposite is not
11944            required, as the default for this switch is to jump to the
11945            literal text handling code.
11946         */
11947         switch ((U8)*++RExC_parse) {
11948         /* Special Escapes */
11949         case 'A':
11950             RExC_seen_zerolen++;
11951             ret = reg_node(pRExC_state, SBOL);
11952             /* SBOL is shared with /^/ so we set the flags so we can tell
11953              * /\A/ from /^/ in split. We check ret because first pass we
11954              * have no regop struct to set the flags on. */
11955             if (PASS2)
11956                 ret->flags = 1;
11957             *flagp |= SIMPLE;
11958             goto finish_meta_pat;
11959         case 'G':
11960             ret = reg_node(pRExC_state, GPOS);
11961             RExC_seen |= REG_GPOS_SEEN;
11962             *flagp |= SIMPLE;
11963             goto finish_meta_pat;
11964         case 'K':
11965             RExC_seen_zerolen++;
11966             ret = reg_node(pRExC_state, KEEPS);
11967             *flagp |= SIMPLE;
11968             /* XXX:dmq : disabling in-place substitution seems to
11969              * be necessary here to avoid cases of memory corruption, as
11970              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11971              */
11972             RExC_seen |= REG_LOOKBEHIND_SEEN;
11973             goto finish_meta_pat;
11974         case 'Z':
11975             ret = reg_node(pRExC_state, SEOL);
11976             *flagp |= SIMPLE;
11977             RExC_seen_zerolen++;                /* Do not optimize RE away */
11978             goto finish_meta_pat;
11979         case 'z':
11980             ret = reg_node(pRExC_state, EOS);
11981             *flagp |= SIMPLE;
11982             RExC_seen_zerolen++;                /* Do not optimize RE away */
11983             goto finish_meta_pat;
11984         case 'C':
11985             vFAIL("\\C no longer supported");
11986         case 'X':
11987             ret = reg_node(pRExC_state, CLUMP);
11988             *flagp |= HASWIDTH;
11989             goto finish_meta_pat;
11990
11991         case 'W':
11992             invert = 1;
11993             /* FALLTHROUGH */
11994         case 'w':
11995             arg = ANYOF_WORDCHAR;
11996             goto join_posix;
11997
11998         case 'B':
11999             invert = 1;
12000             /* FALLTHROUGH */
12001         case 'b':
12002           {
12003             regex_charset charset = get_regex_charset(RExC_flags);
12004
12005             RExC_seen_zerolen++;
12006             RExC_seen |= REG_LOOKBEHIND_SEEN;
12007             op = BOUND + charset;
12008
12009             if (op == BOUNDL) {
12010                 RExC_contains_locale = 1;
12011             }
12012
12013             ret = reg_node(pRExC_state, op);
12014             *flagp |= SIMPLE;
12015             if (*(RExC_parse + 1) != '{') {
12016                 FLAGS(ret) = TRADITIONAL_BOUND;
12017                 if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
12018                     OP(ret) = BOUNDA;
12019                 }
12020             }
12021             else {
12022                 STRLEN length;
12023                 char name = *RExC_parse;
12024                 char * endbrace;
12025                 RExC_parse += 2;
12026                 endbrace = strchr(RExC_parse, '}');
12027
12028                 if (! endbrace) {
12029                     vFAIL2("Missing right brace on \\%c{}", name);
12030                 }
12031                 /* XXX Need to decide whether to take spaces or not.  Should be
12032                  * consistent with \p{}, but that currently is SPACE, which
12033                  * means vertical too, which seems wrong
12034                  * while (isBLANK(*RExC_parse)) {
12035                     RExC_parse++;
12036                 }*/
12037                 if (endbrace == RExC_parse) {
12038                     RExC_parse++;  /* After the '}' */
12039                     vFAIL2("Empty \\%c{}", name);
12040                 }
12041                 length = endbrace - RExC_parse;
12042                 /*while (isBLANK(*(RExC_parse + length - 1))) {
12043                     length--;
12044                 }*/
12045                 switch (*RExC_parse) {
12046                     case 'g':
12047                         if (length != 1
12048                             && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
12049                         {
12050                             goto bad_bound_type;
12051                         }
12052                         FLAGS(ret) = GCB_BOUND;
12053                         break;
12054                     case 'l':
12055                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12056                             goto bad_bound_type;
12057                         }
12058                         FLAGS(ret) = LB_BOUND;
12059                         break;
12060                     case 's':
12061                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12062                             goto bad_bound_type;
12063                         }
12064                         FLAGS(ret) = SB_BOUND;
12065                         break;
12066                     case 'w':
12067                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12068                             goto bad_bound_type;
12069                         }
12070                         FLAGS(ret) = WB_BOUND;
12071                         break;
12072                     default:
12073                       bad_bound_type:
12074                         RExC_parse = endbrace;
12075                         vFAIL2utf8f(
12076                             "'%"UTF8f"' is an unknown bound type",
12077                             UTF8fARG(UTF, length, endbrace - length));
12078                         NOT_REACHED; /*NOTREACHED*/
12079                 }
12080                 RExC_parse = endbrace;
12081                 REQUIRE_UNI_RULES(flagp, NULL);
12082
12083                 if (PASS2 && op >= BOUNDA) {  /* /aa is same as /a */
12084                     OP(ret) = BOUNDU;
12085                     length += 4;
12086
12087                     /* Don't have to worry about UTF-8, in this message because
12088                      * to get here the contents of the \b must be ASCII */
12089                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
12090                               "Using /u for '%.*s' instead of /%s",
12091                               (unsigned) length,
12092                               endbrace - length + 1,
12093                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
12094                               ? ASCII_RESTRICT_PAT_MODS
12095                               : ASCII_MORE_RESTRICT_PAT_MODS);
12096                 }
12097             }
12098
12099             if (PASS2 && invert) {
12100                 OP(ret) += NBOUND - BOUND;
12101             }
12102             goto finish_meta_pat;
12103           }
12104
12105         case 'D':
12106             invert = 1;
12107             /* FALLTHROUGH */
12108         case 'd':
12109             arg = ANYOF_DIGIT;
12110             if (! DEPENDS_SEMANTICS) {
12111                 goto join_posix;
12112             }
12113
12114             /* \d doesn't have any matches in the upper Latin1 range, hence /d
12115              * is equivalent to /u.  Changing to /u saves some branches at
12116              * runtime */
12117             op = POSIXU;
12118             goto join_posix_op_known;
12119
12120         case 'R':
12121             ret = reg_node(pRExC_state, LNBREAK);
12122             *flagp |= HASWIDTH|SIMPLE;
12123             goto finish_meta_pat;
12124
12125         case 'H':
12126             invert = 1;
12127             /* FALLTHROUGH */
12128         case 'h':
12129             arg = ANYOF_BLANK;
12130             op = POSIXU;
12131             goto join_posix_op_known;
12132
12133         case 'V':
12134             invert = 1;
12135             /* FALLTHROUGH */
12136         case 'v':
12137             arg = ANYOF_VERTWS;
12138             op = POSIXU;
12139             goto join_posix_op_known;
12140
12141         case 'S':
12142             invert = 1;
12143             /* FALLTHROUGH */
12144         case 's':
12145             arg = ANYOF_SPACE;
12146
12147           join_posix:
12148
12149             op = POSIXD + get_regex_charset(RExC_flags);
12150             if (op > POSIXA) {  /* /aa is same as /a */
12151                 op = POSIXA;
12152             }
12153             else if (op == POSIXL) {
12154                 RExC_contains_locale = 1;
12155             }
12156
12157           join_posix_op_known:
12158
12159             if (invert) {
12160                 op += NPOSIXD - POSIXD;
12161             }
12162
12163             ret = reg_node(pRExC_state, op);
12164             if (! SIZE_ONLY) {
12165                 FLAGS(ret) = namedclass_to_classnum(arg);
12166             }
12167
12168             *flagp |= HASWIDTH|SIMPLE;
12169             /* FALLTHROUGH */
12170
12171           finish_meta_pat:
12172             nextchar(pRExC_state);
12173             Set_Node_Length(ret, 2); /* MJD */
12174             break;
12175         case 'p':
12176         case 'P':
12177             RExC_parse--;
12178
12179             ret = regclass(pRExC_state, flagp,depth+1,
12180                            TRUE, /* means just parse this element */
12181                            FALSE, /* don't allow multi-char folds */
12182                            FALSE, /* don't silence non-portable warnings.  It
12183                                      would be a bug if these returned
12184                                      non-portables */
12185                            (bool) RExC_strict,
12186                            TRUE, /* Allow an optimized regnode result */
12187                            NULL);
12188             if (*flagp & RESTART_PASS1)
12189                 return NULL;
12190             /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
12191              * multi-char folds are allowed.  */
12192             if (!ret)
12193                 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12194                       (UV) *flagp);
12195
12196             RExC_parse--;
12197
12198             Set_Node_Offset(ret, parse_start);
12199             Set_Node_Cur_Length(ret, parse_start - 2);
12200             nextchar(pRExC_state);
12201             break;
12202         case 'N':
12203             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12204              * \N{...} evaluates to a sequence of more than one code points).
12205              * The function call below returns a regnode, which is our result.
12206              * The parameters cause it to fail if the \N{} evaluates to a
12207              * single code point; we handle those like any other literal.  The
12208              * reason that the multicharacter case is handled here and not as
12209              * part of the EXACtish code is because of quantifiers.  In
12210              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12211              * this way makes that Just Happen. dmq.
12212              * join_exact() will join this up with adjacent EXACTish nodes
12213              * later on, if appropriate. */
12214             ++RExC_parse;
12215             if (grok_bslash_N(pRExC_state,
12216                               &ret,     /* Want a regnode returned */
12217                               NULL,     /* Fail if evaluates to a single code
12218                                            point */
12219                               NULL,     /* Don't need a count of how many code
12220                                            points */
12221                               flagp,
12222                               depth)
12223             ) {
12224                 break;
12225             }
12226
12227             if (*flagp & RESTART_PASS1)
12228                 return NULL;
12229
12230             /* Here, evaluates to a single code point.  Go get that */
12231             RExC_parse = parse_start;
12232             goto defchar;
12233
12234         case 'k':    /* Handle \k<NAME> and \k'NAME' */
12235       parse_named_seq:
12236         {
12237             char ch= RExC_parse[1];
12238             if (ch != '<' && ch != '\'' && ch != '{') {
12239                 RExC_parse++;
12240                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12241                 vFAIL2("Sequence %.2s... not terminated",parse_start);
12242             } else {
12243                 /* this pretty much dupes the code for (?P=...) in reg(), if
12244                    you change this make sure you change that */
12245                 char* name_start = (RExC_parse += 2);
12246                 U32 num = 0;
12247                 SV *sv_dat = reg_scan_name(pRExC_state,
12248                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
12249                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
12250                 if (RExC_parse == name_start || *RExC_parse != ch)
12251                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12252                     vFAIL2("Sequence %.3s... not terminated",parse_start);
12253
12254                 if (!SIZE_ONLY) {
12255                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
12256                     RExC_rxi->data->data[num]=(void*)sv_dat;
12257                     SvREFCNT_inc_simple_void(sv_dat);
12258                 }
12259
12260                 RExC_sawback = 1;
12261                 ret = reganode(pRExC_state,
12262                                ((! FOLD)
12263                                  ? NREF
12264                                  : (ASCII_FOLD_RESTRICTED)
12265                                    ? NREFFA
12266                                    : (AT_LEAST_UNI_SEMANTICS)
12267                                      ? NREFFU
12268                                      : (LOC)
12269                                        ? NREFFL
12270                                        : NREFF),
12271                                 num);
12272                 *flagp |= HASWIDTH;
12273
12274                 /* override incorrect value set in reganode MJD */
12275                 Set_Node_Offset(ret, parse_start+1);
12276                 Set_Node_Cur_Length(ret, parse_start);
12277                 nextchar(pRExC_state);
12278
12279             }
12280             break;
12281         }
12282         case 'g':
12283         case '1': case '2': case '3': case '4':
12284         case '5': case '6': case '7': case '8': case '9':
12285             {
12286                 I32 num;
12287                 bool hasbrace = 0;
12288
12289                 if (*RExC_parse == 'g') {
12290                     bool isrel = 0;
12291
12292                     RExC_parse++;
12293                     if (*RExC_parse == '{') {
12294                         RExC_parse++;
12295                         hasbrace = 1;
12296                     }
12297                     if (*RExC_parse == '-') {
12298                         RExC_parse++;
12299                         isrel = 1;
12300                     }
12301                     if (hasbrace && !isDIGIT(*RExC_parse)) {
12302                         if (isrel) RExC_parse--;
12303                         RExC_parse -= 2;
12304                         goto parse_named_seq;
12305                     }
12306
12307                     num = S_backref_value(RExC_parse);
12308                     if (num == 0)
12309                         vFAIL("Reference to invalid group 0");
12310                     else if (num == I32_MAX) {
12311                          if (isDIGIT(*RExC_parse))
12312                             vFAIL("Reference to nonexistent group");
12313                         else
12314                             vFAIL("Unterminated \\g... pattern");
12315                     }
12316
12317                     if (isrel) {
12318                         num = RExC_npar - num;
12319                         if (num < 1)
12320                             vFAIL("Reference to nonexistent or unclosed group");
12321                     }
12322                 }
12323                 else {
12324                     num = S_backref_value(RExC_parse);
12325                     /* bare \NNN might be backref or octal - if it is larger
12326                      * than or equal RExC_npar then it is assumed to be an
12327                      * octal escape. Note RExC_npar is +1 from the actual
12328                      * number of parens. */
12329                     /* Note we do NOT check if num == I32_MAX here, as that is
12330                      * handled by the RExC_npar check */
12331
12332                     if (
12333                         /* any numeric escape < 10 is always a backref */
12334                         num > 9
12335                         /* any numeric escape < RExC_npar is a backref */
12336                         && num >= RExC_npar
12337                         /* cannot be an octal escape if it starts with 8 */
12338                         && *RExC_parse != '8'
12339                         /* cannot be an octal escape it it starts with 9 */
12340                         && *RExC_parse != '9'
12341                     )
12342                     {
12343                         /* Probably not a backref, instead likely to be an
12344                          * octal character escape, e.g. \35 or \777.
12345                          * The above logic should make it obvious why using
12346                          * octal escapes in patterns is problematic. - Yves */
12347                         RExC_parse = parse_start;
12348                         goto defchar;
12349                     }
12350                 }
12351
12352                 /* At this point RExC_parse points at a numeric escape like
12353                  * \12 or \88 or something similar, which we should NOT treat
12354                  * as an octal escape. It may or may not be a valid backref
12355                  * escape. For instance \88888888 is unlikely to be a valid
12356                  * backref. */
12357                 while (isDIGIT(*RExC_parse))
12358                     RExC_parse++;
12359                 if (hasbrace) {
12360                     if (*RExC_parse != '}')
12361                         vFAIL("Unterminated \\g{...} pattern");
12362                     RExC_parse++;
12363                 }
12364                 if (!SIZE_ONLY) {
12365                     if (num > (I32)RExC_rx->nparens)
12366                         vFAIL("Reference to nonexistent group");
12367                 }
12368                 RExC_sawback = 1;
12369                 ret = reganode(pRExC_state,
12370                                ((! FOLD)
12371                                  ? REF
12372                                  : (ASCII_FOLD_RESTRICTED)
12373                                    ? REFFA
12374                                    : (AT_LEAST_UNI_SEMANTICS)
12375                                      ? REFFU
12376                                      : (LOC)
12377                                        ? REFFL
12378                                        : REFF),
12379                                 num);
12380                 *flagp |= HASWIDTH;
12381
12382                 /* override incorrect value set in reganode MJD */
12383                 Set_Node_Offset(ret, parse_start);
12384                 Set_Node_Cur_Length(ret, parse_start-1);
12385                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12386                                         FALSE /* Don't force to /x */ );
12387             }
12388             break;
12389         case '\0':
12390             if (RExC_parse >= RExC_end)
12391                 FAIL("Trailing \\");
12392             /* FALLTHROUGH */
12393         default:
12394             /* Do not generate "unrecognized" warnings here, we fall
12395                back into the quick-grab loop below */
12396             RExC_parse = parse_start;
12397             goto defchar;
12398         } /* end of switch on a \foo sequence */
12399         break;
12400
12401     case '#':
12402
12403         /* '#' comments should have been spaced over before this function was
12404          * called */
12405         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
12406         /*
12407         if (RExC_flags & RXf_PMf_EXTENDED) {
12408             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12409             if (RExC_parse < RExC_end)
12410                 goto tryagain;
12411         }
12412         */
12413
12414         /* FALLTHROUGH */
12415
12416     default:
12417           defchar: {
12418
12419             /* Here, we have determined that the next thing is probably a
12420              * literal character.  RExC_parse points to the first byte of its
12421              * definition.  (It still may be an escape sequence that evaluates
12422              * to a single character) */
12423
12424             STRLEN len = 0;
12425             UV ender = 0;
12426             char *p;
12427             char *s;
12428 #define MAX_NODE_STRING_SIZE 127
12429             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12430             char *s0;
12431             U8 upper_parse = MAX_NODE_STRING_SIZE;
12432             U8 node_type = compute_EXACTish(pRExC_state);
12433             bool next_is_quantifier;
12434             char * oldp = NULL;
12435
12436             /* We can convert EXACTF nodes to EXACTFU if they contain only
12437              * characters that match identically regardless of the target
12438              * string's UTF8ness.  The reason to do this is that EXACTF is not
12439              * trie-able, EXACTFU is.
12440              *
12441              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
12442              * contain only above-Latin1 characters (hence must be in UTF8),
12443              * which don't participate in folds with Latin1-range characters,
12444              * as the latter's folds aren't known until runtime.  (We don't
12445              * need to figure this out until pass 2) */
12446             bool maybe_exactfu = PASS2
12447                                && (node_type == EXACTF || node_type == EXACTFL);
12448
12449             /* If a folding node contains only code points that don't
12450              * participate in folds, it can be changed into an EXACT node,
12451              * which allows the optimizer more things to look for */
12452             bool maybe_exact;
12453
12454             ret = reg_node(pRExC_state, node_type);
12455
12456             /* In pass1, folded, we use a temporary buffer instead of the
12457              * actual node, as the node doesn't exist yet */
12458             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12459
12460             s0 = s;
12461
12462           reparse:
12463
12464             /* We look for the EXACTFish to EXACT node optimizaton only if
12465              * folding.  (And we don't need to figure this out until pass 2).
12466              * XXX It might actually make sense to split the node into portions
12467              * that are exact and ones that aren't, so that we could later use
12468              * the exact ones to find the longest fixed and floating strings.
12469              * One would want to join them back into a larger node.  One could
12470              * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
12471             maybe_exact = FOLD && PASS2;
12472
12473             /* XXX The node can hold up to 255 bytes, yet this only goes to
12474              * 127.  I (khw) do not know why.  Keeping it somewhat less than
12475              * 255 allows us to not have to worry about overflow due to
12476              * converting to utf8 and fold expansion, but that value is
12477              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
12478              * split up by this limit into a single one using the real max of
12479              * 255.  Even at 127, this breaks under rare circumstances.  If
12480              * folding, we do not want to split a node at a character that is a
12481              * non-final in a multi-char fold, as an input string could just
12482              * happen to want to match across the node boundary.  The join
12483              * would solve that problem if the join actually happens.  But a
12484              * series of more than two nodes in a row each of 127 would cause
12485              * the first join to succeed to get to 254, but then there wouldn't
12486              * be room for the next one, which could at be one of those split
12487              * multi-char folds.  I don't know of any fool-proof solution.  One
12488              * could back off to end with only a code point that isn't such a
12489              * non-final, but it is possible for there not to be any in the
12490              * entire node. */
12491
12492             assert(   ! UTF     /* Is at the beginning of a character */
12493                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
12494                    || UTF8_IS_START(UCHARAT(RExC_parse)));
12495
12496             for (p = RExC_parse;
12497                  len < upper_parse && p < RExC_end;
12498                  len++)
12499             {
12500                 oldp = p;
12501
12502                 /* White space has already been ignored */
12503                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
12504                        || ! is_PATWS_safe((p), RExC_end, UTF));
12505
12506                 switch ((U8)*p) {
12507                 case '^':
12508                 case '$':
12509                 case '.':
12510                 case '[':
12511                 case '(':
12512                 case ')':
12513                 case '|':
12514                     goto loopdone;
12515                 case '\\':
12516                     /* Literal Escapes Switch
12517
12518                        This switch is meant to handle escape sequences that
12519                        resolve to a literal character.
12520
12521                        Every escape sequence that represents something
12522                        else, like an assertion or a char class, is handled
12523                        in the switch marked 'Special Escapes' above in this
12524                        routine, but also has an entry here as anything that
12525                        isn't explicitly mentioned here will be treated as
12526                        an unescaped equivalent literal.
12527                     */
12528
12529                     switch ((U8)*++p) {
12530                     /* These are all the special escapes. */
12531                     case 'A':             /* Start assertion */
12532                     case 'b': case 'B':   /* Word-boundary assertion*/
12533                     case 'C':             /* Single char !DANGEROUS! */
12534                     case 'd': case 'D':   /* digit class */
12535                     case 'g': case 'G':   /* generic-backref, pos assertion */
12536                     case 'h': case 'H':   /* HORIZWS */
12537                     case 'k': case 'K':   /* named backref, keep marker */
12538                     case 'p': case 'P':   /* Unicode property */
12539                               case 'R':   /* LNBREAK */
12540                     case 's': case 'S':   /* space class */
12541                     case 'v': case 'V':   /* VERTWS */
12542                     case 'w': case 'W':   /* word class */
12543                     case 'X':             /* eXtended Unicode "combining
12544                                              character sequence" */
12545                     case 'z': case 'Z':   /* End of line/string assertion */
12546                         --p;
12547                         goto loopdone;
12548
12549                     /* Anything after here is an escape that resolves to a
12550                        literal. (Except digits, which may or may not)
12551                      */
12552                     case 'n':
12553                         ender = '\n';
12554                         p++;
12555                         break;
12556                     case 'N': /* Handle a single-code point named character. */
12557                         RExC_parse = p + 1;
12558                         if (! grok_bslash_N(pRExC_state,
12559                                             NULL,   /* Fail if evaluates to
12560                                                        anything other than a
12561                                                        single code point */
12562                                             &ender, /* The returned single code
12563                                                        point */
12564                                             NULL,   /* Don't need a count of
12565                                                        how many code points */
12566                                             flagp,
12567                                             depth)
12568                         ) {
12569                             if (*flagp & NEED_UTF8)
12570                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
12571                             if (*flagp & RESTART_PASS1)
12572                                 return NULL;
12573
12574                             /* Here, it wasn't a single code point.  Go close
12575                              * up this EXACTish node.  The switch() prior to
12576                              * this switch handles the other cases */
12577                             RExC_parse = p = oldp;
12578                             goto loopdone;
12579                         }
12580                         p = RExC_parse;
12581                         if (ender > 0xff) {
12582                             REQUIRE_UTF8(flagp);
12583                         }
12584                         break;
12585                     case 'r':
12586                         ender = '\r';
12587                         p++;
12588                         break;
12589                     case 't':
12590                         ender = '\t';
12591                         p++;
12592                         break;
12593                     case 'f':
12594                         ender = '\f';
12595                         p++;
12596                         break;
12597                     case 'e':
12598                         ender = ESC_NATIVE;
12599                         p++;
12600                         break;
12601                     case 'a':
12602                         ender = '\a';
12603                         p++;
12604                         break;
12605                     case 'o':
12606                         {
12607                             UV result;
12608                             const char* error_msg;
12609
12610                             bool valid = grok_bslash_o(&p,
12611                                                        &result,
12612                                                        &error_msg,
12613                                                        PASS2, /* out warnings */
12614                                                        (bool) RExC_strict,
12615                                                        TRUE, /* Output warnings
12616                                                                 for non-
12617                                                                 portables */
12618                                                        UTF);
12619                             if (! valid) {
12620                                 RExC_parse = p; /* going to die anyway; point
12621                                                    to exact spot of failure */
12622                                 vFAIL(error_msg);
12623                             }
12624                             ender = result;
12625                             if (IN_ENCODING && ender < 0x100) {
12626                                 goto recode_encoding;
12627                             }
12628                             if (ender > 0xff) {
12629                                 REQUIRE_UTF8(flagp);
12630                             }
12631                             break;
12632                         }
12633                     case 'x':
12634                         {
12635                             UV result = UV_MAX; /* initialize to erroneous
12636                                                    value */
12637                             const char* error_msg;
12638
12639                             bool valid = grok_bslash_x(&p,
12640                                                        &result,
12641                                                        &error_msg,
12642                                                        PASS2, /* out warnings */
12643                                                        (bool) RExC_strict,
12644                                                        TRUE, /* Silence warnings
12645                                                                 for non-
12646                                                                 portables */
12647                                                        UTF);
12648                             if (! valid) {
12649                                 RExC_parse = p; /* going to die anyway; point
12650                                                    to exact spot of failure */
12651                                 vFAIL(error_msg);
12652                             }
12653                             ender = result;
12654
12655                             if (ender < 0x100) {
12656 #ifdef EBCDIC
12657                                 if (RExC_recode_x_to_native) {
12658                                     ender = LATIN1_TO_NATIVE(ender);
12659                                 }
12660                                 else
12661 #endif
12662                                 if (IN_ENCODING) {
12663                                     goto recode_encoding;
12664                                 }
12665                             }
12666                             else {
12667                                 REQUIRE_UTF8(flagp);
12668                             }
12669                             break;
12670                         }
12671                     case 'c':
12672                         p++;
12673                         ender = grok_bslash_c(*p++, PASS2);
12674                         break;
12675                     case '8': case '9': /* must be a backreference */
12676                         --p;
12677                         /* we have an escape like \8 which cannot be an octal escape
12678                          * so we exit the loop, and let the outer loop handle this
12679                          * escape which may or may not be a legitimate backref. */
12680                         goto loopdone;
12681                     case '1': case '2': case '3':case '4':
12682                     case '5': case '6': case '7':
12683                         /* When we parse backslash escapes there is ambiguity
12684                          * between backreferences and octal escapes. Any escape
12685                          * from \1 - \9 is a backreference, any multi-digit
12686                          * escape which does not start with 0 and which when
12687                          * evaluated as decimal could refer to an already
12688                          * parsed capture buffer is a back reference. Anything
12689                          * else is octal.
12690                          *
12691                          * Note this implies that \118 could be interpreted as
12692                          * 118 OR as "\11" . "8" depending on whether there
12693                          * were 118 capture buffers defined already in the
12694                          * pattern.  */
12695
12696                         /* NOTE, RExC_npar is 1 more than the actual number of
12697                          * parens we have seen so far, hence the < RExC_npar below. */
12698
12699                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12700                         {  /* Not to be treated as an octal constant, go
12701                                    find backref */
12702                             --p;
12703                             goto loopdone;
12704                         }
12705                         /* FALLTHROUGH */
12706                     case '0':
12707                         {
12708                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12709                             STRLEN numlen = 3;
12710                             ender = grok_oct(p, &numlen, &flags, NULL);
12711                             if (ender > 0xff) {
12712                                 REQUIRE_UTF8(flagp);
12713                             }
12714                             p += numlen;
12715                             if (PASS2   /* like \08, \178 */
12716                                 && numlen < 3
12717                                 && p < RExC_end
12718                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12719                             {
12720                                 reg_warn_non_literal_string(
12721                                          p + 1,
12722                                          form_short_octal_warning(p, numlen));
12723                             }
12724                         }
12725                         if (IN_ENCODING && ender < 0x100)
12726                             goto recode_encoding;
12727                         break;
12728                       recode_encoding:
12729                         if (! RExC_override_recoding) {
12730                             SV* enc = _get_encoding();
12731                             ender = reg_recode((U8)ender, &enc);
12732                             if (!enc && PASS2)
12733                                 ckWARNreg(p, "Invalid escape in the specified encoding");
12734                             REQUIRE_UTF8(flagp);
12735                         }
12736                         break;
12737                     case '\0':
12738                         if (p >= RExC_end)
12739                             FAIL("Trailing \\");
12740                         /* FALLTHROUGH */
12741                     default:
12742                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12743                             /* Include any left brace following the alpha to emphasize
12744                              * that it could be part of an escape at some point
12745                              * in the future */
12746                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12747                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12748                         }
12749                         goto normal_default;
12750                     } /* End of switch on '\' */
12751                     break;
12752                 case '{':
12753                     /* Currently we don't warn when the lbrace is at the start
12754                      * of a construct.  This catches it in the middle of a
12755                      * literal string, or when it's the first thing after
12756                      * something like "\b" */
12757                     if (! SIZE_ONLY
12758                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12759                     {
12760                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12761                     }
12762                     /*FALLTHROUGH*/
12763                 default:    /* A literal character */
12764                   normal_default:
12765                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
12766                         STRLEN numlen;
12767                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12768                                                &numlen, UTF8_ALLOW_DEFAULT);
12769                         p += numlen;
12770                     }
12771                     else
12772                         ender = (U8) *p++;
12773                     break;
12774                 } /* End of switch on the literal */
12775
12776                 /* Here, have looked at the literal character and <ender>
12777                  * contains its ordinal, <p> points to the character after it.
12778                  * We need to check if the next non-ignored thing is a
12779                  * quantifier.  Move <p> to after anything that should be
12780                  * ignored, which, as a side effect, positions <p> for the next
12781                  * loop iteration */
12782                 skip_to_be_ignored_text(pRExC_state, &p,
12783                                         FALSE /* Don't force to /x */ );
12784
12785                 /* If the next thing is a quantifier, it applies to this
12786                  * character only, which means that this character has to be in
12787                  * its own node and can't just be appended to the string in an
12788                  * existing node, so if there are already other characters in
12789                  * the node, close the node with just them, and set up to do
12790                  * this character again next time through, when it will be the
12791                  * only thing in its new node */
12792                 if ((next_is_quantifier = (   LIKELY(p < RExC_end)
12793                                            && UNLIKELY(ISMULT2(p))))
12794                     && LIKELY(len))
12795                 {
12796                     p = oldp;
12797                     goto loopdone;
12798                 }
12799
12800                 /* Ready to add 'ender' to the node */
12801
12802                 if (! FOLD) {  /* The simple case, just append the literal */
12803
12804                     /* In the sizing pass, we need only the size of the
12805                      * character we are appending, hence we can delay getting
12806                      * its representation until PASS2. */
12807                     if (SIZE_ONLY) {
12808                         if (UTF) {
12809                             const STRLEN unilen = UVCHR_SKIP(ender);
12810                             s += unilen;
12811
12812                             /* We have to subtract 1 just below (and again in
12813                              * the corresponding PASS2 code) because the loop
12814                              * increments <len> each time, as all but this path
12815                              * (and one other) through it add a single byte to
12816                              * the EXACTish node.  But these paths would change
12817                              * len to be the correct final value, so cancel out
12818                              * the increment that follows */
12819                             len += unilen - 1;
12820                         }
12821                         else {
12822                             s++;
12823                         }
12824                     } else { /* PASS2 */
12825                       not_fold_common:
12826                         if (UTF) {
12827                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
12828                             len += (char *) new_s - s - 1;
12829                             s = (char *) new_s;
12830                         }
12831                         else {
12832                             *(s++) = (char) ender;
12833                         }
12834                     }
12835                 }
12836                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
12837
12838                     /* Here are folding under /l, and the code point is
12839                      * problematic.  First, we know we can't simplify things */
12840                     maybe_exact = FALSE;
12841                     maybe_exactfu = FALSE;
12842
12843                     /* A problematic code point in this context means that its
12844                      * fold isn't known until runtime, so we can't fold it now.
12845                      * (The non-problematic code points are the above-Latin1
12846                      * ones that fold to also all above-Latin1.  Their folds
12847                      * don't vary no matter what the locale is.) But here we
12848                      * have characters whose fold depends on the locale.
12849                      * Unlike the non-folding case above, we have to keep track
12850                      * of these in the sizing pass, so that we can make sure we
12851                      * don't split too-long nodes in the middle of a potential
12852                      * multi-char fold.  And unlike the regular fold case
12853                      * handled in the else clauses below, we don't actually
12854                      * fold and don't have special cases to consider.  What we
12855                      * do for both passes is the PASS2 code for non-folding */
12856                     goto not_fold_common;
12857                 }
12858                 else /* A regular FOLD code point */
12859                     if (! (   UTF
12860 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
12861    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
12862                                       || UNICODE_DOT_DOT_VERSION > 0)
12863                             /* See comments for join_exact() as to why we fold
12864                              * this non-UTF at compile time */
12865                             || (   node_type == EXACTFU
12866                                 && ender == LATIN_SMALL_LETTER_SHARP_S)
12867 #endif
12868                 )) {
12869                     /* Here, are folding and are not UTF-8 encoded; therefore
12870                      * the character must be in the range 0-255, and is not /l
12871                      * (Not /l because we already handled these under /l in
12872                      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12873                     if (IS_IN_SOME_FOLD_L1(ender)) {
12874                         maybe_exact = FALSE;
12875
12876                         /* See if the character's fold differs between /d and
12877                          * /u.  This includes the multi-char fold SHARP S to
12878                          * 'ss' */
12879                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
12880                             RExC_seen_unfolded_sharp_s = 1;
12881                             maybe_exactfu = FALSE;
12882                         }
12883                         else if (maybe_exactfu
12884                             && (PL_fold[ender] != PL_fold_latin1[ender]
12885 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
12886    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
12887                                       || UNICODE_DOT_DOT_VERSION > 0)
12888                                 || (   len > 0
12889                                     && isALPHA_FOLD_EQ(ender, 's')
12890                                     && isALPHA_FOLD_EQ(*(s-1), 's'))
12891 #endif
12892                         )) {
12893                             maybe_exactfu = FALSE;
12894                         }
12895                     }
12896
12897                     /* Even when folding, we store just the input character, as
12898                      * we have an array that finds its fold quickly */
12899                     *(s++) = (char) ender;
12900                 }
12901                 else {  /* FOLD, and UTF (or sharp s) */
12902                     /* Unlike the non-fold case, we do actually have to
12903                      * calculate the results here in pass 1.  This is for two
12904                      * reasons, the folded length may be longer than the
12905                      * unfolded, and we have to calculate how many EXACTish
12906                      * nodes it will take; and we may run out of room in a node
12907                      * in the middle of a potential multi-char fold, and have
12908                      * to back off accordingly.  */
12909
12910                     UV folded;
12911                     if (isASCII_uni(ender)) {
12912                         folded = toFOLD(ender);
12913                         *(s)++ = (U8) folded;
12914                     }
12915                     else {
12916                         STRLEN foldlen;
12917
12918                         folded = _to_uni_fold_flags(
12919                                      ender,
12920                                      (U8 *) s,
12921                                      &foldlen,
12922                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12923                                                         ? FOLD_FLAGS_NOMIX_ASCII
12924                                                         : 0));
12925                         s += foldlen;
12926
12927                         /* The loop increments <len> each time, as all but this
12928                          * path (and one other) through it add a single byte to
12929                          * the EXACTish node.  But this one has changed len to
12930                          * be the correct final value, so subtract one to
12931                          * cancel out the increment that follows */
12932                         len += foldlen - 1;
12933                     }
12934                     /* If this node only contains non-folding code points so
12935                      * far, see if this new one is also non-folding */
12936                     if (maybe_exact) {
12937                         if (folded != ender) {
12938                             maybe_exact = FALSE;
12939                         }
12940                         else {
12941                             /* Here the fold is the original; we have to check
12942                              * further to see if anything folds to it */
12943                             if (_invlist_contains_cp(PL_utf8_foldable,
12944                                                         ender))
12945                             {
12946                                 maybe_exact = FALSE;
12947                             }
12948                         }
12949                     }
12950                     ender = folded;
12951                 }
12952
12953                 if (next_is_quantifier) {
12954
12955                     /* Here, the next input is a quantifier, and to get here,
12956                      * the current character is the only one in the node.
12957                      * Also, here <len> doesn't include the final byte for this
12958                      * character */
12959                     len++;
12960                     goto loopdone;
12961                 }
12962
12963             } /* End of loop through literal characters */
12964
12965             /* Here we have either exhausted the input or ran out of room in
12966              * the node.  (If we encountered a character that can't be in the
12967              * node, transfer is made directly to <loopdone>, and so we
12968              * wouldn't have fallen off the end of the loop.)  In the latter
12969              * case, we artificially have to split the node into two, because
12970              * we just don't have enough space to hold everything.  This
12971              * creates a problem if the final character participates in a
12972              * multi-character fold in the non-final position, as a match that
12973              * should have occurred won't, due to the way nodes are matched,
12974              * and our artificial boundary.  So back off until we find a non-
12975              * problematic character -- one that isn't at the beginning or
12976              * middle of such a fold.  (Either it doesn't participate in any
12977              * folds, or appears only in the final position of all the folds it
12978              * does participate in.)  A better solution with far fewer false
12979              * positives, and that would fill the nodes more completely, would
12980              * be to actually have available all the multi-character folds to
12981              * test against, and to back-off only far enough to be sure that
12982              * this node isn't ending with a partial one.  <upper_parse> is set
12983              * further below (if we need to reparse the node) to include just
12984              * up through that final non-problematic character that this code
12985              * identifies, so when it is set to less than the full node, we can
12986              * skip the rest of this */
12987             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12988
12989                 const STRLEN full_len = len;
12990
12991                 assert(len >= MAX_NODE_STRING_SIZE);
12992
12993                 /* Here, <s> points to the final byte of the final character.
12994                  * Look backwards through the string until find a non-
12995                  * problematic character */
12996
12997                 if (! UTF) {
12998
12999                     /* This has no multi-char folds to non-UTF characters */
13000                     if (ASCII_FOLD_RESTRICTED) {
13001                         goto loopdone;
13002                     }
13003
13004                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
13005                     len = s - s0 + 1;
13006                 }
13007                 else {
13008                     if (!  PL_NonL1NonFinalFold) {
13009                         PL_NonL1NonFinalFold = _new_invlist_C_array(
13010                                         NonL1_Perl_Non_Final_Folds_invlist);
13011                     }
13012
13013                     /* Point to the first byte of the final character */
13014                     s = (char *) utf8_hop((U8 *) s, -1);
13015
13016                     while (s >= s0) {   /* Search backwards until find
13017                                            non-problematic char */
13018                         if (UTF8_IS_INVARIANT(*s)) {
13019
13020                             /* There are no ascii characters that participate
13021                              * in multi-char folds under /aa.  In EBCDIC, the
13022                              * non-ascii invariants are all control characters,
13023                              * so don't ever participate in any folds. */
13024                             if (ASCII_FOLD_RESTRICTED
13025                                 || ! IS_NON_FINAL_FOLD(*s))
13026                             {
13027                                 break;
13028                             }
13029                         }
13030                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
13031                             if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
13032                                                                   *s, *(s+1))))
13033                             {
13034                                 break;
13035                             }
13036                         }
13037                         else if (! _invlist_contains_cp(
13038                                         PL_NonL1NonFinalFold,
13039                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
13040                         {
13041                             break;
13042                         }
13043
13044                         /* Here, the current character is problematic in that
13045                          * it does occur in the non-final position of some
13046                          * fold, so try the character before it, but have to
13047                          * special case the very first byte in the string, so
13048                          * we don't read outside the string */
13049                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
13050                     } /* End of loop backwards through the string */
13051
13052                     /* If there were only problematic characters in the string,
13053                      * <s> will point to before s0, in which case the length
13054                      * should be 0, otherwise include the length of the
13055                      * non-problematic character just found */
13056                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
13057                 }
13058
13059                 /* Here, have found the final character, if any, that is
13060                  * non-problematic as far as ending the node without splitting
13061                  * it across a potential multi-char fold.  <len> contains the
13062                  * number of bytes in the node up-to and including that
13063                  * character, or is 0 if there is no such character, meaning
13064                  * the whole node contains only problematic characters.  In
13065                  * this case, give up and just take the node as-is.  We can't
13066                  * do any better */
13067                 if (len == 0) {
13068                     len = full_len;
13069
13070                     /* If the node ends in an 's' we make sure it stays EXACTF,
13071                      * as if it turns into an EXACTFU, it could later get
13072                      * joined with another 's' that would then wrongly match
13073                      * the sharp s */
13074                     if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
13075                     {
13076                         maybe_exactfu = FALSE;
13077                     }
13078                 } else {
13079
13080                     /* Here, the node does contain some characters that aren't
13081                      * problematic.  If one such is the final character in the
13082                      * node, we are done */
13083                     if (len == full_len) {
13084                         goto loopdone;
13085                     }
13086                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
13087
13088                         /* If the final character is problematic, but the
13089                          * penultimate is not, back-off that last character to
13090                          * later start a new node with it */
13091                         p = oldp;
13092                         goto loopdone;
13093                     }
13094
13095                     /* Here, the final non-problematic character is earlier
13096                      * in the input than the penultimate character.  What we do
13097                      * is reparse from the beginning, going up only as far as
13098                      * this final ok one, thus guaranteeing that the node ends
13099                      * in an acceptable character.  The reason we reparse is
13100                      * that we know how far in the character is, but we don't
13101                      * know how to correlate its position with the input parse.
13102                      * An alternate implementation would be to build that
13103                      * correlation as we go along during the original parse,
13104                      * but that would entail extra work for every node, whereas
13105                      * this code gets executed only when the string is too
13106                      * large for the node, and the final two characters are
13107                      * problematic, an infrequent occurrence.  Yet another
13108                      * possible strategy would be to save the tail of the
13109                      * string, and the next time regatom is called, initialize
13110                      * with that.  The problem with this is that unless you
13111                      * back off one more character, you won't be guaranteed
13112                      * regatom will get called again, unless regbranch,
13113                      * regpiece ... are also changed.  If you do back off that
13114                      * extra character, so that there is input guaranteed to
13115                      * force calling regatom, you can't handle the case where
13116                      * just the first character in the node is acceptable.  I
13117                      * (khw) decided to try this method which doesn't have that
13118                      * pitfall; if performance issues are found, we can do a
13119                      * combination of the current approach plus that one */
13120                     upper_parse = len;
13121                     len = 0;
13122                     s = s0;
13123                     goto reparse;
13124                 }
13125             }   /* End of verifying node ends with an appropriate char */
13126
13127           loopdone:   /* Jumped to when encounters something that shouldn't be
13128                          in the node */
13129
13130             /* I (khw) don't know if you can get here with zero length, but the
13131              * old code handled this situation by creating a zero-length EXACT
13132              * node.  Might as well be NOTHING instead */
13133             if (len == 0) {
13134                 OP(ret) = NOTHING;
13135             }
13136             else {
13137                 if (FOLD) {
13138                     /* If 'maybe_exact' is still set here, means there are no
13139                      * code points in the node that participate in folds;
13140                      * similarly for 'maybe_exactfu' and code points that match
13141                      * differently depending on UTF8ness of the target string
13142                      * (for /u), or depending on locale for /l */
13143                     if (maybe_exact) {
13144                         OP(ret) = (LOC)
13145                                   ? EXACTL
13146                                   : EXACT;
13147                     }
13148                     else if (maybe_exactfu) {
13149                         OP(ret) = (LOC)
13150                                   ? EXACTFLU8
13151                                   : EXACTFU;
13152                     }
13153                 }
13154                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
13155                                            FALSE /* Don't look to see if could
13156                                                     be turned into an EXACT
13157                                                     node, as we have already
13158                                                     computed that */
13159                                           );
13160             }
13161
13162             RExC_parse = p - 1;
13163             Set_Node_Cur_Length(ret, parse_start);
13164             RExC_parse = p;
13165             skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13166                                     FALSE /* Don't force to /x */ );
13167             {
13168                 /* len is STRLEN which is unsigned, need to copy to signed */
13169                 IV iv = len;
13170                 if (iv < 0)
13171                     vFAIL("Internal disaster");
13172             }
13173
13174         } /* End of label 'defchar:' */
13175         break;
13176     } /* End of giant switch on input character */
13177
13178     return(ret);
13179 }
13180
13181
13182 STATIC void
13183 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
13184 {
13185     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
13186      * sets up the bitmap and any flags, removing those code points from the
13187      * inversion list, setting it to NULL should it become completely empty */
13188
13189     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
13190     assert(PL_regkind[OP(node)] == ANYOF);
13191
13192     ANYOF_BITMAP_ZERO(node);
13193     if (*invlist_ptr) {
13194
13195         /* This gets set if we actually need to modify things */
13196         bool change_invlist = FALSE;
13197
13198         UV start, end;
13199
13200         /* Start looking through *invlist_ptr */
13201         invlist_iterinit(*invlist_ptr);
13202         while (invlist_iternext(*invlist_ptr, &start, &end)) {
13203             UV high;
13204             int i;
13205
13206             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13207                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13208             }
13209
13210             /* Quit if are above what we should change */
13211             if (start >= NUM_ANYOF_CODE_POINTS) {
13212                 break;
13213             }
13214
13215             change_invlist = TRUE;
13216
13217             /* Set all the bits in the range, up to the max that we are doing */
13218             high = (end < NUM_ANYOF_CODE_POINTS - 1)
13219                    ? end
13220                    : NUM_ANYOF_CODE_POINTS - 1;
13221             for (i = start; i <= (int) high; i++) {
13222                 if (! ANYOF_BITMAP_TEST(node, i)) {
13223                     ANYOF_BITMAP_SET(node, i);
13224                 }
13225             }
13226         }
13227         invlist_iterfinish(*invlist_ptr);
13228
13229         /* Done with loop; remove any code points that are in the bitmap from
13230          * *invlist_ptr; similarly for code points above the bitmap if we have
13231          * a flag to match all of them anyways */
13232         if (change_invlist) {
13233             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13234         }
13235         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13236             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13237         }
13238
13239         /* If have completely emptied it, remove it completely */
13240         if (_invlist_len(*invlist_ptr) == 0) {
13241             SvREFCNT_dec_NN(*invlist_ptr);
13242             *invlist_ptr = NULL;
13243         }
13244     }
13245 }
13246
13247 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13248    Character classes ([:foo:]) can also be negated ([:^foo:]).
13249    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13250    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13251    but trigger failures because they are currently unimplemented. */
13252
13253 #define POSIXCC_DONE(c)   ((c) == ':')
13254 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13255 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13256
13257 PERL_STATIC_INLINE I32
13258 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
13259 {
13260     I32 namedclass = OOB_NAMEDCLASS;
13261
13262     PERL_ARGS_ASSERT_REGPPOSIXCC;
13263
13264     if (value == '[' && RExC_parse + 1 < RExC_end &&
13265         /* I smell either [: or [= or [. -- POSIX has been here, right? */
13266         POSIXCC(UCHARAT(RExC_parse)))
13267     {
13268         const char c = UCHARAT(RExC_parse);
13269         char* const s = RExC_parse++;
13270
13271         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
13272             RExC_parse++;
13273         if (RExC_parse == RExC_end) {
13274             if (strict) {
13275
13276                 /* Try to give a better location for the error (than the end of
13277                  * the string) by looking for the matching ']' */
13278                 RExC_parse = s;
13279                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
13280                     RExC_parse++;
13281                 }
13282                 vFAIL2("Unmatched '%c' in POSIX class", c);
13283             }
13284             /* Grandfather lone [:, [=, [. */
13285             RExC_parse = s;
13286         }
13287         else {
13288             const char* const t = RExC_parse++; /* skip over the c */
13289             assert(*t == c);
13290
13291             if (UCHARAT(RExC_parse) == ']') {
13292                 const char *posixcc = s + 1;
13293                 RExC_parse++; /* skip over the ending ] */
13294
13295                 if (*s == ':') {
13296                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
13297                     const I32 skip = t - posixcc;
13298
13299                     /* Initially switch on the length of the name.  */
13300                     switch (skip) {
13301                     case 4:
13302                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
13303                                                           this is the Perl \w
13304                                                         */
13305                             namedclass = ANYOF_WORDCHAR;
13306                         break;
13307                     case 5:
13308                         /* Names all of length 5.  */
13309                         /* alnum alpha ascii blank cntrl digit graph lower
13310                            print punct space upper  */
13311                         /* Offset 4 gives the best switch position.  */
13312                         switch (posixcc[4]) {
13313                         case 'a':
13314                             if (memEQ(posixcc, "alph", 4)) /* alpha */
13315                                 namedclass = ANYOF_ALPHA;
13316                             break;
13317                         case 'e':
13318                             if (memEQ(posixcc, "spac", 4)) /* space */
13319                                 namedclass = ANYOF_SPACE;
13320                             break;
13321                         case 'h':
13322                             if (memEQ(posixcc, "grap", 4)) /* graph */
13323                                 namedclass = ANYOF_GRAPH;
13324                             break;
13325                         case 'i':
13326                             if (memEQ(posixcc, "asci", 4)) /* ascii */
13327                                 namedclass = ANYOF_ASCII;
13328                             break;
13329                         case 'k':
13330                             if (memEQ(posixcc, "blan", 4)) /* blank */
13331                                 namedclass = ANYOF_BLANK;
13332                             break;
13333                         case 'l':
13334                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
13335                                 namedclass = ANYOF_CNTRL;
13336                             break;
13337                         case 'm':
13338                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
13339                                 namedclass = ANYOF_ALPHANUMERIC;
13340                             break;
13341                         case 'r':
13342                             if (memEQ(posixcc, "lowe", 4)) /* lower */
13343                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
13344                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
13345                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
13346                             break;
13347                         case 't':
13348                             if (memEQ(posixcc, "digi", 4)) /* digit */
13349                                 namedclass = ANYOF_DIGIT;
13350                             else if (memEQ(posixcc, "prin", 4)) /* print */
13351                                 namedclass = ANYOF_PRINT;
13352                             else if (memEQ(posixcc, "punc", 4)) /* punct */
13353                                 namedclass = ANYOF_PUNCT;
13354                             break;
13355                         }
13356                         break;
13357                     case 6:
13358                         if (memEQ(posixcc, "xdigit", 6))
13359                             namedclass = ANYOF_XDIGIT;
13360                         break;
13361                     }
13362
13363                     if (namedclass == OOB_NAMEDCLASS)
13364                         vFAIL2utf8f(
13365                             "POSIX class [:%"UTF8f":] unknown",
13366                             UTF8fARG(UTF, t - s - 1, s + 1));
13367
13368                     /* The #defines are structured so each complement is +1 to
13369                      * the normal one */
13370                     if (complement) {
13371                         namedclass++;
13372                     }
13373                     assert (posixcc[skip] == ':');
13374                     assert (posixcc[skip+1] == ']');
13375                 } else if (!SIZE_ONLY) {
13376                     /* [[=foo=]] and [[.foo.]] are still future. */
13377
13378                     /* adjust RExC_parse so the warning shows after
13379                        the class closes */
13380                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13381                         RExC_parse++;
13382                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13383                 }
13384             } else {
13385                 /* Maternal grandfather:
13386                  * "[:" ending in ":" but not in ":]" */
13387                 if (strict) {
13388                     vFAIL("Unmatched '[' in POSIX class");
13389                 }
13390
13391                 /* Grandfather lone [:, [=, [. */
13392                 RExC_parse = s;
13393             }
13394         }
13395     }
13396
13397     return namedclass;
13398 }
13399
13400 STATIC bool
13401 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13402 {
13403     /* This applies some heuristics at the current parse position (which should
13404      * be at a '[') to see if what follows might be intended to be a [:posix:]
13405      * class.  It returns true if it really is a posix class, of course, but it
13406      * also can return true if it thinks that what was intended was a posix
13407      * class that didn't quite make it.
13408      *
13409      * It will return true for
13410      *      [:alphanumerics:
13411      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
13412      *                         ')' indicating the end of the (?[
13413      *      [:any garbage including %^&$ punctuation:]
13414      *
13415      * This is designed to be called only from S_handle_regex_sets; it could be
13416      * easily adapted to be called from the spot at the beginning of regclass()
13417      * that checks to see in a normal bracketed class if the surrounding []
13418      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
13419      * change long-standing behavior, so I (khw) didn't do that */
13420     char* p = RExC_parse + 1;
13421     char first_char = *p;
13422
13423     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13424
13425     assert(*(p - 1) == '[');
13426
13427     if (! POSIXCC(first_char)) {
13428         return FALSE;
13429     }
13430
13431     p++;
13432     while (p < RExC_end && isWORDCHAR(*p)) p++;
13433
13434     if (p >= RExC_end) {
13435         return FALSE;
13436     }
13437
13438     if (p - RExC_parse > 2    /* Got at least 1 word character */
13439         && (*p == first_char
13440             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13441     {
13442         return TRUE;
13443     }
13444
13445     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13446
13447     return (p
13448             && p - RExC_parse > 2 /* [:] evaluates to colon;
13449                                       [::] is a bad posix class. */
13450             && first_char == *(p - 1));
13451 }
13452
13453 STATIC unsigned  int
13454 S_regex_set_precedence(const U8 my_operator) {
13455
13456     /* Returns the precedence in the (?[...]) construct of the input operator,
13457      * specified by its character representation.  The precedence follows
13458      * general Perl rules, but it extends this so that ')' and ']' have (low)
13459      * precedence even though they aren't really operators */
13460
13461     switch (my_operator) {
13462         case '!':
13463             return 5;
13464         case '&':
13465             return 4;
13466         case '^':
13467         case '|':
13468         case '+':
13469         case '-':
13470             return 3;
13471         case ')':
13472             return 2;
13473         case ']':
13474             return 1;
13475     }
13476
13477     NOT_REACHED; /* NOTREACHED */
13478     return 0;   /* Silence compiler warning */
13479 }
13480
13481 STATIC regnode *
13482 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13483                     I32 *flagp, U32 depth,
13484                     char * const oregcomp_parse)
13485 {
13486     /* Handle the (?[...]) construct to do set operations */
13487
13488     U8 curchar;                     /* Current character being parsed */
13489     UV start, end;                  /* End points of code point ranges */
13490     SV* final = NULL;               /* The end result inversion list */
13491     SV* result_string;              /* 'final' stringified */
13492     AV* stack;                      /* stack of operators and operands not yet
13493                                        resolved */
13494     AV* fence_stack = NULL;         /* A stack containing the positions in
13495                                        'stack' of where the undealt-with left
13496                                        parens would be if they were actually
13497                                        put there */
13498     IV fence = 0;                   /* Position of where most recent undealt-
13499                                        with left paren in stack is; -1 if none.
13500                                      */
13501     STRLEN len;                     /* Temporary */
13502     regnode* node;                  /* Temporary, and final regnode returned by
13503                                        this function */
13504     const bool save_fold = FOLD;    /* Temporary */
13505     char *save_end, *save_parse;    /* Temporaries */
13506     const bool in_locale = LOC;     /* we turn off /l during processing */
13507
13508     GET_RE_DEBUG_FLAGS_DECL;
13509
13510     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13511
13512     if (in_locale) {
13513         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
13514     }
13515
13516     REQUIRE_UNI_RULES(flagp, NULL);   /* The use of this operator implies /u.
13517                                          This is required so that the compile
13518                                          time values are valid in all runtime
13519                                          cases */
13520
13521     /* This will return only an ANYOF regnode, or (unlikely) something smaller
13522      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
13523      * call regclass to handle '[]' so as to not have to reinvent its parsing
13524      * rules here (throwing away the size it computes each time).  And, we exit
13525      * upon an unescaped ']' that isn't one ending a regclass.  To do both
13526      * these things, we need to realize that something preceded by a backslash
13527      * is escaped, so we have to keep track of backslashes */
13528     if (SIZE_ONLY) {
13529         UV depth = 0; /* how many nested (?[...]) constructs */
13530
13531         while (RExC_parse < RExC_end) {
13532             SV* current = NULL;
13533
13534             skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13535                                     TRUE /* Force /x */ );
13536
13537             switch (*RExC_parse) {
13538                 case '?':
13539                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
13540                     /* FALLTHROUGH */
13541                 default:
13542                     break;
13543                 case '\\':
13544                     /* Skip the next byte (which could cause us to end up in
13545                      * the middle of a UTF-8 character, but since none of those
13546                      * are confusable with anything we currently handle in this
13547                      * switch (invariants all), it's safe.  We'll just hit the
13548                      * default: case next time and keep on incrementing until
13549                      * we find one of the invariants we do handle. */
13550                     RExC_parse++;
13551                     if (*RExC_parse == 'c') {
13552                             /* Skip the \cX notation for control characters */
13553                             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
13554                     }
13555                     break;
13556                 case '[':
13557                 {
13558                     /* If this looks like it is a [:posix:] class, leave the
13559                      * parse pointer at the '[' to fool regclass() into
13560                      * thinking it is part of a '[[:posix:]]'.  That function
13561                      * will use strict checking to force a syntax error if it
13562                      * doesn't work out to a legitimate class */
13563                     bool is_posix_class
13564                                     = could_it_be_a_POSIX_class(pRExC_state);
13565                     if (! is_posix_class) {
13566                         RExC_parse++;
13567                     }
13568
13569                     /* regclass() can only return RESTART_PASS1 and NEED_UTF8
13570                      * if multi-char folds are allowed.  */
13571                     if (!regclass(pRExC_state, flagp,depth+1,
13572                                   is_posix_class, /* parse the whole char
13573                                                      class only if not a
13574                                                      posix class */
13575                                   FALSE, /* don't allow multi-char folds */
13576                                   TRUE, /* silence non-portable warnings. */
13577                                   TRUE, /* strict */
13578                                   FALSE, /* Require return to be an ANYOF */
13579                                   &current
13580                                  ))
13581                         FAIL2("panic: regclass returned NULL to handle_sets, "
13582                               "flags=%#"UVxf"", (UV) *flagp);
13583
13584                     /* function call leaves parse pointing to the ']', except
13585                      * if we faked it */
13586                     if (is_posix_class) {
13587                         RExC_parse--;
13588                     }
13589
13590                     SvREFCNT_dec(current);   /* In case it returned something */
13591                     break;
13592                 }
13593
13594                 case ']':
13595                     if (depth--) break;
13596                     RExC_parse++;
13597                     if (RExC_parse < RExC_end
13598                         && *RExC_parse == ')')
13599                     {
13600                         node = reganode(pRExC_state, ANYOF, 0);
13601                         RExC_size += ANYOF_SKIP;
13602                         nextchar(pRExC_state);
13603                         Set_Node_Length(node,
13604                                 RExC_parse - oregcomp_parse + 1); /* MJD */
13605                         if (in_locale) {
13606                             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
13607                         }
13608
13609                         return node;
13610                     }
13611                     goto no_close;
13612             }
13613
13614             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
13615         }
13616
13617       no_close:
13618         FAIL("Syntax error in (?[...])");
13619     }
13620
13621     /* Pass 2 only after this. */
13622     Perl_ck_warner_d(aTHX_
13623         packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13624         "The regex_sets feature is experimental" REPORT_LOCATION,
13625         REPORT_LOCATION_ARGS(RExC_parse));
13626
13627     /* Everything in this construct is a metacharacter.  Operands begin with
13628      * either a '\' (for an escape sequence), or a '[' for a bracketed
13629      * character class.  Any other character should be an operator, or
13630      * parenthesis for grouping.  Both types of operands are handled by calling
13631      * regclass() to parse them.  It is called with a parameter to indicate to
13632      * return the computed inversion list.  The parsing here is implemented via
13633      * a stack.  Each entry on the stack is a single character representing one
13634      * of the operators; or else a pointer to an operand inversion list. */
13635
13636 #define IS_OPERATOR(a) SvIOK(a)
13637 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
13638
13639     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
13640      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
13641      * with pronouncing it called it Reverse Polish instead, but now that YOU
13642      * know how to pronounce it you can use the correct term, thus giving due
13643      * credit to the person who invented it, and impressing your geek friends.
13644      * Wikipedia says that the pronounciation of "Ł" has been changing so that
13645      * it is now more like an English initial W (as in wonk) than an L.)
13646      *
13647      * This means that, for example, 'a | b & c' is stored on the stack as
13648      *
13649      * c  [4]
13650      * b  [3]
13651      * &  [2]
13652      * a  [1]
13653      * |  [0]
13654      *
13655      * where the numbers in brackets give the stack [array] element number.
13656      * In this implementation, parentheses are not stored on the stack.
13657      * Instead a '(' creates a "fence" so that the part of the stack below the
13658      * fence is invisible except to the corresponding ')' (this allows us to
13659      * replace testing for parens, by using instead subtraction of the fence
13660      * position).  As new operands are processed they are pushed onto the stack
13661      * (except as noted in the next paragraph).  New operators of higher
13662      * precedence than the current final one are inserted on the stack before
13663      * the lhs operand (so that when the rhs is pushed next, everything will be
13664      * in the correct positions shown above.  When an operator of equal or
13665      * lower precedence is encountered in parsing, all the stacked operations
13666      * of equal or higher precedence are evaluated, leaving the result as the
13667      * top entry on the stack.  This makes higher precedence operations
13668      * evaluate before lower precedence ones, and causes operations of equal
13669      * precedence to left associate.
13670      *
13671      * The only unary operator '!' is immediately pushed onto the stack when
13672      * encountered.  When an operand is encountered, if the top of the stack is
13673      * a '!", the complement is immediately performed, and the '!' popped.  The
13674      * resulting value is treated as a new operand, and the logic in the
13675      * previous paragraph is executed.  Thus in the expression
13676      *      [a] + ! [b]
13677      * the stack looks like
13678      *
13679      * !
13680      * a
13681      * +
13682      *
13683      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
13684      * becomes
13685      *
13686      * !b
13687      * a
13688      * +
13689      *
13690      * A ')' is treated as an operator with lower precedence than all the
13691      * aforementioned ones, which causes all operations on the stack above the
13692      * corresponding '(' to be evaluated down to a single resultant operand.
13693      * Then the fence for the '(' is removed, and the operand goes through the
13694      * algorithm above, without the fence.
13695      *
13696      * A separate stack is kept of the fence positions, so that the position of
13697      * the latest so-far unbalanced '(' is at the top of it.
13698      *
13699      * The ']' ending the construct is treated as the lowest operator of all,
13700      * so that everything gets evaluated down to a single operand, which is the
13701      * result */
13702
13703     sv_2mortal((SV *)(stack = newAV()));
13704     sv_2mortal((SV *)(fence_stack = newAV()));
13705
13706     while (RExC_parse < RExC_end) {
13707         I32 top_index;              /* Index of top-most element in 'stack' */
13708         SV** top_ptr;               /* Pointer to top 'stack' element */
13709         SV* current = NULL;         /* To contain the current inversion list
13710                                        operand */
13711         SV* only_to_avoid_leaks;
13712
13713         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13714                                 TRUE /* Force /x */ );
13715         if (RExC_parse >= RExC_end) {
13716             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13717         }
13718
13719         curchar = UCHARAT(RExC_parse);
13720
13721 redo_curchar:
13722
13723         top_index = av_tindex(stack);
13724
13725         switch (curchar) {
13726             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
13727             char stacked_operator;  /* The topmost operator on the 'stack'. */
13728             SV* lhs;                /* Operand to the left of the operator */
13729             SV* rhs;                /* Operand to the right of the operator */
13730             SV* fence_ptr;          /* Pointer to top element of the fence
13731                                        stack */
13732
13733             case '(':
13734
13735                 if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
13736                 {
13737                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13738                      * This happens when we have some thing like
13739                      *
13740                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13741                      *   ...
13742                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
13743                      *
13744                      * Here we would be handling the interpolated
13745                      * '$thai_or_lao'.  We handle this by a recursive call to
13746                      * ourselves which returns the inversion list the
13747                      * interpolated expression evaluates to.  We use the flags
13748                      * from the interpolated pattern. */
13749                     U32 save_flags = RExC_flags;
13750                     const char * save_parse;
13751
13752                     RExC_parse += 2;        /* Skip past the '(?' */
13753                     save_parse = RExC_parse;
13754
13755                     /* Parse any flags for the '(?' */
13756                     parse_lparen_question_flags(pRExC_state);
13757
13758                     if (RExC_parse == save_parse  /* Makes sure there was at
13759                                                      least one flag (or else
13760                                                      this embedding wasn't
13761                                                      compiled) */
13762                         || RExC_parse >= RExC_end - 4
13763                         || UCHARAT(RExC_parse) != ':'
13764                         || UCHARAT(++RExC_parse) != '('
13765                         || UCHARAT(++RExC_parse) != '?'
13766                         || UCHARAT(++RExC_parse) != '[')
13767                     {
13768
13769                         /* In combination with the above, this moves the
13770                          * pointer to the point just after the first erroneous
13771                          * character (or if there are no flags, to where they
13772                          * should have been) */
13773                         if (RExC_parse >= RExC_end - 4) {
13774                             RExC_parse = RExC_end;
13775                         }
13776                         else if (RExC_parse != save_parse) {
13777                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13778                         }
13779                         vFAIL("Expecting '(?flags:(?[...'");
13780                     }
13781
13782                     /* Recurse, with the meat of the embedded expression */
13783                     RExC_parse++;
13784                     (void) handle_regex_sets(pRExC_state, &current, flagp,
13785                                                     depth+1, oregcomp_parse);
13786
13787                     /* Here, 'current' contains the embedded expression's
13788                      * inversion list, and RExC_parse points to the trailing
13789                      * ']'; the next character should be the ')' */
13790                     RExC_parse++;
13791                     assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13792
13793                     /* Then the ')' matching the original '(' handled by this
13794                      * case: statement */
13795                     RExC_parse++;
13796                     assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13797
13798                     RExC_parse++;
13799                     RExC_flags = save_flags;
13800                     goto handle_operand;
13801                 }
13802
13803                 /* A regular '('.  Look behind for illegal syntax */
13804                 if (top_index - fence >= 0) {
13805                     /* If the top entry on the stack is an operator, it had
13806                      * better be a '!', otherwise the entry below the top
13807                      * operand should be an operator */
13808                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
13809                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
13810                         || (   IS_OPERAND(*top_ptr)
13811                             && (   top_index - fence < 1
13812                                 || ! (stacked_ptr = av_fetch(stack,
13813                                                              top_index - 1,
13814                                                              FALSE))
13815                                 || ! IS_OPERATOR(*stacked_ptr))))
13816                     {
13817                         RExC_parse++;
13818                         vFAIL("Unexpected '(' with no preceding operator");
13819                     }
13820                 }
13821
13822                 /* Stack the position of this undealt-with left paren */
13823                 fence = top_index + 1;
13824                 av_push(fence_stack, newSViv(fence));
13825                 break;
13826
13827             case '\\':
13828                 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
13829                  * multi-char folds are allowed.  */
13830                 if (!regclass(pRExC_state, flagp,depth+1,
13831                               TRUE, /* means parse just the next thing */
13832                               FALSE, /* don't allow multi-char folds */
13833                               FALSE, /* don't silence non-portable warnings.  */
13834                               TRUE,  /* strict */
13835                               FALSE, /* Require return to be an ANYOF */
13836                               &current))
13837                 {
13838                     FAIL2("panic: regclass returned NULL to handle_sets, "
13839                           "flags=%#"UVxf"", (UV) *flagp);
13840                 }
13841
13842                 /* regclass() will return with parsing just the \ sequence,
13843                  * leaving the parse pointer at the next thing to parse */
13844                 RExC_parse--;
13845                 goto handle_operand;
13846
13847             case '[':   /* Is a bracketed character class */
13848             {
13849                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13850
13851                 if (! is_posix_class) {
13852                     RExC_parse++;
13853                 }
13854
13855                 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
13856                  * multi-char folds are allowed.  */
13857                 if(!regclass(pRExC_state, flagp,depth+1,
13858                              is_posix_class, /* parse the whole char class
13859                                                 only if not a posix class */
13860                              FALSE, /* don't allow multi-char folds */
13861                              FALSE, /* don't silence non-portable warnings.  */
13862                              TRUE,   /* strict */
13863                              FALSE, /* Require return to be an ANYOF */
13864                              &current
13865                             ))
13866                 {
13867                     FAIL2("panic: regclass returned NULL to handle_sets, "
13868                           "flags=%#"UVxf"", (UV) *flagp);
13869                 }
13870
13871                 /* function call leaves parse pointing to the ']', except if we
13872                  * faked it */
13873                 if (is_posix_class) {
13874                     RExC_parse--;
13875                 }
13876
13877                 goto handle_operand;
13878             }
13879
13880             case ']':
13881                 if (top_index >= 1) {
13882                     goto join_operators;
13883                 }
13884
13885                 /* Only a single operand on the stack: are done */
13886                 goto done;
13887
13888             case ')':
13889                 if (av_tindex(fence_stack) < 0) {
13890                     RExC_parse++;
13891                     vFAIL("Unexpected ')'");
13892                 }
13893
13894                  /* If at least two thing on the stack, treat this as an
13895                   * operator */
13896                 if (top_index - fence >= 1) {
13897                     goto join_operators;
13898                 }
13899
13900                 /* Here only a single thing on the fenced stack, and there is a
13901                  * fence.  Get rid of it */
13902                 fence_ptr = av_pop(fence_stack);
13903                 assert(fence_ptr);
13904                 fence = SvIV(fence_ptr) - 1;
13905                 SvREFCNT_dec_NN(fence_ptr);
13906                 fence_ptr = NULL;
13907
13908                 if (fence < 0) {
13909                     fence = 0;
13910                 }
13911
13912                 /* Having gotten rid of the fence, we pop the operand at the
13913                  * stack top and process it as a newly encountered operand */
13914                 current = av_pop(stack);
13915                 if (IS_OPERAND(current)) {
13916                     goto handle_operand;
13917                 }
13918
13919                 RExC_parse++;
13920                 goto bad_syntax;
13921
13922             case '&':
13923             case '|':
13924             case '+':
13925             case '-':
13926             case '^':
13927
13928                 /* These binary operators should have a left operand already
13929                  * parsed */
13930                 if (   top_index - fence < 0
13931                     || top_index - fence == 1
13932                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13933                     || ! IS_OPERAND(*top_ptr))
13934                 {
13935                     goto unexpected_binary;
13936                 }
13937
13938                 /* If only the one operand is on the part of the stack visible
13939                  * to us, we just place this operator in the proper position */
13940                 if (top_index - fence < 2) {
13941
13942                     /* Place the operator before the operand */
13943
13944                     SV* lhs = av_pop(stack);
13945                     av_push(stack, newSVuv(curchar));
13946                     av_push(stack, lhs);
13947                     break;
13948                 }
13949
13950                 /* But if there is something else on the stack, we need to
13951                  * process it before this new operator if and only if the
13952                  * stacked operation has equal or higher precedence than the
13953                  * new one */
13954
13955              join_operators:
13956
13957                 /* The operator on the stack is supposed to be below both its
13958                  * operands */
13959                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
13960                     || IS_OPERAND(*stacked_ptr))
13961                 {
13962                     /* But if not, it's legal and indicates we are completely
13963                      * done if and only if we're currently processing a ']',
13964                      * which should be the final thing in the expression */
13965                     if (curchar == ']') {
13966                         goto done;
13967                     }
13968
13969                   unexpected_binary:
13970                     RExC_parse++;
13971                     vFAIL2("Unexpected binary operator '%c' with no "
13972                            "preceding operand", curchar);
13973                 }
13974                 stacked_operator = (char) SvUV(*stacked_ptr);
13975
13976                 if (regex_set_precedence(curchar)
13977                     > regex_set_precedence(stacked_operator))
13978                 {
13979                     /* Here, the new operator has higher precedence than the
13980                      * stacked one.  This means we need to add the new one to
13981                      * the stack to await its rhs operand (and maybe more
13982                      * stuff).  We put it before the lhs operand, leaving
13983                      * untouched the stacked operator and everything below it
13984                      * */
13985                     lhs = av_pop(stack);
13986                     assert(IS_OPERAND(lhs));
13987
13988                     av_push(stack, newSVuv(curchar));
13989                     av_push(stack, lhs);
13990                     break;
13991                 }
13992
13993                 /* Here, the new operator has equal or lower precedence than
13994                  * what's already there.  This means the operation already
13995                  * there should be performed now, before the new one. */
13996
13997                 rhs = av_pop(stack);
13998                 if (! IS_OPERAND(rhs)) {
13999
14000                     /* This can happen when a ! is not followed by an operand,
14001                      * like in /(?[\t &!])/ */
14002                     goto bad_syntax;
14003                 }
14004
14005                 lhs = av_pop(stack);
14006
14007                 if (! IS_OPERAND(lhs)) {
14008
14009                     /* This can happen when there is an empty (), like in
14010                      * /(?[[0]+()+])/ */
14011                     goto bad_syntax;
14012                 }
14013
14014                 switch (stacked_operator) {
14015                     case '&':
14016                         _invlist_intersection(lhs, rhs, &rhs);
14017                         break;
14018
14019                     case '|':
14020                     case '+':
14021                         _invlist_union(lhs, rhs, &rhs);
14022                         break;
14023
14024                     case '-':
14025                         _invlist_subtract(lhs, rhs, &rhs);
14026                         break;
14027
14028                     case '^':   /* The union minus the intersection */
14029                     {
14030                         SV* i = NULL;
14031                         SV* u = NULL;
14032                         SV* element;
14033
14034                         _invlist_union(lhs, rhs, &u);
14035                         _invlist_intersection(lhs, rhs, &i);
14036                         /* _invlist_subtract will overwrite rhs
14037                             without freeing what it already contains */
14038                         element = rhs;
14039                         _invlist_subtract(u, i, &rhs);
14040                         SvREFCNT_dec_NN(i);
14041                         SvREFCNT_dec_NN(u);
14042                         SvREFCNT_dec_NN(element);
14043                         break;
14044                     }
14045                 }
14046                 SvREFCNT_dec(lhs);
14047
14048                 /* Here, the higher precedence operation has been done, and the
14049                  * result is in 'rhs'.  We overwrite the stacked operator with
14050                  * the result.  Then we redo this code to either push the new
14051                  * operator onto the stack or perform any higher precedence
14052                  * stacked operation */
14053                 only_to_avoid_leaks = av_pop(stack);
14054                 SvREFCNT_dec(only_to_avoid_leaks);
14055                 av_push(stack, rhs);
14056                 goto redo_curchar;
14057
14058             case '!':   /* Highest priority, right associative */
14059
14060                 /* If what's already at the top of the stack is another '!",
14061                  * they just cancel each other out */
14062                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
14063                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
14064                 {
14065                     only_to_avoid_leaks = av_pop(stack);
14066                     SvREFCNT_dec(only_to_avoid_leaks);
14067                 }
14068                 else { /* Otherwise, since it's right associative, just push
14069                           onto the stack */
14070                     av_push(stack, newSVuv(curchar));
14071                 }
14072                 break;
14073
14074             default:
14075                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14076                 vFAIL("Unexpected character");
14077
14078           handle_operand:
14079
14080             /* Here 'current' is the operand.  If something is already on the
14081              * stack, we have to check if it is a !. */
14082             top_index = av_tindex(stack);   /* Code above may have altered the
14083                                              * stack in the time since we
14084                                              * earlier set 'top_index'. */
14085             if (top_index - fence >= 0) {
14086                 /* If the top entry on the stack is an operator, it had better
14087                  * be a '!', otherwise the entry below the top operand should
14088                  * be an operator */
14089                 top_ptr = av_fetch(stack, top_index, FALSE);
14090                 assert(top_ptr);
14091                 if (IS_OPERATOR(*top_ptr)) {
14092
14093                     /* The only permissible operator at the top of the stack is
14094                      * '!', which is applied immediately to this operand. */
14095                     curchar = (char) SvUV(*top_ptr);
14096                     if (curchar != '!') {
14097                         SvREFCNT_dec(current);
14098                         vFAIL2("Unexpected binary operator '%c' with no "
14099                                 "preceding operand", curchar);
14100                     }
14101
14102                     _invlist_invert(current);
14103
14104                     only_to_avoid_leaks = av_pop(stack);
14105                     SvREFCNT_dec(only_to_avoid_leaks);
14106                     top_index = av_tindex(stack);
14107
14108                     /* And we redo with the inverted operand.  This allows
14109                      * handling multiple ! in a row */
14110                     goto handle_operand;
14111                 }
14112                           /* Single operand is ok only for the non-binary ')'
14113                            * operator */
14114                 else if ((top_index - fence == 0 && curchar != ')')
14115                          || (top_index - fence > 0
14116                              && (! (stacked_ptr = av_fetch(stack,
14117                                                            top_index - 1,
14118                                                            FALSE))
14119                                  || IS_OPERAND(*stacked_ptr))))
14120                 {
14121                     SvREFCNT_dec(current);
14122                     vFAIL("Operand with no preceding operator");
14123                 }
14124             }
14125
14126             /* Here there was nothing on the stack or the top element was
14127              * another operand.  Just add this new one */
14128             av_push(stack, current);
14129
14130         } /* End of switch on next parse token */
14131
14132         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14133     } /* End of loop parsing through the construct */
14134
14135   done:
14136     if (av_tindex(fence_stack) >= 0) {
14137         vFAIL("Unmatched (");
14138     }
14139
14140     if (av_tindex(stack) < 0   /* Was empty */
14141         || ((final = av_pop(stack)) == NULL)
14142         || ! IS_OPERAND(final)
14143         || SvTYPE(final) != SVt_INVLIST
14144         || av_tindex(stack) >= 0)  /* More left on stack */
14145     {
14146       bad_syntax:
14147         SvREFCNT_dec(final);
14148         vFAIL("Incomplete expression within '(?[ ])'");
14149     }
14150
14151     /* Here, 'final' is the resultant inversion list from evaluating the
14152      * expression.  Return it if so requested */
14153     if (return_invlist) {
14154         *return_invlist = final;
14155         return END;
14156     }
14157
14158     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
14159      * expecting a string of ranges and individual code points */
14160     invlist_iterinit(final);
14161     result_string = newSVpvs("");
14162     while (invlist_iternext(final, &start, &end)) {
14163         if (start == end) {
14164             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
14165         }
14166         else {
14167             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
14168                                                      start,          end);
14169         }
14170     }
14171
14172     /* About to generate an ANYOF (or similar) node from the inversion list we
14173      * have calculated */
14174     save_parse = RExC_parse;
14175     RExC_parse = SvPV(result_string, len);
14176     save_end = RExC_end;
14177     RExC_end = RExC_parse + len;
14178
14179     /* We turn off folding around the call, as the class we have constructed
14180      * already has all folding taken into consideration, and we don't want
14181      * regclass() to add to that */
14182     RExC_flags &= ~RXf_PMf_FOLD;
14183     /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
14184      * folds are allowed.  */
14185     node = regclass(pRExC_state, flagp,depth+1,
14186                     FALSE, /* means parse the whole char class */
14187                     FALSE, /* don't allow multi-char folds */
14188                     TRUE, /* silence non-portable warnings.  The above may very
14189                              well have generated non-portable code points, but
14190                              they're valid on this machine */
14191                     FALSE, /* similarly, no need for strict */
14192                     FALSE, /* Require return to be an ANYOF */
14193                     NULL
14194                 );
14195     if (!node)
14196         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
14197                     PTR2UV(flagp));
14198
14199     /* Fix up the node type if we are in locale.  (We have pretended we are
14200      * under /u for the purposes of regclass(), as this construct will only
14201      * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
14202      * as to cause any warnings about bad locales to be output in regexec.c),
14203      * and add the flag that indicates to check if not in a UTF-8 locale.  The
14204      * reason we above forbid optimization into something other than an ANYOF
14205      * node is simply to minimize the number of code changes in regexec.c.
14206      * Otherwise we would have to create new EXACTish node types and deal with
14207      * them.  This decision could be revisited should this construct become
14208      * popular.
14209      *
14210      * (One might think we could look at the resulting ANYOF node and suppress
14211      * the flag if everything is above 255, as those would be UTF-8 only,
14212      * but this isn't true, as the components that led to that result could
14213      * have been locale-affected, and just happen to cancel each other out
14214      * under UTF-8 locales.) */
14215     if (in_locale) {
14216         set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
14217
14218         assert(OP(node) == ANYOF);
14219
14220         OP(node) = ANYOFL;
14221         ANYOF_FLAGS(node)
14222                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
14223     }
14224
14225     if (save_fold) {
14226         RExC_flags |= RXf_PMf_FOLD;
14227     }
14228
14229     RExC_parse = save_parse + 1;
14230     RExC_end = save_end;
14231     SvREFCNT_dec_NN(final);
14232     SvREFCNT_dec_NN(result_string);
14233
14234     nextchar(pRExC_state);
14235     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
14236     return node;
14237 }
14238 #undef IS_OPERATOR
14239 #undef IS_OPERAND
14240
14241 STATIC void
14242 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
14243 {
14244     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
14245      * innocent-looking character class, like /[ks]/i won't have to go out to
14246      * disk to find the possible matches.
14247      *
14248      * This should be called only for a Latin1-range code points, cp, which is
14249      * known to be involved in a simple fold with other code points above
14250      * Latin1.  It would give false results if /aa has been specified.
14251      * Multi-char folds are outside the scope of this, and must be handled
14252      * specially.
14253      *
14254      * XXX It would be better to generate these via regen, in case a new
14255      * version of the Unicode standard adds new mappings, though that is not
14256      * really likely, and may be caught by the default: case of the switch
14257      * below. */
14258
14259     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
14260
14261     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
14262
14263     switch (cp) {
14264         case 'k':
14265         case 'K':
14266           *invlist =
14267              add_cp_to_invlist(*invlist, KELVIN_SIGN);
14268             break;
14269         case 's':
14270         case 'S':
14271           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
14272             break;
14273         case MICRO_SIGN:
14274           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
14275           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
14276             break;
14277         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14278         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14279           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
14280             break;
14281         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14282           *invlist = add_cp_to_invlist(*invlist,
14283                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14284             break;
14285
14286 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
14287
14288         case LATIN_SMALL_LETTER_SHARP_S:
14289           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
14290             break;
14291
14292 #endif
14293
14294 #if    UNICODE_MAJOR_VERSION < 3                                        \
14295    || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
14296
14297         /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
14298          * U+0131.  */
14299         case 'i':
14300         case 'I':
14301           *invlist =
14302              add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
14303 #   if UNICODE_DOT_DOT_VERSION == 1
14304           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
14305 #   endif
14306             break;
14307 #endif
14308
14309         default:
14310             /* Use deprecated warning to increase the chances of this being
14311              * output */
14312             if (PASS2) {
14313                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
14314             }
14315             break;
14316     }
14317 }
14318
14319 STATIC AV *
14320 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
14321 {
14322     /* This adds the string scalar <multi_string> to the array
14323      * <multi_char_matches>.  <multi_string> is known to have exactly
14324      * <cp_count> code points in it.  This is used when constructing a
14325      * bracketed character class and we find something that needs to match more
14326      * than a single character.
14327      *
14328      * <multi_char_matches> is actually an array of arrays.  Each top-level
14329      * element is an array that contains all the strings known so far that are
14330      * the same length.  And that length (in number of code points) is the same
14331      * as the index of the top-level array.  Hence, the [2] element is an
14332      * array, each element thereof is a string containing TWO code points;
14333      * while element [3] is for strings of THREE characters, and so on.  Since
14334      * this is for multi-char strings there can never be a [0] nor [1] element.
14335      *
14336      * When we rewrite the character class below, we will do so such that the
14337      * longest strings are written first, so that it prefers the longest
14338      * matching strings first.  This is done even if it turns out that any
14339      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
14340      * Christiansen has agreed that this is ok.  This makes the test for the
14341      * ligature 'ffi' come before the test for 'ff', for example */
14342
14343     AV* this_array;
14344     AV** this_array_ptr;
14345
14346     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
14347
14348     if (! multi_char_matches) {
14349         multi_char_matches = newAV();
14350     }
14351
14352     if (av_exists(multi_char_matches, cp_count)) {
14353         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
14354         this_array = *this_array_ptr;
14355     }
14356     else {
14357         this_array = newAV();
14358         av_store(multi_char_matches, cp_count,
14359                  (SV*) this_array);
14360     }
14361     av_push(this_array, multi_string);
14362
14363     return multi_char_matches;
14364 }
14365
14366 /* The names of properties whose definitions are not known at compile time are
14367  * stored in this SV, after a constant heading.  So if the length has been
14368  * changed since initialization, then there is a run-time definition. */
14369 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
14370                                         (SvCUR(listsv) != initial_listsv_len)
14371
14372 /* There is a restricted set of white space characters that are legal when
14373  * ignoring white space in a bracketed character class.  This generates the
14374  * code to skip them.
14375  *
14376  * There is a line below that uses the same white space criteria but is outside
14377  * this macro.  Both here and there must use the same definition */
14378 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
14379     STMT_START {                                                        \
14380         if (do_skip) {                                                  \
14381             while (   p < RExC_end                                      \
14382                    && isBLANK_A(UCHARAT(p)))                            \
14383             {                                                           \
14384                 p++;                                                    \
14385             }                                                           \
14386         }                                                               \
14387     } STMT_END
14388
14389 STATIC regnode *
14390 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
14391                  const bool stop_at_1,  /* Just parse the next thing, don't
14392                                            look for a full character class */
14393                  bool allow_multi_folds,
14394                  const bool silence_non_portable,   /* Don't output warnings
14395                                                        about too large
14396                                                        characters */
14397                  const bool strict,
14398                  bool optimizable,                  /* ? Allow a non-ANYOF return
14399                                                        node */
14400                  SV** ret_invlist  /* Return an inversion list, not a node */
14401           )
14402 {
14403     /* parse a bracketed class specification.  Most of these will produce an
14404      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
14405      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
14406      * under /i with multi-character folds: it will be rewritten following the
14407      * paradigm of this example, where the <multi-fold>s are characters which
14408      * fold to multiple character sequences:
14409      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
14410      * gets effectively rewritten as:
14411      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
14412      * reg() gets called (recursively) on the rewritten version, and this
14413      * function will return what it constructs.  (Actually the <multi-fold>s
14414      * aren't physically removed from the [abcdefghi], it's just that they are
14415      * ignored in the recursion by means of a flag:
14416      * <RExC_in_multi_char_class>.)
14417      *
14418      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
14419      * characters, with the corresponding bit set if that character is in the
14420      * list.  For characters above this, a range list or swash is used.  There
14421      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
14422      * determinable at compile time
14423      *
14424      * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
14425      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
14426      * to UTF-8.  This can only happen if ret_invlist is non-NULL.
14427      */
14428
14429     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
14430     IV range = 0;
14431     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
14432     regnode *ret;
14433     STRLEN numlen;
14434     IV namedclass = OOB_NAMEDCLASS;
14435     char *rangebegin = NULL;
14436     bool need_class = 0;
14437     SV *listsv = NULL;
14438     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
14439                                       than just initialized.  */
14440     SV* properties = NULL;    /* Code points that match \p{} \P{} */
14441     SV* posixes = NULL;     /* Code points that match classes like [:word:],
14442                                extended beyond the Latin1 range.  These have to
14443                                be kept separate from other code points for much
14444                                of this function because their handling  is
14445                                different under /i, and for most classes under
14446                                /d as well */
14447     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
14448                                separate for a while from the non-complemented
14449                                versions because of complications with /d
14450                                matching */
14451     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
14452                                   treated more simply than the general case,
14453                                   leading to less compilation and execution
14454                                   work */
14455     UV element_count = 0;   /* Number of distinct elements in the class.
14456                                Optimizations may be possible if this is tiny */
14457     AV * multi_char_matches = NULL; /* Code points that fold to more than one
14458                                        character; used under /i */
14459     UV n;
14460     char * stop_ptr = RExC_end;    /* where to stop parsing */
14461     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
14462                                                    space? */
14463
14464     /* Unicode properties are stored in a swash; this holds the current one
14465      * being parsed.  If this swash is the only above-latin1 component of the
14466      * character class, an optimization is to pass it directly on to the
14467      * execution engine.  Otherwise, it is set to NULL to indicate that there
14468      * are other things in the class that have to be dealt with at execution
14469      * time */
14470     SV* swash = NULL;           /* Code points that match \p{} \P{} */
14471
14472     /* Set if a component of this character class is user-defined; just passed
14473      * on to the engine */
14474     bool has_user_defined_property = FALSE;
14475
14476     /* inversion list of code points this node matches only when the target
14477      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
14478      * /d) */
14479     SV* has_upper_latin1_only_utf8_matches = NULL;
14480
14481     /* Inversion list of code points this node matches regardless of things
14482      * like locale, folding, utf8ness of the target string */
14483     SV* cp_list = NULL;
14484
14485     /* Like cp_list, but code points on this list need to be checked for things
14486      * that fold to/from them under /i */
14487     SV* cp_foldable_list = NULL;
14488
14489     /* Like cp_list, but code points on this list are valid only when the
14490      * runtime locale is UTF-8 */
14491     SV* only_utf8_locale_list = NULL;
14492
14493     /* In a range, if one of the endpoints is non-character-set portable,
14494      * meaning that it hard-codes a code point that may mean a different
14495      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
14496      * mnemonic '\t' which each mean the same character no matter which
14497      * character set the platform is on. */
14498     unsigned int non_portable_endpoint = 0;
14499
14500     /* Is the range unicode? which means on a platform that isn't 1-1 native
14501      * to Unicode (i.e. non-ASCII), each code point in it should be considered
14502      * to be a Unicode value.  */
14503     bool unicode_range = FALSE;
14504     bool invert = FALSE;    /* Is this class to be complemented */
14505
14506     bool warn_super = ALWAYS_WARN_SUPER;
14507
14508     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
14509         case we need to change the emitted regop to an EXACT. */
14510     const char * orig_parse = RExC_parse;
14511     const SSize_t orig_size = RExC_size;
14512     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
14513     GET_RE_DEBUG_FLAGS_DECL;
14514
14515     PERL_ARGS_ASSERT_REGCLASS;
14516 #ifndef DEBUGGING
14517     PERL_UNUSED_ARG(depth);
14518 #endif
14519
14520     DEBUG_PARSE("clas");
14521
14522 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
14523     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
14524                                    && UNICODE_DOT_DOT_VERSION == 0)
14525     allow_multi_folds = FALSE;
14526 #endif
14527
14528     /* Assume we are going to generate an ANYOF node. */
14529     ret = reganode(pRExC_state,
14530                    (LOC)
14531                     ? ANYOFL
14532                     : ANYOF,
14533                    0);
14534
14535     if (SIZE_ONLY) {
14536         RExC_size += ANYOF_SKIP;
14537         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
14538     }
14539     else {
14540         ANYOF_FLAGS(ret) = 0;
14541
14542         RExC_emit += ANYOF_SKIP;
14543         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
14544         initial_listsv_len = SvCUR(listsv);
14545         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
14546     }
14547
14548     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14549
14550     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
14551         RExC_parse++;
14552         invert = TRUE;
14553         allow_multi_folds = FALSE;
14554         MARK_NAUGHTY(1);
14555         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14556     }
14557
14558     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
14559     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
14560         const char *s = RExC_parse;
14561         const char  c = *s++;
14562
14563         if (*s == '^') {
14564             s++;
14565         }
14566         while (isWORDCHAR(*s))
14567             s++;
14568         if (*s && c == *s && s[1] == ']') {
14569             SAVEFREESV(RExC_rx_sv);
14570             ckWARN3reg(s+2,
14571                        "POSIX syntax [%c %c] belongs inside character classes",
14572                        c, c);
14573             (void)ReREFCNT_inc(RExC_rx_sv);
14574         }
14575     }
14576
14577     /* If the caller wants us to just parse a single element, accomplish this
14578      * by faking the loop ending condition */
14579     if (stop_at_1 && RExC_end > RExC_parse) {
14580         stop_ptr = RExC_parse + 1;
14581     }
14582
14583     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
14584     if (UCHARAT(RExC_parse) == ']')
14585         goto charclassloop;
14586
14587     while (1) {
14588         if  (RExC_parse >= stop_ptr) {
14589             break;
14590         }
14591
14592         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14593
14594         if  (UCHARAT(RExC_parse) == ']') {
14595             break;
14596         }
14597
14598       charclassloop:
14599
14600         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
14601         save_value = value;
14602         save_prevvalue = prevvalue;
14603
14604         if (!range) {
14605             rangebegin = RExC_parse;
14606             element_count++;
14607             non_portable_endpoint = 0;
14608         }
14609         if (UTF) {
14610             value = utf8n_to_uvchr((U8*)RExC_parse,
14611                                    RExC_end - RExC_parse,
14612                                    &numlen, UTF8_ALLOW_DEFAULT);
14613             RExC_parse += numlen;
14614         }
14615         else
14616             value = UCHARAT(RExC_parse++);
14617
14618         if (value == '['
14619             && RExC_parse < RExC_end
14620             && POSIXCC(UCHARAT(RExC_parse)))
14621         {
14622             namedclass = regpposixcc(pRExC_state, value, strict);
14623         }
14624         else if (value == '\\') {
14625             /* Is a backslash; get the code point of the char after it */
14626             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
14627                 value = utf8n_to_uvchr((U8*)RExC_parse,
14628                                    RExC_end - RExC_parse,
14629                                    &numlen, UTF8_ALLOW_DEFAULT);
14630                 RExC_parse += numlen;
14631             }
14632             else
14633                 value = UCHARAT(RExC_parse++);
14634
14635             /* Some compilers cannot handle switching on 64-bit integer
14636              * values, therefore value cannot be an UV.  Yes, this will
14637              * be a problem later if we want switch on Unicode.
14638              * A similar issue a little bit later when switching on
14639              * namedclass. --jhi */
14640
14641             /* If the \ is escaping white space when white space is being
14642              * skipped, it means that that white space is wanted literally, and
14643              * is already in 'value'.  Otherwise, need to translate the escape
14644              * into what it signifies. */
14645             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
14646
14647             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
14648             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
14649             case 's':   namedclass = ANYOF_SPACE;       break;
14650             case 'S':   namedclass = ANYOF_NSPACE;      break;
14651             case 'd':   namedclass = ANYOF_DIGIT;       break;
14652             case 'D':   namedclass = ANYOF_NDIGIT;      break;
14653             case 'v':   namedclass = ANYOF_VERTWS;      break;
14654             case 'V':   namedclass = ANYOF_NVERTWS;     break;
14655             case 'h':   namedclass = ANYOF_HORIZWS;     break;
14656             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
14657             case 'N':  /* Handle \N{NAME} in class */
14658                 {
14659                     const char * const backslash_N_beg = RExC_parse - 2;
14660                     int cp_count;
14661
14662                     if (! grok_bslash_N(pRExC_state,
14663                                         NULL,      /* No regnode */
14664                                         &value,    /* Yes single value */
14665                                         &cp_count, /* Multiple code pt count */
14666                                         flagp,
14667                                         depth)
14668                     ) {
14669
14670                         if (*flagp & NEED_UTF8)
14671                             FAIL("panic: grok_bslash_N set NEED_UTF8");
14672                         if (*flagp & RESTART_PASS1)
14673                             return NULL;
14674
14675                         if (cp_count < 0) {
14676                             vFAIL("\\N in a character class must be a named character: \\N{...}");
14677                         }
14678                         else if (cp_count == 0) {
14679                             if (strict) {
14680                                 RExC_parse++;   /* Position after the "}" */
14681                                 vFAIL("Zero length \\N{}");
14682                             }
14683                             else if (PASS2) {
14684                                 ckWARNreg(RExC_parse,
14685                                         "Ignoring zero length \\N{} in character class");
14686                             }
14687                         }
14688                         else { /* cp_count > 1 */
14689                             if (! RExC_in_multi_char_class) {
14690                                 if (invert || range || *RExC_parse == '-') {
14691                                     if (strict) {
14692                                         RExC_parse--;
14693                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
14694                                     }
14695                                     else if (PASS2) {
14696                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
14697                                     }
14698                                     break; /* <value> contains the first code
14699                                               point. Drop out of the switch to
14700                                               process it */
14701                                 }
14702                                 else {
14703                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
14704                                                  RExC_parse - backslash_N_beg);
14705                                     multi_char_matches
14706                                         = add_multi_match(multi_char_matches,
14707                                                           multi_char_N,
14708                                                           cp_count);
14709                                 }
14710                             }
14711                         } /* End of cp_count != 1 */
14712
14713                         /* This element should not be processed further in this
14714                          * class */
14715                         element_count--;
14716                         value = save_value;
14717                         prevvalue = save_prevvalue;
14718                         continue;   /* Back to top of loop to get next char */
14719                     }
14720
14721                     /* Here, is a single code point, and <value> contains it */
14722                     unicode_range = TRUE;   /* \N{} are Unicode */
14723                 }
14724                 break;
14725             case 'p':
14726             case 'P':
14727                 {
14728                 char *e;
14729
14730                 /* We will handle any undefined properties ourselves */
14731                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14732                                        /* And we actually would prefer to get
14733                                         * the straight inversion list of the
14734                                         * swash, since we will be accessing it
14735                                         * anyway, to save a little time */
14736                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14737
14738                 if (RExC_parse >= RExC_end)
14739                     vFAIL2("Empty \\%c{}", (U8)value);
14740                 if (*RExC_parse == '{') {
14741                     const U8 c = (U8)value;
14742                     e = strchr(RExC_parse, '}');
14743                     if (!e) {
14744                         RExC_parse++;
14745                         vFAIL2("Missing right brace on \\%c{}", c);
14746                     }
14747
14748                     RExC_parse++;
14749                     while (isSPACE(*RExC_parse)) {
14750                          RExC_parse++;
14751                     }
14752
14753                     if (UCHARAT(RExC_parse) == '^') {
14754
14755                         /* toggle.  (The rhs xor gets the single bit that
14756                          * differs between P and p; the other xor inverts just
14757                          * that bit) */
14758                         value ^= 'P' ^ 'p';
14759
14760                         RExC_parse++;
14761                         while (isSPACE(*RExC_parse)) {
14762                             RExC_parse++;
14763                         }
14764                     }
14765
14766                     if (e == RExC_parse)
14767                         vFAIL2("Empty \\%c{}", c);
14768
14769                     n = e - RExC_parse;
14770                     while (isSPACE(*(RExC_parse + n - 1)))
14771                         n--;
14772                 }   /* The \p isn't immediately followed by a '{' */
14773                 else if (! isALPHA(*RExC_parse)) {
14774                     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14775                     vFAIL2("Character following \\%c must be '{' or a "
14776                            "single-character Unicode property name",
14777                            (U8) value);
14778                 }
14779                 else {
14780                     e = RExC_parse;
14781                     n = 1;
14782                 }
14783                 if (!SIZE_ONLY) {
14784                     SV* invlist;
14785                     char* name;
14786                     char* base_name;    /* name after any packages are stripped */
14787                     const char * const colon_colon = "::";
14788
14789                     /* Try to get the definition of the property into
14790                      * <invlist>.  If /i is in effect, the effective property
14791                      * will have its name be <__NAME_i>.  The design is
14792                      * discussed in commit
14793                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14794                     name = savepv(Perl_form(aTHX_
14795                                           "%s%.*s%s\n",
14796                                           (FOLD) ? "__" : "",
14797                                           (int)n,
14798                                           RExC_parse,
14799                                           (FOLD) ? "_i" : ""
14800                                 ));
14801
14802                     /* Look up the property name, and get its swash and
14803                      * inversion list, if the property is found  */
14804                     if (swash) {    /* Return any left-overs */
14805                         SvREFCNT_dec_NN(swash);
14806                     }
14807                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
14808                                              1, /* binary */
14809                                              0, /* not tr/// */
14810                                              NULL, /* No inversion list */
14811                                              &swash_init_flags
14812                                             );
14813                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14814                         HV* curpkg = (IN_PERL_COMPILETIME)
14815                                       ? PL_curstash
14816                                       : CopSTASH(PL_curcop);
14817                         UV final_n = n;
14818                         bool has_pkg;
14819
14820                         if (swash) {    /* Got a swash but no inversion list.
14821                                            Something is likely wrong that will
14822                                            be sorted-out later */
14823                             SvREFCNT_dec_NN(swash);
14824                             swash = NULL;
14825                         }
14826
14827                         /* Here didn't find it.  It could be a an error (like a
14828                          * typo) in specifying a Unicode property, or it could
14829                          * be a user-defined property that will be available at
14830                          * run-time.  The names of these must begin with 'In'
14831                          * or 'Is' (after any packages are stripped off).  So
14832                          * if not one of those, or if we accept only
14833                          * compile-time properties, is an error; otherwise add
14834                          * it to the list for run-time look up. */
14835                         if ((base_name = rninstr(name, name + n,
14836                                                  colon_colon, colon_colon + 2)))
14837                         { /* Has ::.  We know this must be a user-defined
14838                              property */
14839                             base_name += 2;
14840                             final_n -= base_name - name;
14841                             has_pkg = TRUE;
14842                         }
14843                         else {
14844                             base_name = name;
14845                             has_pkg = FALSE;
14846                         }
14847
14848                         if (   final_n < 3
14849                             || base_name[0] != 'I'
14850                             || (base_name[1] != 's' && base_name[1] != 'n')
14851                             || ret_invlist)
14852                         {
14853                             const char * const msg
14854                                 = (has_pkg)
14855                                   ? "Illegal user-defined property name"
14856                                   : "Can't find Unicode property definition";
14857                             RExC_parse = e + 1;
14858
14859                             /* diag_listed_as: Can't find Unicode property definition "%s" */
14860                             vFAIL3utf8f("%s \"%"UTF8f"\"",
14861                                 msg, UTF8fARG(UTF, n, name));
14862                         }
14863
14864                         /* If the property name doesn't already have a package
14865                          * name, add the current one to it so that it can be
14866                          * referred to outside it. [perl #121777] */
14867                         if (! has_pkg && curpkg) {
14868                             char* pkgname = HvNAME(curpkg);
14869                             if (strNE(pkgname, "main")) {
14870                                 char* full_name = Perl_form(aTHX_
14871                                                             "%s::%s",
14872                                                             pkgname,
14873                                                             name);
14874                                 n = strlen(full_name);
14875                                 Safefree(name);
14876                                 name = savepvn(full_name, n);
14877                             }
14878                         }
14879                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14880                                         (value == 'p' ? '+' : '!'),
14881                                         UTF8fARG(UTF, n, name));
14882                         has_user_defined_property = TRUE;
14883                         optimizable = FALSE;    /* Will have to leave this an
14884                                                    ANYOF node */
14885
14886                         /* We don't know yet what this matches, so have to flag
14887                          * it */
14888                         ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
14889                     }
14890                     else {
14891
14892                         /* Here, did get the swash and its inversion list.  If
14893                          * the swash is from a user-defined property, then this
14894                          * whole character class should be regarded as such */
14895                         if (swash_init_flags
14896                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14897                         {
14898                             has_user_defined_property = TRUE;
14899                         }
14900                         else if
14901                             /* We warn on matching an above-Unicode code point
14902                              * if the match would return true, except don't
14903                              * warn for \p{All}, which has exactly one element
14904                              * = 0 */
14905                             (_invlist_contains_cp(invlist, 0x110000)
14906                                 && (! (_invlist_len(invlist) == 1
14907                                        && *invlist_array(invlist) == 0)))
14908                         {
14909                             warn_super = TRUE;
14910                         }
14911
14912
14913                         /* Invert if asking for the complement */
14914                         if (value == 'P') {
14915                             _invlist_union_complement_2nd(properties,
14916                                                           invlist,
14917                                                           &properties);
14918
14919                             /* The swash can't be used as-is, because we've
14920                              * inverted things; delay removing it to here after
14921                              * have copied its invlist above */
14922                             SvREFCNT_dec_NN(swash);
14923                             swash = NULL;
14924                         }
14925                         else {
14926                             _invlist_union(properties, invlist, &properties);
14927                         }
14928                     }
14929                     Safefree(name);
14930                 }
14931                 RExC_parse = e + 1;
14932                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
14933                                                 named */
14934
14935                 /* \p means they want Unicode semantics */
14936                 REQUIRE_UNI_RULES(flagp, NULL);
14937                 }
14938                 break;
14939             case 'n':   value = '\n';                   break;
14940             case 'r':   value = '\r';                   break;
14941             case 't':   value = '\t';                   break;
14942             case 'f':   value = '\f';                   break;
14943             case 'b':   value = '\b';                   break;
14944             case 'e':   value = ESC_NATIVE;             break;
14945             case 'a':   value = '\a';                   break;
14946             case 'o':
14947                 RExC_parse--;   /* function expects to be pointed at the 'o' */
14948                 {
14949                     const char* error_msg;
14950                     bool valid = grok_bslash_o(&RExC_parse,
14951                                                &value,
14952                                                &error_msg,
14953                                                PASS2,   /* warnings only in
14954                                                            pass 2 */
14955                                                strict,
14956                                                silence_non_portable,
14957                                                UTF);
14958                     if (! valid) {
14959                         vFAIL(error_msg);
14960                     }
14961                 }
14962                 non_portable_endpoint++;
14963                 if (IN_ENCODING && value < 0x100) {
14964                     goto recode_encoding;
14965                 }
14966                 break;
14967             case 'x':
14968                 RExC_parse--;   /* function expects to be pointed at the 'x' */
14969                 {
14970                     const char* error_msg;
14971                     bool valid = grok_bslash_x(&RExC_parse,
14972                                                &value,
14973                                                &error_msg,
14974                                                PASS2, /* Output warnings */
14975                                                strict,
14976                                                silence_non_portable,
14977                                                UTF);
14978                     if (! valid) {
14979                         vFAIL(error_msg);
14980                     }
14981                 }
14982                 non_portable_endpoint++;
14983                 if (IN_ENCODING && value < 0x100)
14984                     goto recode_encoding;
14985                 break;
14986             case 'c':
14987                 value = grok_bslash_c(*RExC_parse++, PASS2);
14988                 non_portable_endpoint++;
14989                 break;
14990             case '0': case '1': case '2': case '3': case '4':
14991             case '5': case '6': case '7':
14992                 {
14993                     /* Take 1-3 octal digits */
14994                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14995                     numlen = (strict) ? 4 : 3;
14996                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14997                     RExC_parse += numlen;
14998                     if (numlen != 3) {
14999                         if (strict) {
15000                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15001                             vFAIL("Need exactly 3 octal digits");
15002                         }
15003                         else if (! SIZE_ONLY /* like \08, \178 */
15004                                  && numlen < 3
15005                                  && RExC_parse < RExC_end
15006                                  && isDIGIT(*RExC_parse)
15007                                  && ckWARN(WARN_REGEXP))
15008                         {
15009                             SAVEFREESV(RExC_rx_sv);
15010                             reg_warn_non_literal_string(
15011                                  RExC_parse + 1,
15012                                  form_short_octal_warning(RExC_parse, numlen));
15013                             (void)ReREFCNT_inc(RExC_rx_sv);
15014                         }
15015                     }
15016                     non_portable_endpoint++;
15017                     if (IN_ENCODING && value < 0x100)
15018                         goto recode_encoding;
15019                     break;
15020                 }
15021               recode_encoding:
15022                 if (! RExC_override_recoding) {
15023                     SV* enc = _get_encoding();
15024                     value = reg_recode((U8)value, &enc);
15025                     if (!enc) {
15026                         if (strict) {
15027                             vFAIL("Invalid escape in the specified encoding");
15028                         }
15029                         else if (PASS2) {
15030                             ckWARNreg(RExC_parse,
15031                                   "Invalid escape in the specified encoding");
15032                         }
15033                     }
15034                     break;
15035                 }
15036             default:
15037                 /* Allow \_ to not give an error */
15038                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
15039                     if (strict) {
15040                         vFAIL2("Unrecognized escape \\%c in character class",
15041                                (int)value);
15042                     }
15043                     else {
15044                         SAVEFREESV(RExC_rx_sv);
15045                         ckWARN2reg(RExC_parse,
15046                             "Unrecognized escape \\%c in character class passed through",
15047                             (int)value);
15048                         (void)ReREFCNT_inc(RExC_rx_sv);
15049                     }
15050                 }
15051                 break;
15052             }   /* End of switch on char following backslash */
15053         } /* end of handling backslash escape sequences */
15054
15055         /* Here, we have the current token in 'value' */
15056
15057         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
15058             U8 classnum;
15059
15060             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
15061              * literal, as is the character that began the false range, i.e.
15062              * the 'a' in the examples */
15063             if (range) {
15064                 if (!SIZE_ONLY) {
15065                     const int w = (RExC_parse >= rangebegin)
15066                                   ? RExC_parse - rangebegin
15067                                   : 0;
15068                     if (strict) {
15069                         vFAIL2utf8f(
15070                             "False [] range \"%"UTF8f"\"",
15071                             UTF8fARG(UTF, w, rangebegin));
15072                     }
15073                     else {
15074                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
15075                         ckWARN2reg(RExC_parse,
15076                             "False [] range \"%"UTF8f"\"",
15077                             UTF8fARG(UTF, w, rangebegin));
15078                         (void)ReREFCNT_inc(RExC_rx_sv);
15079                         cp_list = add_cp_to_invlist(cp_list, '-');
15080                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
15081                                                              prevvalue);
15082                     }
15083                 }
15084
15085                 range = 0; /* this was not a true range */
15086                 element_count += 2; /* So counts for three values */
15087             }
15088
15089             classnum = namedclass_to_classnum(namedclass);
15090
15091             if (LOC && namedclass < ANYOF_POSIXL_MAX
15092 #ifndef HAS_ISASCII
15093                 && classnum != _CC_ASCII
15094 #endif
15095             ) {
15096                 /* What the Posix classes (like \w, [:space:]) match in locale
15097                  * isn't knowable under locale until actual match time.  Room
15098                  * must be reserved (one time per outer bracketed class) to
15099                  * store such classes.  The space will contain a bit for each
15100                  * named class that is to be matched against.  This isn't
15101                  * needed for \p{} and pseudo-classes, as they are not affected
15102                  * by locale, and hence are dealt with separately */
15103                 if (! need_class) {
15104                     need_class = 1;
15105                     if (SIZE_ONLY) {
15106                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
15107                     }
15108                     else {
15109                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
15110                     }
15111                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
15112                     ANYOF_POSIXL_ZERO(ret);
15113
15114                     /* We can't change this into some other type of node
15115                      * (unless this is the only element, in which case there
15116                      * are nodes that mean exactly this) as has runtime
15117                      * dependencies */
15118                     optimizable = FALSE;
15119                 }
15120
15121                 /* Coverity thinks it is possible for this to be negative; both
15122                  * jhi and khw think it's not, but be safer */
15123                 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
15124                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
15125
15126                 /* See if it already matches the complement of this POSIX
15127                  * class */
15128                 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
15129                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
15130                                                             ? -1
15131                                                             : 1)))
15132                 {
15133                     posixl_matches_all = TRUE;
15134                     break;  /* No need to continue.  Since it matches both
15135                                e.g., \w and \W, it matches everything, and the
15136                                bracketed class can be optimized into qr/./s */
15137                 }
15138
15139                 /* Add this class to those that should be checked at runtime */
15140                 ANYOF_POSIXL_SET(ret, namedclass);
15141
15142                 /* The above-Latin1 characters are not subject to locale rules.
15143                  * Just add them, in the second pass, to the
15144                  * unconditionally-matched list */
15145                 if (! SIZE_ONLY) {
15146                     SV* scratch_list = NULL;
15147
15148                     /* Get the list of the above-Latin1 code points this
15149                      * matches */
15150                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
15151                                           PL_XPosix_ptrs[classnum],
15152
15153                                           /* Odd numbers are complements, like
15154                                            * NDIGIT, NASCII, ... */
15155                                           namedclass % 2 != 0,
15156                                           &scratch_list);
15157                     /* Checking if 'cp_list' is NULL first saves an extra
15158                      * clone.  Its reference count will be decremented at the
15159                      * next union, etc, or if this is the only instance, at the
15160                      * end of the routine */
15161                     if (! cp_list) {
15162                         cp_list = scratch_list;
15163                     }
15164                     else {
15165                         _invlist_union(cp_list, scratch_list, &cp_list);
15166                         SvREFCNT_dec_NN(scratch_list);
15167                     }
15168                     continue;   /* Go get next character */
15169                 }
15170             }
15171             else if (! SIZE_ONLY) {
15172
15173                 /* Here, not in pass1 (in that pass we skip calculating the
15174                  * contents of this class), and is /l, or is a POSIX class for
15175                  * which /l doesn't matter (or is a Unicode property, which is
15176                  * skipped here). */
15177                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
15178                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
15179
15180                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
15181                          * nor /l make a difference in what these match,
15182                          * therefore we just add what they match to cp_list. */
15183                         if (classnum != _CC_VERTSPACE) {
15184                             assert(   namedclass == ANYOF_HORIZWS
15185                                    || namedclass == ANYOF_NHORIZWS);
15186
15187                             /* It turns out that \h is just a synonym for
15188                              * XPosixBlank */
15189                             classnum = _CC_BLANK;
15190                         }
15191
15192                         _invlist_union_maybe_complement_2nd(
15193                                 cp_list,
15194                                 PL_XPosix_ptrs[classnum],
15195                                 namedclass % 2 != 0,    /* Complement if odd
15196                                                           (NHORIZWS, NVERTWS)
15197                                                         */
15198                                 &cp_list);
15199                     }
15200                 }
15201                 else if (UNI_SEMANTICS
15202                         || classnum == _CC_ASCII
15203                         || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
15204                                                   || classnum == _CC_XDIGIT)))
15205                 {
15206                     /* We usually have to worry about /d and /a affecting what
15207                      * POSIX classes match, with special code needed for /d
15208                      * because we won't know until runtime what all matches.
15209                      * But there is no extra work needed under /u, and
15210                      * [:ascii:] is unaffected by /a and /d; and :digit: and
15211                      * :xdigit: don't have runtime differences under /d.  So we
15212                      * can special case these, and avoid some extra work below,
15213                      * and at runtime. */
15214                     _invlist_union_maybe_complement_2nd(
15215                                                      simple_posixes,
15216                                                      PL_XPosix_ptrs[classnum],
15217                                                      namedclass % 2 != 0,
15218                                                      &simple_posixes);
15219                 }
15220                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
15221                            complement and use nposixes */
15222                     SV** posixes_ptr = namedclass % 2 == 0
15223                                        ? &posixes
15224                                        : &nposixes;
15225                     _invlist_union_maybe_complement_2nd(
15226                                                      *posixes_ptr,
15227                                                      PL_XPosix_ptrs[classnum],
15228                                                      namedclass % 2 != 0,
15229                                                      posixes_ptr);
15230                 }
15231             }
15232         } /* end of namedclass \blah */
15233
15234         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15235
15236         /* If 'range' is set, 'value' is the ending of a range--check its
15237          * validity.  (If value isn't a single code point in the case of a
15238          * range, we should have figured that out above in the code that
15239          * catches false ranges).  Later, we will handle each individual code
15240          * point in the range.  If 'range' isn't set, this could be the
15241          * beginning of a range, so check for that by looking ahead to see if
15242          * the next real character to be processed is the range indicator--the
15243          * minus sign */
15244
15245         if (range) {
15246 #ifdef EBCDIC
15247             /* For unicode ranges, we have to test that the Unicode as opposed
15248              * to the native values are not decreasing.  (Above 255, there is
15249              * no difference between native and Unicode) */
15250             if (unicode_range && prevvalue < 255 && value < 255) {
15251                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
15252                     goto backwards_range;
15253                 }
15254             }
15255             else
15256 #endif
15257             if (prevvalue > value) /* b-a */ {
15258                 int w;
15259 #ifdef EBCDIC
15260               backwards_range:
15261 #endif
15262                 w = RExC_parse - rangebegin;
15263                 vFAIL2utf8f(
15264                     "Invalid [] range \"%"UTF8f"\"",
15265                     UTF8fARG(UTF, w, rangebegin));
15266                 NOT_REACHED; /* NOTREACHED */
15267             }
15268         }
15269         else {
15270             prevvalue = value; /* save the beginning of the potential range */
15271             if (! stop_at_1     /* Can't be a range if parsing just one thing */
15272                 && *RExC_parse == '-')
15273             {
15274                 char* next_char_ptr = RExC_parse + 1;
15275
15276                 /* Get the next real char after the '-' */
15277                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
15278
15279                 /* If the '-' is at the end of the class (just before the ']',
15280                  * it is a literal minus; otherwise it is a range */
15281                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
15282                     RExC_parse = next_char_ptr;
15283
15284                     /* a bad range like \w-, [:word:]- ? */
15285                     if (namedclass > OOB_NAMEDCLASS) {
15286                         if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
15287                             const int w = RExC_parse >= rangebegin
15288                                           ?  RExC_parse - rangebegin
15289                                           : 0;
15290                             if (strict) {
15291                                 vFAIL4("False [] range \"%*.*s\"",
15292                                     w, w, rangebegin);
15293                             }
15294                             else if (PASS2) {
15295                                 vWARN4(RExC_parse,
15296                                     "False [] range \"%*.*s\"",
15297                                     w, w, rangebegin);
15298                             }
15299                         }
15300                         if (!SIZE_ONLY) {
15301                             cp_list = add_cp_to_invlist(cp_list, '-');
15302                         }
15303                         element_count++;
15304                     } else
15305                         range = 1;      /* yeah, it's a range! */
15306                     continue;   /* but do it the next time */
15307                 }
15308             }
15309         }
15310
15311         if (namedclass > OOB_NAMEDCLASS) {
15312             continue;
15313         }
15314
15315         /* Here, we have a single value this time through the loop, and
15316          * <prevvalue> is the beginning of the range, if any; or <value> if
15317          * not. */
15318
15319         /* non-Latin1 code point implies unicode semantics.  Must be set in
15320          * pass1 so is there for the whole of pass 2 */
15321         if (value > 255) {
15322             REQUIRE_UNI_RULES(flagp, NULL);
15323         }
15324
15325         /* Ready to process either the single value, or the completed range.
15326          * For single-valued non-inverted ranges, we consider the possibility
15327          * of multi-char folds.  (We made a conscious decision to not do this
15328          * for the other cases because it can often lead to non-intuitive
15329          * results.  For example, you have the peculiar case that:
15330          *  "s s" =~ /^[^\xDF]+$/i => Y
15331          *  "ss"  =~ /^[^\xDF]+$/i => N
15332          *
15333          * See [perl #89750] */
15334         if (FOLD && allow_multi_folds && value == prevvalue) {
15335             if (value == LATIN_SMALL_LETTER_SHARP_S
15336                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
15337                                                         value)))
15338             {
15339                 /* Here <value> is indeed a multi-char fold.  Get what it is */
15340
15341                 U8 foldbuf[UTF8_MAXBYTES_CASE];
15342                 STRLEN foldlen;
15343
15344                 UV folded = _to_uni_fold_flags(
15345                                 value,
15346                                 foldbuf,
15347                                 &foldlen,
15348                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
15349                                                    ? FOLD_FLAGS_NOMIX_ASCII
15350                                                    : 0)
15351                                 );
15352
15353                 /* Here, <folded> should be the first character of the
15354                  * multi-char fold of <value>, with <foldbuf> containing the
15355                  * whole thing.  But, if this fold is not allowed (because of
15356                  * the flags), <fold> will be the same as <value>, and should
15357                  * be processed like any other character, so skip the special
15358                  * handling */
15359                 if (folded != value) {
15360
15361                     /* Skip if we are recursed, currently parsing the class
15362                      * again.  Otherwise add this character to the list of
15363                      * multi-char folds. */
15364                     if (! RExC_in_multi_char_class) {
15365                         STRLEN cp_count = utf8_length(foldbuf,
15366                                                       foldbuf + foldlen);
15367                         SV* multi_fold = sv_2mortal(newSVpvs(""));
15368
15369                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
15370
15371                         multi_char_matches
15372                                         = add_multi_match(multi_char_matches,
15373                                                           multi_fold,
15374                                                           cp_count);
15375
15376                     }
15377
15378                     /* This element should not be processed further in this
15379                      * class */
15380                     element_count--;
15381                     value = save_value;
15382                     prevvalue = save_prevvalue;
15383                     continue;
15384                 }
15385             }
15386         }
15387
15388         if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
15389             if (range) {
15390
15391                 /* If the range starts above 255, everything is portable and
15392                  * likely to be so for any forseeable character set, so don't
15393                  * warn. */
15394                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
15395                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
15396                 }
15397                 else if (prevvalue != value) {
15398
15399                     /* Under strict, ranges that stop and/or end in an ASCII
15400                      * printable should have each end point be a portable value
15401                      * for it (preferably like 'A', but we don't warn if it is
15402                      * a (portable) Unicode name or code point), and the range
15403                      * must be be all digits or all letters of the same case.
15404                      * Otherwise, the range is non-portable and unclear as to
15405                      * what it contains */
15406                     if ((isPRINT_A(prevvalue) || isPRINT_A(value))
15407                         && (non_portable_endpoint
15408                             || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
15409                                    || (isLOWER_A(prevvalue) && isLOWER_A(value))
15410                                    || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
15411                     {
15412                         vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
15413                     }
15414                     else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
15415
15416                         /* But the nature of Unicode and languages mean we
15417                          * can't do the same checks for above-ASCII ranges,
15418                          * except in the case of digit ones.  These should
15419                          * contain only digits from the same group of 10.  The
15420                          * ASCII case is handled just above.  0x660 is the
15421                          * first digit character beyond ASCII.  Hence here, the
15422                          * range could be a range of digits.  Find out.  */
15423                         IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15424                                                          prevvalue);
15425                         IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15426                                                          value);
15427
15428                         /* If the range start and final points are in the same
15429                          * inversion list element, it means that either both
15430                          * are not digits, or both are digits in a consecutive
15431                          * sequence of digits.  (So far, Unicode has kept all
15432                          * such sequences as distinct groups of 10, but assert
15433                          * to make sure).  If the end points are not in the
15434                          * same element, neither should be a digit. */
15435                         if (index_start == index_final) {
15436                             assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
15437                             || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15438                                - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15439                                == 10)
15440                                /* But actually Unicode did have one group of 11
15441                                 * 'digits' in 5.2, so in case we are operating
15442                                 * on that version, let that pass */
15443                             || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15444                                - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15445                                 == 11
15446                                && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15447                                 == 0x19D0)
15448                             );
15449                         }
15450                         else if ((index_start >= 0
15451                                   && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
15452                                  || (index_final >= 0
15453                                      && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
15454                         {
15455                             vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
15456                         }
15457                     }
15458                 }
15459             }
15460             if ((! range || prevvalue == value) && non_portable_endpoint) {
15461                 if (isPRINT_A(value)) {
15462                     char literal[3];
15463                     unsigned d = 0;
15464                     if (isBACKSLASHED_PUNCT(value)) {
15465                         literal[d++] = '\\';
15466                     }
15467                     literal[d++] = (char) value;
15468                     literal[d++] = '\0';
15469
15470                     vWARN4(RExC_parse,
15471                            "\"%.*s\" is more clearly written simply as \"%s\"",
15472                            (int) (RExC_parse - rangebegin),
15473                            rangebegin,
15474                            literal
15475                         );
15476                 }
15477                 else if isMNEMONIC_CNTRL(value) {
15478                     vWARN4(RExC_parse,
15479                            "\"%.*s\" is more clearly written simply as \"%s\"",
15480                            (int) (RExC_parse - rangebegin),
15481                            rangebegin,
15482                            cntrl_to_mnemonic((char) value)
15483                         );
15484                 }
15485             }
15486         }
15487
15488         /* Deal with this element of the class */
15489         if (! SIZE_ONLY) {
15490
15491 #ifndef EBCDIC
15492             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15493                                                      prevvalue, value);
15494 #else
15495             /* On non-ASCII platforms, for ranges that span all of 0..255, and
15496              * ones that don't require special handling, we can just add the
15497              * range like we do for ASCII platforms */
15498             if ((UNLIKELY(prevvalue == 0) && value >= 255)
15499                 || ! (prevvalue < 256
15500                       && (unicode_range
15501                           || (! non_portable_endpoint
15502                               && ((isLOWER_A(prevvalue) && isLOWER_A(value))
15503                                   || (isUPPER_A(prevvalue)
15504                                       && isUPPER_A(value)))))))
15505             {
15506                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15507                                                          prevvalue, value);
15508             }
15509             else {
15510                 /* Here, requires special handling.  This can be because it is
15511                  * a range whose code points are considered to be Unicode, and
15512                  * so must be individually translated into native, or because
15513                  * its a subrange of 'A-Z' or 'a-z' which each aren't
15514                  * contiguous in EBCDIC, but we have defined them to include
15515                  * only the "expected" upper or lower case ASCII alphabetics.
15516                  * Subranges above 255 are the same in native and Unicode, so
15517                  * can be added as a range */
15518                 U8 start = NATIVE_TO_LATIN1(prevvalue);
15519                 unsigned j;
15520                 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
15521                 for (j = start; j <= end; j++) {
15522                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
15523                 }
15524                 if (value > 255) {
15525                     cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15526                                                              256, value);
15527                 }
15528             }
15529 #endif
15530         }
15531
15532         range = 0; /* this range (if it was one) is done now */
15533     } /* End of loop through all the text within the brackets */
15534
15535     /* If anything in the class expands to more than one character, we have to
15536      * deal with them by building up a substitute parse string, and recursively
15537      * calling reg() on it, instead of proceeding */
15538     if (multi_char_matches) {
15539         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
15540         I32 cp_count;
15541         STRLEN len;
15542         char *save_end = RExC_end;
15543         char *save_parse = RExC_parse;
15544         char *save_start = RExC_start;
15545         STRLEN prefix_end = 0;      /* We copy the character class after a
15546                                        prefix supplied here.  This is the size
15547                                        + 1 of that prefix */
15548         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
15549                                        a "|" */
15550         I32 reg_flags;
15551
15552         assert(! invert);
15553         assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
15554
15555 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
15556            because too confusing */
15557         if (invert) {
15558             sv_catpv(substitute_parse, "(?:");
15559         }
15560 #endif
15561
15562         /* Look at the longest folds first */
15563         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
15564
15565             if (av_exists(multi_char_matches, cp_count)) {
15566                 AV** this_array_ptr;
15567                 SV* this_sequence;
15568
15569                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
15570                                                  cp_count, FALSE);
15571                 while ((this_sequence = av_pop(*this_array_ptr)) !=
15572                                                                 &PL_sv_undef)
15573                 {
15574                     if (! first_time) {
15575                         sv_catpv(substitute_parse, "|");
15576                     }
15577                     first_time = FALSE;
15578
15579                     sv_catpv(substitute_parse, SvPVX(this_sequence));
15580                 }
15581             }
15582         }
15583
15584         /* If the character class contains anything else besides these
15585          * multi-character folds, have to include it in recursive parsing */
15586         if (element_count) {
15587             sv_catpv(substitute_parse, "|[");
15588             prefix_end = SvCUR(substitute_parse);
15589             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
15590             sv_catpv(substitute_parse, "]");
15591         }
15592
15593         sv_catpv(substitute_parse, ")");
15594 #if 0
15595         if (invert) {
15596             /* This is a way to get the parse to skip forward a whole named
15597              * sequence instead of matching the 2nd character when it fails the
15598              * first */
15599             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
15600         }
15601 #endif
15602
15603         /* Set up the data structure so that any errors will be properly
15604          * reported.  See the comments at the definition of
15605          * REPORT_LOCATION_ARGS for details */
15606         RExC_precomp_adj = orig_parse - RExC_precomp;
15607         RExC_start =  RExC_parse = SvPV(substitute_parse, len);
15608         RExC_adjusted_start = RExC_start + prefix_end;
15609         RExC_end = RExC_parse + len;
15610         RExC_in_multi_char_class = 1;
15611         RExC_override_recoding = 1;
15612         RExC_emit = (regnode *)orig_emit;
15613
15614         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
15615
15616         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
15617
15618         /* And restore so can parse the rest of the pattern */
15619         RExC_parse = save_parse;
15620         RExC_start = RExC_adjusted_start = save_start;
15621         RExC_precomp_adj = 0;
15622         RExC_end = save_end;
15623         RExC_in_multi_char_class = 0;
15624         RExC_override_recoding = 0;
15625         SvREFCNT_dec_NN(multi_char_matches);
15626         return ret;
15627     }
15628
15629     /* Here, we've gone through the entire class and dealt with multi-char
15630      * folds.  We are now in a position that we can do some checks to see if we
15631      * can optimize this ANYOF node into a simpler one, even in Pass 1.
15632      * Currently we only do two checks:
15633      * 1) is in the unlikely event that the user has specified both, eg. \w and
15634      *    \W under /l, then the class matches everything.  (This optimization
15635      *    is done only to make the optimizer code run later work.)
15636      * 2) if the character class contains only a single element (including a
15637      *    single range), we see if there is an equivalent node for it.
15638      * Other checks are possible */
15639     if (   optimizable
15640         && ! ret_invlist   /* Can't optimize if returning the constructed
15641                               inversion list */
15642         && (UNLIKELY(posixl_matches_all) || element_count == 1))
15643     {
15644         U8 op = END;
15645         U8 arg = 0;
15646
15647         if (UNLIKELY(posixl_matches_all)) {
15648             op = SANY;
15649         }
15650         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
15651                                                    \w or [:digit:] or \p{foo}
15652                                                  */
15653
15654             /* All named classes are mapped into POSIXish nodes, with its FLAG
15655              * argument giving which class it is */
15656             switch ((I32)namedclass) {
15657                 case ANYOF_UNIPROP:
15658                     break;
15659
15660                 /* These don't depend on the charset modifiers.  They always
15661                  * match under /u rules */
15662                 case ANYOF_NHORIZWS:
15663                 case ANYOF_HORIZWS:
15664                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
15665                     /* FALLTHROUGH */
15666
15667                 case ANYOF_NVERTWS:
15668                 case ANYOF_VERTWS:
15669                     op = POSIXU;
15670                     goto join_posix;
15671
15672                 /* The actual POSIXish node for all the rest depends on the
15673                  * charset modifier.  The ones in the first set depend only on
15674                  * ASCII or, if available on this platform, also locale */
15675                 case ANYOF_ASCII:
15676                 case ANYOF_NASCII:
15677 #ifdef HAS_ISASCII
15678                     op = (LOC) ? POSIXL : POSIXA;
15679 #else
15680                     op = POSIXA;
15681 #endif
15682                     goto join_posix;
15683
15684                 /* The following don't have any matches in the upper Latin1
15685                  * range, hence /d is equivalent to /u for them.  Making it /u
15686                  * saves some branches at runtime */
15687                 case ANYOF_DIGIT:
15688                 case ANYOF_NDIGIT:
15689                 case ANYOF_XDIGIT:
15690                 case ANYOF_NXDIGIT:
15691                     if (! DEPENDS_SEMANTICS) {
15692                         goto treat_as_default;
15693                     }
15694
15695                     op = POSIXU;
15696                     goto join_posix;
15697
15698                 /* The following change to CASED under /i */
15699                 case ANYOF_LOWER:
15700                 case ANYOF_NLOWER:
15701                 case ANYOF_UPPER:
15702                 case ANYOF_NUPPER:
15703                     if (FOLD) {
15704                         namedclass = ANYOF_CASED + (namedclass % 2);
15705                     }
15706                     /* FALLTHROUGH */
15707
15708                 /* The rest have more possibilities depending on the charset.
15709                  * We take advantage of the enum ordering of the charset
15710                  * modifiers to get the exact node type, */
15711                 default:
15712                   treat_as_default:
15713                     op = POSIXD + get_regex_charset(RExC_flags);
15714                     if (op > POSIXA) { /* /aa is same as /a */
15715                         op = POSIXA;
15716                     }
15717
15718                   join_posix:
15719                     /* The odd numbered ones are the complements of the
15720                      * next-lower even number one */
15721                     if (namedclass % 2 == 1) {
15722                         invert = ! invert;
15723                         namedclass--;
15724                     }
15725                     arg = namedclass_to_classnum(namedclass);
15726                     break;
15727             }
15728         }
15729         else if (value == prevvalue) {
15730
15731             /* Here, the class consists of just a single code point */
15732
15733             if (invert) {
15734                 if (! LOC && value == '\n') {
15735                     op = REG_ANY; /* Optimize [^\n] */
15736                     *flagp |= HASWIDTH|SIMPLE;
15737                     MARK_NAUGHTY(1);
15738                 }
15739             }
15740             else if (value < 256 || UTF) {
15741
15742                 /* Optimize a single value into an EXACTish node, but not if it
15743                  * would require converting the pattern to UTF-8. */
15744                 op = compute_EXACTish(pRExC_state);
15745             }
15746         } /* Otherwise is a range */
15747         else if (! LOC) {   /* locale could vary these */
15748             if (prevvalue == '0') {
15749                 if (value == '9') {
15750                     arg = _CC_DIGIT;
15751                     op = POSIXA;
15752                 }
15753             }
15754             else if (! FOLD || ASCII_FOLD_RESTRICTED) {
15755                 /* We can optimize A-Z or a-z, but not if they could match
15756                  * something like the KELVIN SIGN under /i. */
15757                 if (prevvalue == 'A') {
15758                     if (value == 'Z'
15759 #ifdef EBCDIC
15760                         && ! non_portable_endpoint
15761 #endif
15762                     ) {
15763                         arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
15764                         op = POSIXA;
15765                     }
15766                 }
15767                 else if (prevvalue == 'a') {
15768                     if (value == 'z'
15769 #ifdef EBCDIC
15770                         && ! non_portable_endpoint
15771 #endif
15772                     ) {
15773                         arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
15774                         op = POSIXA;
15775                     }
15776                 }
15777             }
15778         }
15779
15780         /* Here, we have changed <op> away from its initial value iff we found
15781          * an optimization */
15782         if (op != END) {
15783
15784             /* Throw away this ANYOF regnode, and emit the calculated one,
15785              * which should correspond to the beginning, not current, state of
15786              * the parse */
15787             const char * cur_parse = RExC_parse;
15788             RExC_parse = (char *)orig_parse;
15789             if ( SIZE_ONLY) {
15790                 if (! LOC) {
15791
15792                     /* To get locale nodes to not use the full ANYOF size would
15793                      * require moving the code above that writes the portions
15794                      * of it that aren't in other nodes to after this point.
15795                      * e.g.  ANYOF_POSIXL_SET */
15796                     RExC_size = orig_size;
15797                 }
15798             }
15799             else {
15800                 RExC_emit = (regnode *)orig_emit;
15801                 if (PL_regkind[op] == POSIXD) {
15802                     if (op == POSIXL) {
15803                         RExC_contains_locale = 1;
15804                     }
15805                     if (invert) {
15806                         op += NPOSIXD - POSIXD;
15807                     }
15808                 }
15809             }
15810
15811             ret = reg_node(pRExC_state, op);
15812
15813             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
15814                 if (! SIZE_ONLY) {
15815                     FLAGS(ret) = arg;
15816                 }
15817                 *flagp |= HASWIDTH|SIMPLE;
15818             }
15819             else if (PL_regkind[op] == EXACT) {
15820                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15821                                            TRUE /* downgradable to EXACT */
15822                                            );
15823             }
15824
15825             RExC_parse = (char *) cur_parse;
15826
15827             SvREFCNT_dec(posixes);
15828             SvREFCNT_dec(nposixes);
15829             SvREFCNT_dec(simple_posixes);
15830             SvREFCNT_dec(cp_list);
15831             SvREFCNT_dec(cp_foldable_list);
15832             return ret;
15833         }
15834     }
15835
15836     if (SIZE_ONLY)
15837         return ret;
15838     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
15839
15840     /* If folding, we calculate all characters that could fold to or from the
15841      * ones already on the list */
15842     if (cp_foldable_list) {
15843         if (FOLD) {
15844             UV start, end;      /* End points of code point ranges */
15845
15846             SV* fold_intersection = NULL;
15847             SV** use_list;
15848
15849             /* Our calculated list will be for Unicode rules.  For locale
15850              * matching, we have to keep a separate list that is consulted at
15851              * runtime only when the locale indicates Unicode rules.  For
15852              * non-locale, we just use the general list */
15853             if (LOC) {
15854                 use_list = &only_utf8_locale_list;
15855             }
15856             else {
15857                 use_list = &cp_list;
15858             }
15859
15860             /* Only the characters in this class that participate in folds need
15861              * be checked.  Get the intersection of this class and all the
15862              * possible characters that are foldable.  This can quickly narrow
15863              * down a large class */
15864             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
15865                                   &fold_intersection);
15866
15867             /* The folds for all the Latin1 characters are hard-coded into this
15868              * program, but we have to go out to disk to get the others. */
15869             if (invlist_highest(cp_foldable_list) >= 256) {
15870
15871                 /* This is a hash that for a particular fold gives all
15872                  * characters that are involved in it */
15873                 if (! PL_utf8_foldclosures) {
15874                     _load_PL_utf8_foldclosures();
15875                 }
15876             }
15877
15878             /* Now look at the foldable characters in this class individually */
15879             invlist_iterinit(fold_intersection);
15880             while (invlist_iternext(fold_intersection, &start, &end)) {
15881                 UV j;
15882
15883                 /* Look at every character in the range */
15884                 for (j = start; j <= end; j++) {
15885                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
15886                     STRLEN foldlen;
15887                     SV** listp;
15888
15889                     if (j < 256) {
15890
15891                         if (IS_IN_SOME_FOLD_L1(j)) {
15892
15893                             /* ASCII is always matched; non-ASCII is matched
15894                              * only under Unicode rules (which could happen
15895                              * under /l if the locale is a UTF-8 one */
15896                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
15897                                 *use_list = add_cp_to_invlist(*use_list,
15898                                                             PL_fold_latin1[j]);
15899                             }
15900                             else {
15901                                 has_upper_latin1_only_utf8_matches
15902                                     = add_cp_to_invlist(
15903                                             has_upper_latin1_only_utf8_matches,
15904                                             PL_fold_latin1[j]);
15905                             }
15906                         }
15907
15908                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
15909                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
15910                         {
15911                             add_above_Latin1_folds(pRExC_state,
15912                                                    (U8) j,
15913                                                    use_list);
15914                         }
15915                         continue;
15916                     }
15917
15918                     /* Here is an above Latin1 character.  We don't have the
15919                      * rules hard-coded for it.  First, get its fold.  This is
15920                      * the simple fold, as the multi-character folds have been
15921                      * handled earlier and separated out */
15922                     _to_uni_fold_flags(j, foldbuf, &foldlen,
15923                                                         (ASCII_FOLD_RESTRICTED)
15924                                                         ? FOLD_FLAGS_NOMIX_ASCII
15925                                                         : 0);
15926
15927                     /* Single character fold of above Latin1.  Add everything in
15928                     * its fold closure to the list that this node should match.
15929                     * The fold closures data structure is a hash with the keys
15930                     * being the UTF-8 of every character that is folded to, like
15931                     * 'k', and the values each an array of all code points that
15932                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
15933                     * Multi-character folds are not included */
15934                     if ((listp = hv_fetch(PL_utf8_foldclosures,
15935                                         (char *) foldbuf, foldlen, FALSE)))
15936                     {
15937                         AV* list = (AV*) *listp;
15938                         IV k;
15939                         for (k = 0; k <= av_tindex(list); k++) {
15940                             SV** c_p = av_fetch(list, k, FALSE);
15941                             UV c;
15942                             assert(c_p);
15943
15944                             c = SvUV(*c_p);
15945
15946                             /* /aa doesn't allow folds between ASCII and non- */
15947                             if ((ASCII_FOLD_RESTRICTED
15948                                 && (isASCII(c) != isASCII(j))))
15949                             {
15950                                 continue;
15951                             }
15952
15953                             /* Folds under /l which cross the 255/256 boundary
15954                              * are added to a separate list.  (These are valid
15955                              * only when the locale is UTF-8.) */
15956                             if (c < 256 && LOC) {
15957                                 *use_list = add_cp_to_invlist(*use_list, c);
15958                                 continue;
15959                             }
15960
15961                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15962                             {
15963                                 cp_list = add_cp_to_invlist(cp_list, c);
15964                             }
15965                             else {
15966                                 /* Similarly folds involving non-ascii Latin1
15967                                 * characters under /d are added to their list */
15968                                 has_upper_latin1_only_utf8_matches
15969                                         = add_cp_to_invlist(
15970                                            has_upper_latin1_only_utf8_matches,
15971                                            c);
15972                             }
15973                         }
15974                     }
15975                 }
15976             }
15977             SvREFCNT_dec_NN(fold_intersection);
15978         }
15979
15980         /* Now that we have finished adding all the folds, there is no reason
15981          * to keep the foldable list separate */
15982         _invlist_union(cp_list, cp_foldable_list, &cp_list);
15983         SvREFCNT_dec_NN(cp_foldable_list);
15984     }
15985
15986     /* And combine the result (if any) with any inversion list from posix
15987      * classes.  The lists are kept separate up to now because we don't want to
15988      * fold the classes (folding of those is automatically handled by the swash
15989      * fetching code) */
15990     if (simple_posixes) {
15991         _invlist_union(cp_list, simple_posixes, &cp_list);
15992         SvREFCNT_dec_NN(simple_posixes);
15993     }
15994     if (posixes || nposixes) {
15995         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15996             /* Under /a and /aa, nothing above ASCII matches these */
15997             _invlist_intersection(posixes,
15998                                   PL_XPosix_ptrs[_CC_ASCII],
15999                                   &posixes);
16000         }
16001         if (nposixes) {
16002             if (DEPENDS_SEMANTICS) {
16003                 /* Under /d, everything in the upper half of the Latin1 range
16004                  * matches these complements */
16005                 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
16006             }
16007             else if (AT_LEAST_ASCII_RESTRICTED) {
16008                 /* Under /a and /aa, everything above ASCII matches these
16009                  * complements */
16010                 _invlist_union_complement_2nd(nposixes,
16011                                               PL_XPosix_ptrs[_CC_ASCII],
16012                                               &nposixes);
16013             }
16014             if (posixes) {
16015                 _invlist_union(posixes, nposixes, &posixes);
16016                 SvREFCNT_dec_NN(nposixes);
16017             }
16018             else {
16019                 posixes = nposixes;
16020             }
16021         }
16022         if (! DEPENDS_SEMANTICS) {
16023             if (cp_list) {
16024                 _invlist_union(cp_list, posixes, &cp_list);
16025                 SvREFCNT_dec_NN(posixes);
16026             }
16027             else {
16028                 cp_list = posixes;
16029             }
16030         }
16031         else {
16032             /* Under /d, we put into a separate list the Latin1 things that
16033              * match only when the target string is utf8 */
16034             SV* nonascii_but_latin1_properties = NULL;
16035             _invlist_intersection(posixes, PL_UpperLatin1,
16036                                   &nonascii_but_latin1_properties);
16037             _invlist_subtract(posixes, nonascii_but_latin1_properties,
16038                               &posixes);
16039             if (cp_list) {
16040                 _invlist_union(cp_list, posixes, &cp_list);
16041                 SvREFCNT_dec_NN(posixes);
16042             }
16043             else {
16044                 cp_list = posixes;
16045             }
16046
16047             if (has_upper_latin1_only_utf8_matches) {
16048                 _invlist_union(has_upper_latin1_only_utf8_matches,
16049                                nonascii_but_latin1_properties,
16050                                &has_upper_latin1_only_utf8_matches);
16051                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
16052             }
16053             else {
16054                 has_upper_latin1_only_utf8_matches
16055                                             = nonascii_but_latin1_properties;
16056             }
16057         }
16058     }
16059
16060     /* And combine the result (if any) with any inversion list from properties.
16061      * The lists are kept separate up to now so that we can distinguish the two
16062      * in regards to matching above-Unicode.  A run-time warning is generated
16063      * if a Unicode property is matched against a non-Unicode code point. But,
16064      * we allow user-defined properties to match anything, without any warning,
16065      * and we also suppress the warning if there is a portion of the character
16066      * class that isn't a Unicode property, and which matches above Unicode, \W
16067      * or [\x{110000}] for example.
16068      * (Note that in this case, unlike the Posix one above, there is no
16069      * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
16070      * forces Unicode semantics */
16071     if (properties) {
16072         if (cp_list) {
16073
16074             /* If it matters to the final outcome, see if a non-property
16075              * component of the class matches above Unicode.  If so, the
16076              * warning gets suppressed.  This is true even if just a single
16077              * such code point is specified, as though not strictly correct if
16078              * another such code point is matched against, the fact that they
16079              * are using above-Unicode code points indicates they should know
16080              * the issues involved */
16081             if (warn_super) {
16082                 warn_super = ! (invert
16083                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
16084             }
16085
16086             _invlist_union(properties, cp_list, &cp_list);
16087             SvREFCNT_dec_NN(properties);
16088         }
16089         else {
16090             cp_list = properties;
16091         }
16092
16093         if (warn_super) {
16094             ANYOF_FLAGS(ret)
16095              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
16096
16097             /* Because an ANYOF node is the only one that warns, this node
16098              * can't be optimized into something else */
16099             optimizable = FALSE;
16100         }
16101     }
16102
16103     /* Here, we have calculated what code points should be in the character
16104      * class.
16105      *
16106      * Now we can see about various optimizations.  Fold calculation (which we
16107      * did above) needs to take place before inversion.  Otherwise /[^k]/i
16108      * would invert to include K, which under /i would match k, which it
16109      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
16110      * folded until runtime */
16111
16112     /* If we didn't do folding, it's because some information isn't available
16113      * until runtime; set the run-time fold flag for these.  (We don't have to
16114      * worry about properties folding, as that is taken care of by the swash
16115      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
16116      * locales, or the class matches at least one 0-255 range code point */
16117     if (LOC && FOLD) {
16118         if (only_utf8_locale_list) {
16119             ANYOF_FLAGS(ret)
16120                  |=  ANYOFL_FOLD
16121                     |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16122         }
16123         else if (cp_list) { /* Look to see if a 0-255 code point is in list */
16124             UV start, end;
16125             invlist_iterinit(cp_list);
16126             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
16127                 ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
16128             }
16129             invlist_iterfinish(cp_list);
16130         }
16131     }
16132
16133 #define MATCHES_ALL_NON_UTF8_NON_ASCII(ret)                                 \
16134     (   DEPENDS_SEMANTICS                                                   \
16135      && ANYOF_FLAGS(ret)                                                    \
16136         & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
16137
16138     /* See if we can simplify things under /d */
16139     if (   has_upper_latin1_only_utf8_matches
16140         || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
16141     {
16142         if (has_upper_latin1_only_utf8_matches) {
16143             if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
16144
16145                 /* Here, we have two, almost opposite, constraints in effect
16146                  * for upper latin1 characters.  The macro means they all match
16147                  * when the target string ISN'T in UTF-8.
16148                  * 'has_upper_latin1_only_utf8_matches' contains the chars that
16149                  * match only if the target string IS UTF-8.  Therefore the
16150                  * ones in 'has_upper_latin1_only_utf8_matches' match
16151                  * regardless of UTF-8, so can be added to the regular list,
16152                  * and 'has_upper_latin1_only_utf8_matches' cleared */
16153                 _invlist_union(cp_list,
16154                                has_upper_latin1_only_utf8_matches,
16155                                &cp_list);
16156                 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
16157                 has_upper_latin1_only_utf8_matches = NULL;
16158             }
16159             else if (cp_list) {
16160
16161                 /* Here, 'cp_list' gives chars that always match, and
16162                  * 'has_upper_latin1_only_utf8_matches' gives chars that were
16163                  * specified to match only if the target string is in UTF-8.
16164                  * It may be that these overlap, so we can subtract the
16165                  * unconditionally matching from the conditional ones, to make
16166                  * the conditional list as small as possible, perhaps even
16167                  * clearing it, in which case more optimizations are possible
16168                  * later */
16169                 _invlist_subtract(has_upper_latin1_only_utf8_matches,
16170                                   cp_list,
16171                                   &has_upper_latin1_only_utf8_matches);
16172                 if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
16173                     SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
16174                     has_upper_latin1_only_utf8_matches = NULL;
16175                 }
16176             }
16177         }
16178
16179         /* Similarly, if the unconditional matches include every upper latin1
16180          * character, we can clear that flag to permit later optimizations */
16181         if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
16182             SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
16183             _invlist_subtract(only_non_utf8_list, cp_list, &only_non_utf8_list);
16184             if (_invlist_len(only_non_utf8_list) == 0) {
16185                 ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
16186             }
16187             SvREFCNT_dec_NN(only_non_utf8_list);
16188             only_non_utf8_list = NULL;;
16189         }
16190
16191         /* If we haven't gotten rid of all conditional matching, we change the
16192          * regnode type to indicate that */
16193         if (   has_upper_latin1_only_utf8_matches
16194             || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
16195         {
16196             OP(ret) = ANYOFD;
16197             optimizable = FALSE;
16198         }
16199     }
16200 #undef MATCHES_ALL_NON_UTF8_NON_ASCII
16201
16202     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
16203      * at compile time.  Besides not inverting folded locale now, we can't
16204      * invert if there are things such as \w, which aren't known until runtime
16205      * */
16206     if (cp_list
16207         && invert
16208         && OP(ret) != ANYOFD
16209         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
16210         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
16211     {
16212         _invlist_invert(cp_list);
16213
16214         /* Any swash can't be used as-is, because we've inverted things */
16215         if (swash) {
16216             SvREFCNT_dec_NN(swash);
16217             swash = NULL;
16218         }
16219
16220         /* Clear the invert flag since have just done it here */
16221         invert = FALSE;
16222     }
16223
16224     if (ret_invlist) {
16225         assert(cp_list);
16226
16227         *ret_invlist = cp_list;
16228         SvREFCNT_dec(swash);
16229
16230         /* Discard the generated node */
16231         if (SIZE_ONLY) {
16232             RExC_size = orig_size;
16233         }
16234         else {
16235             RExC_emit = orig_emit;
16236         }
16237         return orig_emit;
16238     }
16239
16240     /* Some character classes are equivalent to other nodes.  Such nodes take
16241      * up less room and generally fewer operations to execute than ANYOF nodes.
16242      * Above, we checked for and optimized into some such equivalents for
16243      * certain common classes that are easy to test.  Getting to this point in
16244      * the code means that the class didn't get optimized there.  Since this
16245      * code is only executed in Pass 2, it is too late to save space--it has
16246      * been allocated in Pass 1, and currently isn't given back.  But turning
16247      * things into an EXACTish node can allow the optimizer to join it to any
16248      * adjacent such nodes.  And if the class is equivalent to things like /./,
16249      * expensive run-time swashes can be avoided.  Now that we have more
16250      * complete information, we can find things necessarily missed by the
16251      * earlier code.  Another possible "optimization" that isn't done is that
16252      * something like [Ee] could be changed into an EXACTFU.  khw tried this
16253      * and found that the ANYOF is faster, including for code points not in the
16254      * bitmap.  This still might make sense to do, provided it got joined with
16255      * an adjacent node(s) to create a longer EXACTFU one.  This could be
16256      * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
16257      * routine would know is joinable.  If that didn't happen, the node type
16258      * could then be made a straight ANYOF */
16259
16260     if (optimizable && cp_list && ! invert) {
16261         UV start, end;
16262         U8 op = END;  /* The optimzation node-type */
16263         int posix_class = -1;   /* Illegal value */
16264         const char * cur_parse= RExC_parse;
16265
16266         invlist_iterinit(cp_list);
16267         if (! invlist_iternext(cp_list, &start, &end)) {
16268
16269             /* Here, the list is empty.  This happens, for example, when a
16270              * Unicode property that doesn't match anything is the only element
16271              * in the character class (perluniprops.pod notes such properties).
16272              * */
16273             op = OPFAIL;
16274             *flagp |= HASWIDTH|SIMPLE;
16275         }
16276         else if (start == end) {    /* The range is a single code point */
16277             if (! invlist_iternext(cp_list, &start, &end)
16278
16279                     /* Don't do this optimization if it would require changing
16280                      * the pattern to UTF-8 */
16281                 && (start < 256 || UTF))
16282             {
16283                 /* Here, the list contains a single code point.  Can optimize
16284                  * into an EXACTish node */
16285
16286                 value = start;
16287
16288                 if (! FOLD) {
16289                     op = (LOC)
16290                          ? EXACTL
16291                          : EXACT;
16292                 }
16293                 else if (LOC) {
16294
16295                     /* A locale node under folding with one code point can be
16296                      * an EXACTFL, as its fold won't be calculated until
16297                      * runtime */
16298                     op = EXACTFL;
16299                 }
16300                 else {
16301
16302                     /* Here, we are generally folding, but there is only one
16303                      * code point to match.  If we have to, we use an EXACT
16304                      * node, but it would be better for joining with adjacent
16305                      * nodes in the optimization pass if we used the same
16306                      * EXACTFish node that any such are likely to be.  We can
16307                      * do this iff the code point doesn't participate in any
16308                      * folds.  For example, an EXACTF of a colon is the same as
16309                      * an EXACT one, since nothing folds to or from a colon. */
16310                     if (value < 256) {
16311                         if (IS_IN_SOME_FOLD_L1(value)) {
16312                             op = EXACT;
16313                         }
16314                     }
16315                     else {
16316                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
16317                             op = EXACT;
16318                         }
16319                     }
16320
16321                     /* If we haven't found the node type, above, it means we
16322                      * can use the prevailing one */
16323                     if (op == END) {
16324                         op = compute_EXACTish(pRExC_state);
16325                     }
16326                 }
16327             }
16328         }   /* End of first range contains just a single code point */
16329         else if (start == 0) {
16330             if (end == UV_MAX) {
16331                 op = SANY;
16332                 *flagp |= HASWIDTH|SIMPLE;
16333                 MARK_NAUGHTY(1);
16334             }
16335             else if (end == '\n' - 1
16336                     && invlist_iternext(cp_list, &start, &end)
16337                     && start == '\n' + 1 && end == UV_MAX)
16338             {
16339                 op = REG_ANY;
16340                 *flagp |= HASWIDTH|SIMPLE;
16341                 MARK_NAUGHTY(1);
16342             }
16343         }
16344         invlist_iterfinish(cp_list);
16345
16346         if (op == END) {
16347             const UV cp_list_len = _invlist_len(cp_list);
16348             const UV* cp_list_array = invlist_array(cp_list);
16349
16350             /* Here, didn't find an optimization.  See if this matches any of
16351              * the POSIX classes.  These run slightly faster for above-Unicode
16352              * code points, so don't bother with POSIXA ones nor the 2 that
16353              * have no above-Unicode matches.  We can avoid these checks unless
16354              * the ANYOF matches at least as high as the lowest POSIX one
16355              * (which was manually found to be \v.  The actual code point may
16356              * increase in later Unicode releases, if a higher code point is
16357              * assigned to be \v, but this code will never break.  It would
16358              * just mean we could execute the checks for posix optimizations
16359              * unnecessarily) */
16360
16361             if (cp_list_array[cp_list_len-1] > 0x2029) {
16362                 for (posix_class = 0;
16363                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
16364                      posix_class++)
16365                 {
16366                     int try_inverted;
16367                     if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
16368                         continue;
16369                     }
16370                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
16371
16372                         /* Check if matches normal or inverted */
16373                         if (_invlistEQ(cp_list,
16374                                        PL_XPosix_ptrs[posix_class],
16375                                        try_inverted))
16376                         {
16377                             op = (try_inverted)
16378                                  ? NPOSIXU
16379                                  : POSIXU;
16380                             *flagp |= HASWIDTH|SIMPLE;
16381                             goto found_posix;
16382                         }
16383                     }
16384                 }
16385               found_posix: ;
16386             }
16387         }
16388
16389         if (op != END) {
16390             RExC_parse = (char *)orig_parse;
16391             RExC_emit = (regnode *)orig_emit;
16392
16393             if (regarglen[op]) {
16394                 ret = reganode(pRExC_state, op, 0);
16395             } else {
16396                 ret = reg_node(pRExC_state, op);
16397             }
16398
16399             RExC_parse = (char *)cur_parse;
16400
16401             if (PL_regkind[op] == EXACT) {
16402                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
16403                                            TRUE /* downgradable to EXACT */
16404                                           );
16405             }
16406             else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
16407                 FLAGS(ret) = posix_class;
16408             }
16409
16410             SvREFCNT_dec_NN(cp_list);
16411             return ret;
16412         }
16413     }
16414
16415     /* Here, <cp_list> contains all the code points we can determine at
16416      * compile time that match under all conditions.  Go through it, and
16417      * for things that belong in the bitmap, put them there, and delete from
16418      * <cp_list>.  While we are at it, see if everything above 255 is in the
16419      * list, and if so, set a flag to speed up execution */
16420
16421     populate_ANYOF_from_invlist(ret, &cp_list);
16422
16423     if (invert) {
16424         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
16425     }
16426
16427     /* Here, the bitmap has been populated with all the Latin1 code points that
16428      * always match.  Can now add to the overall list those that match only
16429      * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
16430      * */
16431     if (has_upper_latin1_only_utf8_matches) {
16432         if (cp_list) {
16433             _invlist_union(cp_list,
16434                            has_upper_latin1_only_utf8_matches,
16435                            &cp_list);
16436             SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
16437         }
16438         else {
16439             cp_list = has_upper_latin1_only_utf8_matches;
16440         }
16441         ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
16442     }
16443
16444     /* If there is a swash and more than one element, we can't use the swash in
16445      * the optimization below. */
16446     if (swash && element_count > 1) {
16447         SvREFCNT_dec_NN(swash);
16448         swash = NULL;
16449     }
16450
16451     /* Note that the optimization of using 'swash' if it is the only thing in
16452      * the class doesn't have us change swash at all, so it can include things
16453      * that are also in the bitmap; otherwise we have purposely deleted that
16454      * duplicate information */
16455     set_ANYOF_arg(pRExC_state, ret, cp_list,
16456                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
16457                    ? listsv : NULL,
16458                   only_utf8_locale_list,
16459                   swash, has_user_defined_property);
16460
16461     *flagp |= HASWIDTH|SIMPLE;
16462
16463     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
16464         RExC_contains_locale = 1;
16465     }
16466
16467     return ret;
16468 }
16469
16470 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
16471
16472 STATIC void
16473 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
16474                 regnode* const node,
16475                 SV* const cp_list,
16476                 SV* const runtime_defns,
16477                 SV* const only_utf8_locale_list,
16478                 SV* const swash,
16479                 const bool has_user_defined_property)
16480 {
16481     /* Sets the arg field of an ANYOF-type node 'node', using information about
16482      * the node passed-in.  If there is nothing outside the node's bitmap, the
16483      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
16484      * the count returned by add_data(), having allocated and stored an array,
16485      * av, that that count references, as follows:
16486      *  av[0] stores the character class description in its textual form.
16487      *        This is used later (regexec.c:Perl_regclass_swash()) to
16488      *        initialize the appropriate swash, and is also useful for dumping
16489      *        the regnode.  This is set to &PL_sv_undef if the textual
16490      *        description is not needed at run-time (as happens if the other
16491      *        elements completely define the class)
16492      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
16493      *        computed from av[0].  But if no further computation need be done,
16494      *        the swash is stored here now (and av[0] is &PL_sv_undef).
16495      *  av[2] stores the inversion list of code points that match only if the
16496      *        current locale is UTF-8
16497      *  av[3] stores the cp_list inversion list for use in addition or instead
16498      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
16499      *        (Otherwise everything needed is already in av[0] and av[1])
16500      *  av[4] is set if any component of the class is from a user-defined
16501      *        property; used only if av[3] exists */
16502
16503     UV n;
16504
16505     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
16506
16507     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
16508         assert(! (ANYOF_FLAGS(node)
16509                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
16510         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
16511     }
16512     else {
16513         AV * const av = newAV();
16514         SV *rv;
16515
16516         av_store(av, 0, (runtime_defns)
16517                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
16518         if (swash) {
16519             assert(cp_list);
16520             av_store(av, 1, swash);
16521             SvREFCNT_dec_NN(cp_list);
16522         }
16523         else {
16524             av_store(av, 1, &PL_sv_undef);
16525             if (cp_list) {
16526                 av_store(av, 3, cp_list);
16527                 av_store(av, 4, newSVuv(has_user_defined_property));
16528             }
16529         }
16530
16531         if (only_utf8_locale_list) {
16532             av_store(av, 2, only_utf8_locale_list);
16533         }
16534         else {
16535             av_store(av, 2, &PL_sv_undef);
16536         }
16537
16538         rv = newRV_noinc(MUTABLE_SV(av));
16539         n = add_data(pRExC_state, STR_WITH_LEN("s"));
16540         RExC_rxi->data->data[n] = (void*)rv;
16541         ARG_SET(node, n);
16542     }
16543 }
16544
16545 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
16546 SV *
16547 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
16548                                         const regnode* node,
16549                                         bool doinit,
16550                                         SV** listsvp,
16551                                         SV** only_utf8_locale_ptr,
16552                                         SV*  exclude_list)
16553
16554 {
16555     /* For internal core use only.
16556      * Returns the swash for the input 'node' in the regex 'prog'.
16557      * If <doinit> is 'true', will attempt to create the swash if not already
16558      *    done.
16559      * If <listsvp> is non-null, will return the printable contents of the
16560      *    swash.  This can be used to get debugging information even before the
16561      *    swash exists, by calling this function with 'doinit' set to false, in
16562      *    which case the components that will be used to eventually create the
16563      *    swash are returned  (in a printable form).
16564      * If <exclude_list> is not NULL, it is an inversion list of things to
16565      *    exclude from what's returned in <listsvp>.
16566      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
16567      * that, in spite of this function's name, the swash it returns may include
16568      * the bitmap data as well */
16569
16570     SV *sw  = NULL;
16571     SV *si  = NULL;         /* Input swash initialization string */
16572     SV*  invlist = NULL;
16573
16574     RXi_GET_DECL(prog,progi);
16575     const struct reg_data * const data = prog ? progi->data : NULL;
16576
16577     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
16578
16579     if (data && data->count) {
16580         const U32 n = ARG(node);
16581
16582         if (data->what[n] == 's') {
16583             SV * const rv = MUTABLE_SV(data->data[n]);
16584             AV * const av = MUTABLE_AV(SvRV(rv));
16585             SV **const ary = AvARRAY(av);
16586             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
16587
16588             si = *ary;  /* ary[0] = the string to initialize the swash with */
16589
16590             if (av_tindex(av) >= 2) {
16591                 if (only_utf8_locale_ptr
16592                     && ary[2]
16593                     && ary[2] != &PL_sv_undef)
16594                 {
16595                     *only_utf8_locale_ptr = ary[2];
16596                 }
16597                 else {
16598                     assert(only_utf8_locale_ptr);
16599                     *only_utf8_locale_ptr = NULL;
16600                 }
16601
16602                 /* Elements 3 and 4 are either both present or both absent. [3]
16603                  * is any inversion list generated at compile time; [4]
16604                  * indicates if that inversion list has any user-defined
16605                  * properties in it. */
16606                 if (av_tindex(av) >= 3) {
16607                     invlist = ary[3];
16608                     if (SvUV(ary[4])) {
16609                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
16610                     }
16611                 }
16612                 else {
16613                     invlist = NULL;
16614                 }
16615             }
16616
16617             /* Element [1] is reserved for the set-up swash.  If already there,
16618              * return it; if not, create it and store it there */
16619             if (ary[1] && SvROK(ary[1])) {
16620                 sw = ary[1];
16621             }
16622             else if (doinit && ((si && si != &PL_sv_undef)
16623                                  || (invlist && invlist != &PL_sv_undef))) {
16624                 assert(si);
16625                 sw = _core_swash_init("utf8", /* the utf8 package */
16626                                       "", /* nameless */
16627                                       si,
16628                                       1, /* binary */
16629                                       0, /* not from tr/// */
16630                                       invlist,
16631                                       &swash_init_flags);
16632                 (void)av_store(av, 1, sw);
16633             }
16634         }
16635     }
16636
16637     /* If requested, return a printable version of what this swash matches */
16638     if (listsvp) {
16639         SV* matches_string = newSVpvs("");
16640
16641         /* The swash should be used, if possible, to get the data, as it
16642          * contains the resolved data.  But this function can be called at
16643          * compile-time, before everything gets resolved, in which case we
16644          * return the currently best available information, which is the string
16645          * that will eventually be used to do that resolving, 'si' */
16646         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
16647             && (si && si != &PL_sv_undef))
16648         {
16649             sv_catsv(matches_string, si);
16650         }
16651
16652         /* Add the inversion list to whatever we have.  This may have come from
16653          * the swash, or from an input parameter */
16654         if (invlist) {
16655             if (exclude_list) {
16656                 SV* clone = invlist_clone(invlist);
16657                 _invlist_subtract(clone, exclude_list, &clone);
16658                 sv_catsv(matches_string, _invlist_contents(clone));
16659                 SvREFCNT_dec_NN(clone);
16660             }
16661             else {
16662                 sv_catsv(matches_string, _invlist_contents(invlist));
16663             }
16664         }
16665         *listsvp = matches_string;
16666     }
16667
16668     return sw;
16669 }
16670 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
16671
16672 /* reg_skipcomment()
16673
16674    Absorbs an /x style # comment from the input stream,
16675    returning a pointer to the first character beyond the comment, or if the
16676    comment terminates the pattern without anything following it, this returns
16677    one past the final character of the pattern (in other words, RExC_end) and
16678    sets the REG_RUN_ON_COMMENT_SEEN flag.
16679
16680    Note it's the callers responsibility to ensure that we are
16681    actually in /x mode
16682
16683 */
16684
16685 PERL_STATIC_INLINE char*
16686 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
16687 {
16688     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
16689
16690     assert(*p == '#');
16691
16692     while (p < RExC_end) {
16693         if (*(++p) == '\n') {
16694             return p+1;
16695         }
16696     }
16697
16698     /* we ran off the end of the pattern without ending the comment, so we have
16699      * to add an \n when wrapping */
16700     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
16701     return p;
16702 }
16703
16704 STATIC void
16705 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
16706                                 char ** p,
16707                                 const bool force_to_xmod
16708                          )
16709 {
16710     /* If the text at the current parse position '*p' is a '(?#...)' comment,
16711      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
16712      * is /x whitespace, advance '*p' so that on exit it points to the first
16713      * byte past all such white space and comments */
16714
16715     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
16716
16717     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
16718
16719     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
16720
16721     for (;;) {
16722         if (RExC_end - (*p) >= 3
16723             && *(*p)     == '('
16724             && *(*p + 1) == '?'
16725             && *(*p + 2) == '#')
16726         {
16727             while (*(*p) != ')') {
16728                 if ((*p) == RExC_end)
16729                     FAIL("Sequence (?#... not terminated");
16730                 (*p)++;
16731             }
16732             (*p)++;
16733             continue;
16734         }
16735
16736         if (use_xmod) {
16737             const char * save_p = *p;
16738             while ((*p) < RExC_end) {
16739                 STRLEN len;
16740                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
16741                     (*p) += len;
16742                 }
16743                 else if (*(*p) == '#') {
16744                     (*p) = reg_skipcomment(pRExC_state, (*p));
16745                 }
16746                 else {
16747                     break;
16748                 }
16749             }
16750             if (*p != save_p) {
16751                 continue;
16752             }
16753         }
16754
16755         break;
16756     }
16757
16758     return;
16759 }
16760
16761 /* nextchar()
16762
16763    Advances the parse position by one byte, unless that byte is the beginning
16764    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
16765    those two cases, the parse position is advanced beyond all such comments and
16766    white space.
16767
16768    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
16769 */
16770
16771 STATIC void
16772 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
16773 {
16774     PERL_ARGS_ASSERT_NEXTCHAR;
16775
16776     assert(   ! UTF
16777            || UTF8_IS_INVARIANT(*RExC_parse)
16778            || UTF8_IS_START(*RExC_parse));
16779
16780     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16781
16782     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16783                             FALSE /* Don't assume /x */ );
16784 }
16785
16786 STATIC regnode *
16787 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
16788 {
16789     /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
16790      * space.  In pass1, it aligns and increments RExC_size; in pass2,
16791      * RExC_emit */
16792
16793     regnode * const ret = RExC_emit;
16794     GET_RE_DEBUG_FLAGS_DECL;
16795
16796     PERL_ARGS_ASSERT_REGNODE_GUTS;
16797
16798     assert(extra_size >= regarglen[op]);
16799
16800     if (SIZE_ONLY) {
16801         SIZE_ALIGN(RExC_size);
16802         RExC_size += 1 + extra_size;
16803         return(ret);
16804     }
16805     if (RExC_emit >= RExC_emit_bound)
16806         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
16807                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
16808
16809     NODE_ALIGN_FILL(ret);
16810 #ifndef RE_TRACK_PATTERN_OFFSETS
16811     PERL_UNUSED_ARG(name);
16812 #else
16813     if (RExC_offsets) {         /* MJD */
16814         MJD_OFFSET_DEBUG(
16815               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
16816               name, __LINE__,
16817               PL_reg_name[op],
16818               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
16819                 ? "Overwriting end of array!\n" : "OK",
16820               (UV)(RExC_emit - RExC_emit_start),
16821               (UV)(RExC_parse - RExC_start),
16822               (UV)RExC_offsets[0]));
16823         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
16824     }
16825 #endif
16826     return(ret);
16827 }
16828
16829 /*
16830 - reg_node - emit a node
16831 */
16832 STATIC regnode *                        /* Location. */
16833 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
16834 {
16835     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
16836
16837     PERL_ARGS_ASSERT_REG_NODE;
16838
16839     assert(regarglen[op] == 0);
16840
16841     if (PASS2) {
16842         regnode *ptr = ret;
16843         FILL_ADVANCE_NODE(ptr, op);
16844         RExC_emit = ptr;
16845     }
16846     return(ret);
16847 }
16848
16849 /*
16850 - reganode - emit a node with an argument
16851 */
16852 STATIC regnode *                        /* Location. */
16853 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
16854 {
16855     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
16856
16857     PERL_ARGS_ASSERT_REGANODE;
16858
16859     assert(regarglen[op] == 1);
16860
16861     if (PASS2) {
16862         regnode *ptr = ret;
16863         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
16864         RExC_emit = ptr;
16865     }
16866     return(ret);
16867 }
16868
16869 STATIC regnode *
16870 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
16871 {
16872     /* emit a node with U32 and I32 arguments */
16873
16874     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
16875
16876     PERL_ARGS_ASSERT_REG2LANODE;
16877
16878     assert(regarglen[op] == 2);
16879
16880     if (PASS2) {
16881         regnode *ptr = ret;
16882         FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
16883         RExC_emit = ptr;
16884     }
16885     return(ret);
16886 }
16887
16888 /*
16889 - reginsert - insert an operator in front of already-emitted operand
16890 *
16891 * Means relocating the operand.
16892 */
16893 STATIC void
16894 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
16895 {
16896     regnode *src;
16897     regnode *dst;
16898     regnode *place;
16899     const int offset = regarglen[(U8)op];
16900     const int size = NODE_STEP_REGNODE + offset;
16901     GET_RE_DEBUG_FLAGS_DECL;
16902
16903     PERL_ARGS_ASSERT_REGINSERT;
16904     PERL_UNUSED_CONTEXT;
16905     PERL_UNUSED_ARG(depth);
16906 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
16907     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
16908     if (SIZE_ONLY) {
16909         RExC_size += size;
16910         return;
16911     }
16912
16913     src = RExC_emit;
16914     RExC_emit += size;
16915     dst = RExC_emit;
16916     if (RExC_open_parens) {
16917         int paren;
16918         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
16919         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
16920             if ( RExC_open_parens[paren] >= opnd ) {
16921                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
16922                 RExC_open_parens[paren] += size;
16923             } else {
16924                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
16925             }
16926             if ( RExC_close_parens[paren] >= opnd ) {
16927                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
16928                 RExC_close_parens[paren] += size;
16929             } else {
16930                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
16931             }
16932         }
16933     }
16934
16935     while (src > opnd) {
16936         StructCopy(--src, --dst, regnode);
16937 #ifdef RE_TRACK_PATTERN_OFFSETS
16938         if (RExC_offsets) {     /* MJD 20010112 */
16939             MJD_OFFSET_DEBUG(
16940                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
16941                   "reg_insert",
16942                   __LINE__,
16943                   PL_reg_name[op],
16944                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
16945                     ? "Overwriting end of array!\n" : "OK",
16946                   (UV)(src - RExC_emit_start),
16947                   (UV)(dst - RExC_emit_start),
16948                   (UV)RExC_offsets[0]));
16949             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
16950             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
16951         }
16952 #endif
16953     }
16954
16955
16956     place = opnd;               /* Op node, where operand used to be. */
16957 #ifdef RE_TRACK_PATTERN_OFFSETS
16958     if (RExC_offsets) {         /* MJD */
16959         MJD_OFFSET_DEBUG(
16960               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
16961               "reginsert",
16962               __LINE__,
16963               PL_reg_name[op],
16964               (UV)(place - RExC_emit_start) > RExC_offsets[0]
16965               ? "Overwriting end of array!\n" : "OK",
16966               (UV)(place - RExC_emit_start),
16967               (UV)(RExC_parse - RExC_start),
16968               (UV)RExC_offsets[0]));
16969         Set_Node_Offset(place, RExC_parse);
16970         Set_Node_Length(place, 1);
16971     }
16972 #endif
16973     src = NEXTOPER(place);
16974     FILL_ADVANCE_NODE(place, op);
16975     Zero(src, offset, regnode);
16976 }
16977
16978 /*
16979 - regtail - set the next-pointer at the end of a node chain of p to val.
16980 - SEE ALSO: regtail_study
16981 */
16982 STATIC void
16983 S_regtail(pTHX_ RExC_state_t * pRExC_state,
16984                 const regnode * const p,
16985                 const regnode * const val,
16986                 const U32 depth)
16987 {
16988     regnode *scan;
16989     GET_RE_DEBUG_FLAGS_DECL;
16990
16991     PERL_ARGS_ASSERT_REGTAIL;
16992 #ifndef DEBUGGING
16993     PERL_UNUSED_ARG(depth);
16994 #endif
16995
16996     if (SIZE_ONLY)
16997         return;
16998
16999     /* Find last node. */
17000     scan = (regnode *) p;
17001     for (;;) {
17002         regnode * const temp = regnext(scan);
17003         DEBUG_PARSE_r({
17004             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
17005             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
17006             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
17007                 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
17008                     (temp == NULL ? "->" : ""),
17009                     (temp == NULL ? PL_reg_name[OP(val)] : "")
17010             );
17011         });
17012         if (temp == NULL)
17013             break;
17014         scan = temp;
17015     }
17016
17017     if (reg_off_by_arg[OP(scan)]) {
17018         ARG_SET(scan, val - scan);
17019     }
17020     else {
17021         NEXT_OFF(scan) = val - scan;
17022     }
17023 }
17024
17025 #ifdef DEBUGGING
17026 /*
17027 - regtail_study - set the next-pointer at the end of a node chain of p to val.
17028 - Look for optimizable sequences at the same time.
17029 - currently only looks for EXACT chains.
17030
17031 This is experimental code. The idea is to use this routine to perform
17032 in place optimizations on branches and groups as they are constructed,
17033 with the long term intention of removing optimization from study_chunk so
17034 that it is purely analytical.
17035
17036 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
17037 to control which is which.
17038
17039 */
17040 /* TODO: All four parms should be const */
17041
17042 STATIC U8
17043 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
17044                       const regnode *val,U32 depth)
17045 {
17046     regnode *scan;
17047     U8 exact = PSEUDO;
17048 #ifdef EXPERIMENTAL_INPLACESCAN
17049     I32 min = 0;
17050 #endif
17051     GET_RE_DEBUG_FLAGS_DECL;
17052
17053     PERL_ARGS_ASSERT_REGTAIL_STUDY;
17054
17055
17056     if (SIZE_ONLY)
17057         return exact;
17058
17059     /* Find last node. */
17060
17061     scan = p;
17062     for (;;) {
17063         regnode * const temp = regnext(scan);
17064 #ifdef EXPERIMENTAL_INPLACESCAN
17065         if (PL_regkind[OP(scan)] == EXACT) {
17066             bool unfolded_multi_char;   /* Unexamined in this routine */
17067             if (join_exact(pRExC_state, scan, &min,
17068                            &unfolded_multi_char, 1, val, depth+1))
17069                 return EXACT;
17070         }
17071 #endif
17072         if ( exact ) {
17073             switch (OP(scan)) {
17074                 case EXACT:
17075                 case EXACTL:
17076                 case EXACTF:
17077                 case EXACTFA_NO_TRIE:
17078                 case EXACTFA:
17079                 case EXACTFU:
17080                 case EXACTFLU8:
17081                 case EXACTFU_SS:
17082                 case EXACTFL:
17083                         if( exact == PSEUDO )
17084                             exact= OP(scan);
17085                         else if ( exact != OP(scan) )
17086                             exact= 0;
17087                 case NOTHING:
17088                     break;
17089                 default:
17090                     exact= 0;
17091             }
17092         }
17093         DEBUG_PARSE_r({
17094             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
17095             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
17096             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
17097                 SvPV_nolen_const(RExC_mysv),
17098                 REG_NODE_NUM(scan),
17099                 PL_reg_name[exact]);
17100         });
17101         if (temp == NULL)
17102             break;
17103         scan = temp;
17104     }
17105     DEBUG_PARSE_r({
17106         DEBUG_PARSE_MSG("");
17107         regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
17108         PerlIO_printf(Perl_debug_log,
17109                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
17110                       SvPV_nolen_const(RExC_mysv),
17111                       (IV)REG_NODE_NUM(val),
17112                       (IV)(val - scan)
17113         );
17114     });
17115     if (reg_off_by_arg[OP(scan)]) {
17116         ARG_SET(scan, val - scan);
17117     }
17118     else {
17119         NEXT_OFF(scan) = val - scan;
17120     }
17121
17122     return exact;
17123 }
17124 #endif
17125
17126 /*
17127  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
17128  */
17129 #ifdef DEBUGGING
17130
17131 static void
17132 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
17133 {
17134     int bit;
17135     int set=0;
17136
17137     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
17138
17139     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
17140         if (flags & (1<<bit)) {
17141             if (!set++ && lead)
17142                 PerlIO_printf(Perl_debug_log, "%s",lead);
17143             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
17144         }
17145     }
17146     if (lead)  {
17147         if (set)
17148             PerlIO_printf(Perl_debug_log, "\n");
17149         else
17150             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
17151     }
17152 }
17153
17154 static void
17155 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
17156 {
17157     int bit;
17158     int set=0;
17159     regex_charset cs;
17160
17161     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
17162
17163     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
17164         if (flags & (1<<bit)) {
17165             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
17166                 continue;
17167             }
17168             if (!set++ && lead)
17169                 PerlIO_printf(Perl_debug_log, "%s",lead);
17170             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
17171         }
17172     }
17173     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
17174             if (!set++ && lead) {
17175                 PerlIO_printf(Perl_debug_log, "%s",lead);
17176             }
17177             switch (cs) {
17178                 case REGEX_UNICODE_CHARSET:
17179                     PerlIO_printf(Perl_debug_log, "UNICODE");
17180                     break;
17181                 case REGEX_LOCALE_CHARSET:
17182                     PerlIO_printf(Perl_debug_log, "LOCALE");
17183                     break;
17184                 case REGEX_ASCII_RESTRICTED_CHARSET:
17185                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
17186                     break;
17187                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
17188                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
17189                     break;
17190                 default:
17191                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
17192                     break;
17193             }
17194     }
17195     if (lead)  {
17196         if (set)
17197             PerlIO_printf(Perl_debug_log, "\n");
17198         else
17199             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
17200     }
17201 }
17202 #endif
17203
17204 void
17205 Perl_regdump(pTHX_ const regexp *r)
17206 {
17207 #ifdef DEBUGGING
17208     SV * const sv = sv_newmortal();
17209     SV *dsv= sv_newmortal();
17210     RXi_GET_DECL(r,ri);
17211     GET_RE_DEBUG_FLAGS_DECL;
17212
17213     PERL_ARGS_ASSERT_REGDUMP;
17214
17215     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
17216
17217     /* Header fields of interest. */
17218     if (r->anchored_substr) {
17219         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
17220             RE_SV_DUMPLEN(r->anchored_substr), 30);
17221         PerlIO_printf(Perl_debug_log,
17222                       "anchored %s%s at %"IVdf" ",
17223                       s, RE_SV_TAIL(r->anchored_substr),
17224                       (IV)r->anchored_offset);
17225     } else if (r->anchored_utf8) {
17226         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
17227             RE_SV_DUMPLEN(r->anchored_utf8), 30);
17228         PerlIO_printf(Perl_debug_log,
17229                       "anchored utf8 %s%s at %"IVdf" ",
17230                       s, RE_SV_TAIL(r->anchored_utf8),
17231                       (IV)r->anchored_offset);
17232     }
17233     if (r->float_substr) {
17234         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
17235             RE_SV_DUMPLEN(r->float_substr), 30);
17236         PerlIO_printf(Perl_debug_log,
17237                       "floating %s%s at %"IVdf"..%"UVuf" ",
17238                       s, RE_SV_TAIL(r->float_substr),
17239                       (IV)r->float_min_offset, (UV)r->float_max_offset);
17240     } else if (r->float_utf8) {
17241         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
17242             RE_SV_DUMPLEN(r->float_utf8), 30);
17243         PerlIO_printf(Perl_debug_log,
17244                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
17245                       s, RE_SV_TAIL(r->float_utf8),
17246                       (IV)r->float_min_offset, (UV)r->float_max_offset);
17247     }
17248     if (r->check_substr || r->check_utf8)
17249         PerlIO_printf(Perl_debug_log,
17250                       (const char *)
17251                       (r->check_substr == r->float_substr
17252                        && r->check_utf8 == r->float_utf8
17253                        ? "(checking floating" : "(checking anchored"));
17254     if (r->intflags & PREGf_NOSCAN)
17255         PerlIO_printf(Perl_debug_log, " noscan");
17256     if (r->extflags & RXf_CHECK_ALL)
17257         PerlIO_printf(Perl_debug_log, " isall");
17258     if (r->check_substr || r->check_utf8)
17259         PerlIO_printf(Perl_debug_log, ") ");
17260
17261     if (ri->regstclass) {
17262         regprop(r, sv, ri->regstclass, NULL, NULL);
17263         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
17264     }
17265     if (r->intflags & PREGf_ANCH) {
17266         PerlIO_printf(Perl_debug_log, "anchored");
17267         if (r->intflags & PREGf_ANCH_MBOL)
17268             PerlIO_printf(Perl_debug_log, "(MBOL)");
17269         if (r->intflags & PREGf_ANCH_SBOL)
17270             PerlIO_printf(Perl_debug_log, "(SBOL)");
17271         if (r->intflags & PREGf_ANCH_GPOS)
17272             PerlIO_printf(Perl_debug_log, "(GPOS)");
17273         (void)PerlIO_putc(Perl_debug_log, ' ');
17274     }
17275     if (r->intflags & PREGf_GPOS_SEEN)
17276         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
17277     if (r->intflags & PREGf_SKIP)
17278         PerlIO_printf(Perl_debug_log, "plus ");
17279     if (r->intflags & PREGf_IMPLICIT)
17280         PerlIO_printf(Perl_debug_log, "implicit ");
17281     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
17282     if (r->extflags & RXf_EVAL_SEEN)
17283         PerlIO_printf(Perl_debug_log, "with eval ");
17284     PerlIO_printf(Perl_debug_log, "\n");
17285     DEBUG_FLAGS_r({
17286         regdump_extflags("r->extflags: ",r->extflags);
17287         regdump_intflags("r->intflags: ",r->intflags);
17288     });
17289 #else
17290     PERL_ARGS_ASSERT_REGDUMP;
17291     PERL_UNUSED_CONTEXT;
17292     PERL_UNUSED_ARG(r);
17293 #endif  /* DEBUGGING */
17294 }
17295
17296 /*
17297 - regprop - printable representation of opcode, with run time support
17298 */
17299
17300 void
17301 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
17302 {
17303 #ifdef DEBUGGING
17304     int k;
17305
17306     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
17307     static const char * const anyofs[] = {
17308 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
17309     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
17310     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
17311     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
17312     || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
17313   #error Need to adjust order of anyofs[]
17314 #endif
17315         "\\w",
17316         "\\W",
17317         "\\d",
17318         "\\D",
17319         "[:alpha:]",
17320         "[:^alpha:]",
17321         "[:lower:]",
17322         "[:^lower:]",
17323         "[:upper:]",
17324         "[:^upper:]",
17325         "[:punct:]",
17326         "[:^punct:]",
17327         "[:print:]",
17328         "[:^print:]",
17329         "[:alnum:]",
17330         "[:^alnum:]",
17331         "[:graph:]",
17332         "[:^graph:]",
17333         "[:cased:]",
17334         "[:^cased:]",
17335         "\\s",
17336         "\\S",
17337         "[:blank:]",
17338         "[:^blank:]",
17339         "[:xdigit:]",
17340         "[:^xdigit:]",
17341         "[:cntrl:]",
17342         "[:^cntrl:]",
17343         "[:ascii:]",
17344         "[:^ascii:]",
17345         "\\v",
17346         "\\V"
17347     };
17348     RXi_GET_DECL(prog,progi);
17349     GET_RE_DEBUG_FLAGS_DECL;
17350
17351     PERL_ARGS_ASSERT_REGPROP;
17352
17353     sv_setpvn(sv, "", 0);
17354
17355     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
17356         /* It would be nice to FAIL() here, but this may be called from
17357            regexec.c, and it would be hard to supply pRExC_state. */
17358         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17359                                               (int)OP(o), (int)REGNODE_MAX);
17360     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
17361
17362     k = PL_regkind[OP(o)];
17363
17364     if (k == EXACT) {
17365         sv_catpvs(sv, " ");
17366         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
17367          * is a crude hack but it may be the best for now since
17368          * we have no flag "this EXACTish node was UTF-8"
17369          * --jhi */
17370         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
17371                   PERL_PV_ESCAPE_UNI_DETECT |
17372                   PERL_PV_ESCAPE_NONASCII   |
17373                   PERL_PV_PRETTY_ELLIPSES   |
17374                   PERL_PV_PRETTY_LTGT       |
17375                   PERL_PV_PRETTY_NOCLEAR
17376                   );
17377     } else if (k == TRIE) {
17378         /* print the details of the trie in dumpuntil instead, as
17379          * progi->data isn't available here */
17380         const char op = OP(o);
17381         const U32 n = ARG(o);
17382         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
17383                (reg_ac_data *)progi->data->data[n] :
17384                NULL;
17385         const reg_trie_data * const trie
17386             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
17387
17388         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
17389         DEBUG_TRIE_COMPILE_r(
17390           Perl_sv_catpvf(aTHX_ sv,
17391             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
17392             (UV)trie->startstate,
17393             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
17394             (UV)trie->wordcount,
17395             (UV)trie->minlen,
17396             (UV)trie->maxlen,
17397             (UV)TRIE_CHARCOUNT(trie),
17398             (UV)trie->uniquecharcount
17399           );
17400         );
17401         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
17402             sv_catpvs(sv, "[");
17403             (void) put_charclass_bitmap_innards(sv,
17404                                                 (IS_ANYOF_TRIE(op))
17405                                                  ? ANYOF_BITMAP(o)
17406                                                  : TRIE_BITMAP(trie),
17407                                                 NULL);
17408             sv_catpvs(sv, "]");
17409         }
17410
17411     } else if (k == CURLY) {
17412         U32 lo = ARG1(o), hi = ARG2(o);
17413         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
17414             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
17415         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
17416         if (hi == REG_INFTY)
17417             sv_catpvs(sv, "INFTY");
17418         else
17419             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
17420         sv_catpvs(sv, "}");
17421     }
17422     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
17423         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
17424     else if (k == REF || k == OPEN || k == CLOSE
17425              || k == GROUPP || OP(o)==ACCEPT)
17426     {
17427         AV *name_list= NULL;
17428         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
17429         Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno);        /* Parenth number */
17430         if ( RXp_PAREN_NAMES(prog) ) {
17431             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
17432         } else if ( pRExC_state ) {
17433             name_list= RExC_paren_name_list;
17434         }
17435         if (name_list) {
17436             if ( k != REF || (OP(o) < NREF)) {
17437                 SV **name= av_fetch(name_list, parno, 0 );
17438                 if (name)
17439                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17440             }
17441             else {
17442                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
17443                 I32 *nums=(I32*)SvPVX(sv_dat);
17444                 SV **name= av_fetch(name_list, nums[0], 0 );
17445                 I32 n;
17446                 if (name) {
17447                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
17448                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
17449                                     (n ? "," : ""), (IV)nums[n]);
17450                     }
17451                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17452                 }
17453             }
17454         }
17455         if ( k == REF && reginfo) {
17456             U32 n = ARG(o);  /* which paren pair */
17457             I32 ln = prog->offs[n].start;
17458             if (prog->lastparen < n || ln == -1)
17459                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
17460             else if (ln == prog->offs[n].end)
17461                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
17462             else {
17463                 const char *s = reginfo->strbeg + ln;
17464                 Perl_sv_catpvf(aTHX_ sv, ": ");
17465                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
17466                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
17467             }
17468         }
17469     } else if (k == GOSUB) {
17470         AV *name_list= NULL;
17471         if ( RXp_PAREN_NAMES(prog) ) {
17472             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
17473         } else if ( pRExC_state ) {
17474             name_list= RExC_paren_name_list;
17475         }
17476
17477         /* Paren and offset */
17478         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
17479         if (name_list) {
17480             SV **name= av_fetch(name_list, ARG(o), 0 );
17481             if (name)
17482                 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17483         }
17484     }
17485     else if (k == LOGICAL)
17486         /* 2: embedded, otherwise 1 */
17487         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
17488     else if (k == ANYOF) {
17489         const U8 flags = ANYOF_FLAGS(o);
17490         int do_sep = 0;
17491         SV* bitmap_invlist = NULL;  /* Will hold what the bit map contains */
17492
17493
17494         if (OP(o) == ANYOFL) {
17495             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
17496                 sv_catpvs(sv, "{utf8-loc}");
17497             }
17498             else {
17499                 sv_catpvs(sv, "{loc}");
17500             }
17501         }
17502         if (flags & ANYOFL_FOLD)
17503             sv_catpvs(sv, "{i}");
17504         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
17505         if (flags & ANYOF_INVERT)
17506             sv_catpvs(sv, "^");
17507
17508         /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
17509          * */
17510         do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
17511                                                             &bitmap_invlist);
17512
17513         /* output any special charclass tests (used entirely under use
17514          * locale) * */
17515         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
17516             int i;
17517             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
17518                 if (ANYOF_POSIXL_TEST(o,i)) {
17519                     sv_catpv(sv, anyofs[i]);
17520                     do_sep = 1;
17521                 }
17522             }
17523         }
17524
17525         if (    ARG(o) != ANYOF_ONLY_HAS_BITMAP
17526             || (flags
17527                 & ( ANYOF_MATCHES_ALL_ABOVE_BITMAP
17528                    |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
17529                    |ANYOFL_FOLD)))
17530         {
17531             if (do_sep) {
17532                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
17533                 if (flags & ANYOF_INVERT)
17534                     /*make sure the invert info is in each */
17535                     sv_catpvs(sv, "^");
17536             }
17537
17538             if (OP(o) == ANYOFD
17539                 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
17540             {
17541                 sv_catpvs(sv, "{non-utf8-latin1-all}");
17542             }
17543
17544             if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
17545                 sv_catpvs(sv, "{above_bitmap_all}");
17546
17547             if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
17548                 SV *lv; /* Set if there is something outside the bit map. */
17549                 bool byte_output = FALSE;   /* If something has been output */
17550                 SV *only_utf8_locale;
17551
17552                 /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
17553                  * is used to guarantee that nothing in the bitmap gets
17554                  * returned */
17555                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
17556                                                     &lv, &only_utf8_locale,
17557                                                     bitmap_invlist);
17558                 if (lv && lv != &PL_sv_undef) {
17559                     char *s = savesvpv(lv);
17560                     char * const origs = s;
17561
17562                     while (*s && *s != '\n')
17563                         s++;
17564
17565                     if (*s == '\n') {
17566                         const char * const t = ++s;
17567
17568                         if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) {
17569                             if (OP(o) == ANYOFD) {
17570                                 sv_catpvs(sv, "{utf8}");
17571                             }
17572                             else {
17573                                 sv_catpvs(sv, "{outside bitmap}");
17574                             }
17575                         }
17576
17577                         if (byte_output) {
17578                             sv_catpvs(sv, " ");
17579                         }
17580
17581                         while (*s) {
17582                             if (*s == '\n') {
17583
17584                                 /* Truncate very long output */
17585                                 if (s - origs > 256) {
17586                                     Perl_sv_catpvf(aTHX_ sv,
17587                                                 "%.*s...",
17588                                                 (int) (s - origs - 1),
17589                                                 t);
17590                                     goto out_dump;
17591                                 }
17592                                 *s = ' ';
17593                             }
17594                             else if (*s == '\t') {
17595                                 *s = '-';
17596                             }
17597                             s++;
17598                         }
17599                         if (s[-1] == ' ')
17600                             s[-1] = 0;
17601
17602                         sv_catpv(sv, t);
17603                     }
17604
17605                   out_dump:
17606
17607                     Safefree(origs);
17608                     SvREFCNT_dec_NN(lv);
17609                 }
17610
17611                 if ((flags & ANYOFL_FOLD)
17612                      && only_utf8_locale
17613                      && only_utf8_locale != &PL_sv_undef)
17614                 {
17615                     UV start, end;
17616                     int max_entries = 256;
17617
17618                     sv_catpvs(sv, "{utf8 locale}");
17619                     invlist_iterinit(only_utf8_locale);
17620                     while (invlist_iternext(only_utf8_locale,
17621                                             &start, &end)) {
17622                         put_range(sv, start, end, FALSE);
17623                         max_entries --;
17624                         if (max_entries < 0) {
17625                             sv_catpvs(sv, "...");
17626                             break;
17627                         }
17628                     }
17629                     invlist_iterfinish(only_utf8_locale);
17630                 }
17631             }
17632         }
17633         SvREFCNT_dec(bitmap_invlist);
17634
17635
17636         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
17637     }
17638     else if (k == POSIXD || k == NPOSIXD) {
17639         U8 index = FLAGS(o) * 2;
17640         if (index < C_ARRAY_LENGTH(anyofs)) {
17641             if (*anyofs[index] != '[')  {
17642                 sv_catpv(sv, "[");
17643             }
17644             sv_catpv(sv, anyofs[index]);
17645             if (*anyofs[index] != '[')  {
17646                 sv_catpv(sv, "]");
17647             }
17648         }
17649         else {
17650             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
17651         }
17652     }
17653     else if (k == BOUND || k == NBOUND) {
17654         /* Must be synced with order of 'bound_type' in regcomp.h */
17655         const char * const bounds[] = {
17656             "",      /* Traditional */
17657             "{gcb}",
17658             "{lb}",
17659             "{sb}",
17660             "{wb}"
17661         };
17662         sv_catpv(sv, bounds[FLAGS(o)]);
17663     }
17664     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
17665         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
17666     else if (OP(o) == SBOL)
17667         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
17668
17669     /* add on the verb argument if there is one */
17670     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
17671         Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
17672                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
17673     }
17674 #else
17675     PERL_UNUSED_CONTEXT;
17676     PERL_UNUSED_ARG(sv);
17677     PERL_UNUSED_ARG(o);
17678     PERL_UNUSED_ARG(prog);
17679     PERL_UNUSED_ARG(reginfo);
17680     PERL_UNUSED_ARG(pRExC_state);
17681 #endif  /* DEBUGGING */
17682 }
17683
17684
17685
17686 SV *
17687 Perl_re_intuit_string(pTHX_ REGEXP * const r)
17688 {                               /* Assume that RE_INTUIT is set */
17689     struct regexp *const prog = ReANY(r);
17690     GET_RE_DEBUG_FLAGS_DECL;
17691
17692     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
17693     PERL_UNUSED_CONTEXT;
17694
17695     DEBUG_COMPILE_r(
17696         {
17697             const char * const s = SvPV_nolen_const(RX_UTF8(r)
17698                       ? prog->check_utf8 : prog->check_substr);
17699
17700             if (!PL_colorset) reginitcolors();
17701             PerlIO_printf(Perl_debug_log,
17702                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
17703                       PL_colors[4],
17704                       RX_UTF8(r) ? "utf8 " : "",
17705                       PL_colors[5],PL_colors[0],
17706                       s,
17707                       PL_colors[1],
17708                       (strlen(s) > 60 ? "..." : ""));
17709         } );
17710
17711     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
17712     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
17713 }
17714
17715 /*
17716    pregfree()
17717
17718    handles refcounting and freeing the perl core regexp structure. When
17719    it is necessary to actually free the structure the first thing it
17720    does is call the 'free' method of the regexp_engine associated to
17721    the regexp, allowing the handling of the void *pprivate; member
17722    first. (This routine is not overridable by extensions, which is why
17723    the extensions free is called first.)
17724
17725    See regdupe and regdupe_internal if you change anything here.
17726 */
17727 #ifndef PERL_IN_XSUB_RE
17728 void
17729 Perl_pregfree(pTHX_ REGEXP *r)
17730 {
17731     SvREFCNT_dec(r);
17732 }
17733
17734 void
17735 Perl_pregfree2(pTHX_ REGEXP *rx)
17736 {
17737     struct regexp *const r = ReANY(rx);
17738     GET_RE_DEBUG_FLAGS_DECL;
17739
17740     PERL_ARGS_ASSERT_PREGFREE2;
17741
17742     if (r->mother_re) {
17743         ReREFCNT_dec(r->mother_re);
17744     } else {
17745         CALLREGFREE_PVT(rx); /* free the private data */
17746         SvREFCNT_dec(RXp_PAREN_NAMES(r));
17747         Safefree(r->xpv_len_u.xpvlenu_pv);
17748     }
17749     if (r->substrs) {
17750         SvREFCNT_dec(r->anchored_substr);
17751         SvREFCNT_dec(r->anchored_utf8);
17752         SvREFCNT_dec(r->float_substr);
17753         SvREFCNT_dec(r->float_utf8);
17754         Safefree(r->substrs);
17755     }
17756     RX_MATCH_COPY_FREE(rx);
17757 #ifdef PERL_ANY_COW
17758     SvREFCNT_dec(r->saved_copy);
17759 #endif
17760     Safefree(r->offs);
17761     SvREFCNT_dec(r->qr_anoncv);
17762     rx->sv_u.svu_rx = 0;
17763 }
17764
17765 /*  reg_temp_copy()
17766
17767     This is a hacky workaround to the structural issue of match results
17768     being stored in the regexp structure which is in turn stored in
17769     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
17770     could be PL_curpm in multiple contexts, and could require multiple
17771     result sets being associated with the pattern simultaneously, such
17772     as when doing a recursive match with (??{$qr})
17773
17774     The solution is to make a lightweight copy of the regexp structure
17775     when a qr// is returned from the code executed by (??{$qr}) this
17776     lightweight copy doesn't actually own any of its data except for
17777     the starp/end and the actual regexp structure itself.
17778
17779 */
17780
17781
17782 REGEXP *
17783 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
17784 {
17785     struct regexp *ret;
17786     struct regexp *const r = ReANY(rx);
17787     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
17788
17789     PERL_ARGS_ASSERT_REG_TEMP_COPY;
17790
17791     if (!ret_x)
17792         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
17793     else {
17794         SvOK_off((SV *)ret_x);
17795         if (islv) {
17796             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
17797                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
17798                made both spots point to the same regexp body.) */
17799             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
17800             assert(!SvPVX(ret_x));
17801             ret_x->sv_u.svu_rx = temp->sv_any;
17802             temp->sv_any = NULL;
17803             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
17804             SvREFCNT_dec_NN(temp);
17805             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
17806                ing below will not set it. */
17807             SvCUR_set(ret_x, SvCUR(rx));
17808         }
17809     }
17810     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
17811        sv_force_normal(sv) is called.  */
17812     SvFAKE_on(ret_x);
17813     ret = ReANY(ret_x);
17814
17815     SvFLAGS(ret_x) |= SvUTF8(rx);
17816     /* We share the same string buffer as the original regexp, on which we
17817        hold a reference count, incremented when mother_re is set below.
17818        The string pointer is copied here, being part of the regexp struct.
17819      */
17820     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
17821            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
17822     if (r->offs) {
17823         const I32 npar = r->nparens+1;
17824         Newx(ret->offs, npar, regexp_paren_pair);
17825         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17826     }
17827     if (r->substrs) {
17828         Newx(ret->substrs, 1, struct reg_substr_data);
17829         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17830
17831         SvREFCNT_inc_void(ret->anchored_substr);
17832         SvREFCNT_inc_void(ret->anchored_utf8);
17833         SvREFCNT_inc_void(ret->float_substr);
17834         SvREFCNT_inc_void(ret->float_utf8);
17835
17836         /* check_substr and check_utf8, if non-NULL, point to either their
17837            anchored or float namesakes, and don't hold a second reference.  */
17838     }
17839     RX_MATCH_COPIED_off(ret_x);
17840 #ifdef PERL_ANY_COW
17841     ret->saved_copy = NULL;
17842 #endif
17843     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
17844     SvREFCNT_inc_void(ret->qr_anoncv);
17845
17846     return ret_x;
17847 }
17848 #endif
17849
17850 /* regfree_internal()
17851
17852    Free the private data in a regexp. This is overloadable by
17853    extensions. Perl takes care of the regexp structure in pregfree(),
17854    this covers the *pprivate pointer which technically perl doesn't
17855    know about, however of course we have to handle the
17856    regexp_internal structure when no extension is in use.
17857
17858    Note this is called before freeing anything in the regexp
17859    structure.
17860  */
17861
17862 void
17863 Perl_regfree_internal(pTHX_ REGEXP * const rx)
17864 {
17865     struct regexp *const r = ReANY(rx);
17866     RXi_GET_DECL(r,ri);
17867     GET_RE_DEBUG_FLAGS_DECL;
17868
17869     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
17870
17871     DEBUG_COMPILE_r({
17872         if (!PL_colorset)
17873             reginitcolors();
17874         {
17875             SV *dsv= sv_newmortal();
17876             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
17877                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
17878             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
17879                 PL_colors[4],PL_colors[5],s);
17880         }
17881     });
17882 #ifdef RE_TRACK_PATTERN_OFFSETS
17883     if (ri->u.offsets)
17884         Safefree(ri->u.offsets);             /* 20010421 MJD */
17885 #endif
17886     if (ri->code_blocks) {
17887         int n;
17888         for (n = 0; n < ri->num_code_blocks; n++)
17889             SvREFCNT_dec(ri->code_blocks[n].src_regex);
17890         Safefree(ri->code_blocks);
17891     }
17892
17893     if (ri->data) {
17894         int n = ri->data->count;
17895
17896         while (--n >= 0) {
17897           /* If you add a ->what type here, update the comment in regcomp.h */
17898             switch (ri->data->what[n]) {
17899             case 'a':
17900             case 'r':
17901             case 's':
17902             case 'S':
17903             case 'u':
17904                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
17905                 break;
17906             case 'f':
17907                 Safefree(ri->data->data[n]);
17908                 break;
17909             case 'l':
17910             case 'L':
17911                 break;
17912             case 'T':
17913                 { /* Aho Corasick add-on structure for a trie node.
17914                      Used in stclass optimization only */
17915                     U32 refcount;
17916                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
17917 #ifdef USE_ITHREADS
17918                     dVAR;
17919 #endif
17920                     OP_REFCNT_LOCK;
17921                     refcount = --aho->refcount;
17922                     OP_REFCNT_UNLOCK;
17923                     if ( !refcount ) {
17924                         PerlMemShared_free(aho->states);
17925                         PerlMemShared_free(aho->fail);
17926                          /* do this last!!!! */
17927                         PerlMemShared_free(ri->data->data[n]);
17928                         /* we should only ever get called once, so
17929                          * assert as much, and also guard the free
17930                          * which /might/ happen twice. At the least
17931                          * it will make code anlyzers happy and it
17932                          * doesn't cost much. - Yves */
17933                         assert(ri->regstclass);
17934                         if (ri->regstclass) {
17935                             PerlMemShared_free(ri->regstclass);
17936                             ri->regstclass = 0;
17937                         }
17938                     }
17939                 }
17940                 break;
17941             case 't':
17942                 {
17943                     /* trie structure. */
17944                     U32 refcount;
17945                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
17946 #ifdef USE_ITHREADS
17947                     dVAR;
17948 #endif
17949                     OP_REFCNT_LOCK;
17950                     refcount = --trie->refcount;
17951                     OP_REFCNT_UNLOCK;
17952                     if ( !refcount ) {
17953                         PerlMemShared_free(trie->charmap);
17954                         PerlMemShared_free(trie->states);
17955                         PerlMemShared_free(trie->trans);
17956                         if (trie->bitmap)
17957                             PerlMemShared_free(trie->bitmap);
17958                         if (trie->jump)
17959                             PerlMemShared_free(trie->jump);
17960                         PerlMemShared_free(trie->wordinfo);
17961                         /* do this last!!!! */
17962                         PerlMemShared_free(ri->data->data[n]);
17963                     }
17964                 }
17965                 break;
17966             default:
17967                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
17968                                                     ri->data->what[n]);
17969             }
17970         }
17971         Safefree(ri->data->what);
17972         Safefree(ri->data);
17973     }
17974
17975     Safefree(ri);
17976 }
17977
17978 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
17979 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
17980 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
17981
17982 /*
17983    re_dup - duplicate a regexp.
17984
17985    This routine is expected to clone a given regexp structure. It is only
17986    compiled under USE_ITHREADS.
17987
17988    After all of the core data stored in struct regexp is duplicated
17989    the regexp_engine.dupe method is used to copy any private data
17990    stored in the *pprivate pointer. This allows extensions to handle
17991    any duplication it needs to do.
17992
17993    See pregfree() and regfree_internal() if you change anything here.
17994 */
17995 #if defined(USE_ITHREADS)
17996 #ifndef PERL_IN_XSUB_RE
17997 void
17998 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
17999 {
18000     dVAR;
18001     I32 npar;
18002     const struct regexp *r = ReANY(sstr);
18003     struct regexp *ret = ReANY(dstr);
18004
18005     PERL_ARGS_ASSERT_RE_DUP_GUTS;
18006
18007     npar = r->nparens+1;
18008     Newx(ret->offs, npar, regexp_paren_pair);
18009     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
18010
18011     if (ret->substrs) {
18012         /* Do it this way to avoid reading from *r after the StructCopy().
18013            That way, if any of the sv_dup_inc()s dislodge *r from the L1
18014            cache, it doesn't matter.  */
18015         const bool anchored = r->check_substr
18016             ? r->check_substr == r->anchored_substr
18017             : r->check_utf8 == r->anchored_utf8;
18018         Newx(ret->substrs, 1, struct reg_substr_data);
18019         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
18020
18021         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
18022         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
18023         ret->float_substr = sv_dup_inc(ret->float_substr, param);
18024         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
18025
18026         /* check_substr and check_utf8, if non-NULL, point to either their
18027            anchored or float namesakes, and don't hold a second reference.  */
18028
18029         if (ret->check_substr) {
18030             if (anchored) {
18031                 assert(r->check_utf8 == r->anchored_utf8);
18032                 ret->check_substr = ret->anchored_substr;
18033                 ret->check_utf8 = ret->anchored_utf8;
18034             } else {
18035                 assert(r->check_substr == r->float_substr);
18036                 assert(r->check_utf8 == r->float_utf8);
18037                 ret->check_substr = ret->float_substr;
18038                 ret->check_utf8 = ret->float_utf8;
18039             }
18040         } else if (ret->check_utf8) {
18041             if (anchored) {
18042                 ret->check_utf8 = ret->anchored_utf8;
18043             } else {
18044                 ret->check_utf8 = ret->float_utf8;
18045             }
18046         }
18047     }
18048
18049     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
18050     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
18051
18052     if (ret->pprivate)
18053         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
18054
18055     if (RX_MATCH_COPIED(dstr))
18056         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
18057     else
18058         ret->subbeg = NULL;
18059 #ifdef PERL_ANY_COW
18060     ret->saved_copy = NULL;
18061 #endif
18062
18063     /* Whether mother_re be set or no, we need to copy the string.  We
18064        cannot refrain from copying it when the storage points directly to
18065        our mother regexp, because that's
18066                1: a buffer in a different thread
18067                2: something we no longer hold a reference on
18068                so we need to copy it locally.  */
18069     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
18070     ret->mother_re   = NULL;
18071 }
18072 #endif /* PERL_IN_XSUB_RE */
18073
18074 /*
18075    regdupe_internal()
18076
18077    This is the internal complement to regdupe() which is used to copy
18078    the structure pointed to by the *pprivate pointer in the regexp.
18079    This is the core version of the extension overridable cloning hook.
18080    The regexp structure being duplicated will be copied by perl prior
18081    to this and will be provided as the regexp *r argument, however
18082    with the /old/ structures pprivate pointer value. Thus this routine
18083    may override any copying normally done by perl.
18084
18085    It returns a pointer to the new regexp_internal structure.
18086 */
18087
18088 void *
18089 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
18090 {
18091     dVAR;
18092     struct regexp *const r = ReANY(rx);
18093     regexp_internal *reti;
18094     int len;
18095     RXi_GET_DECL(r,ri);
18096
18097     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
18098
18099     len = ProgLen(ri);
18100
18101     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
18102           char, regexp_internal);
18103     Copy(ri->program, reti->program, len+1, regnode);
18104
18105     reti->num_code_blocks = ri->num_code_blocks;
18106     if (ri->code_blocks) {
18107         int n;
18108         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
18109                 struct reg_code_block);
18110         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
18111                 struct reg_code_block);
18112         for (n = 0; n < ri->num_code_blocks; n++)
18113              reti->code_blocks[n].src_regex = (REGEXP*)
18114                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
18115     }
18116     else
18117         reti->code_blocks = NULL;
18118
18119     reti->regstclass = NULL;
18120
18121     if (ri->data) {
18122         struct reg_data *d;
18123         const int count = ri->data->count;
18124         int i;
18125
18126         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
18127                 char, struct reg_data);
18128         Newx(d->what, count, U8);
18129
18130         d->count = count;
18131         for (i = 0; i < count; i++) {
18132             d->what[i] = ri->data->what[i];
18133             switch (d->what[i]) {
18134                 /* see also regcomp.h and regfree_internal() */
18135             case 'a': /* actually an AV, but the dup function is identical.  */
18136             case 'r':
18137             case 's':
18138             case 'S':
18139             case 'u': /* actually an HV, but the dup function is identical.  */
18140                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
18141                 break;
18142             case 'f':
18143                 /* This is cheating. */
18144                 Newx(d->data[i], 1, regnode_ssc);
18145                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
18146                 reti->regstclass = (regnode*)d->data[i];
18147                 break;
18148             case 'T':
18149                 /* Trie stclasses are readonly and can thus be shared
18150                  * without duplication. We free the stclass in pregfree
18151                  * when the corresponding reg_ac_data struct is freed.
18152                  */
18153                 reti->regstclass= ri->regstclass;
18154                 /* FALLTHROUGH */
18155             case 't':
18156                 OP_REFCNT_LOCK;
18157                 ((reg_trie_data*)ri->data->data[i])->refcount++;
18158                 OP_REFCNT_UNLOCK;
18159                 /* FALLTHROUGH */
18160             case 'l':
18161             case 'L':
18162                 d->data[i] = ri->data->data[i];
18163                 break;
18164             default:
18165                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
18166                                                            ri->data->what[i]);
18167             }
18168         }
18169
18170         reti->data = d;
18171     }
18172     else
18173         reti->data = NULL;
18174
18175     reti->name_list_idx = ri->name_list_idx;
18176
18177 #ifdef RE_TRACK_PATTERN_OFFSETS
18178     if (ri->u.offsets) {
18179         Newx(reti->u.offsets, 2*len+1, U32);
18180         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
18181     }
18182 #else
18183     SetProgLen(reti,len);
18184 #endif
18185
18186     return (void*)reti;
18187 }
18188
18189 #endif    /* USE_ITHREADS */
18190
18191 #ifndef PERL_IN_XSUB_RE
18192
18193 /*
18194  - regnext - dig the "next" pointer out of a node
18195  */
18196 regnode *
18197 Perl_regnext(pTHX_ regnode *p)
18198 {
18199     I32 offset;
18200
18201     if (!p)
18202         return(NULL);
18203
18204     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
18205         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
18206                                                 (int)OP(p), (int)REGNODE_MAX);
18207     }
18208
18209     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
18210     if (offset == 0)
18211         return(NULL);
18212
18213     return(p+offset);
18214 }
18215 #endif
18216
18217 STATIC void
18218 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
18219 {
18220     va_list args;
18221     STRLEN l1 = strlen(pat1);
18222     STRLEN l2 = strlen(pat2);
18223     char buf[512];
18224     SV *msv;
18225     const char *message;
18226
18227     PERL_ARGS_ASSERT_RE_CROAK2;
18228
18229     if (l1 > 510)
18230         l1 = 510;
18231     if (l1 + l2 > 510)
18232         l2 = 510 - l1;
18233     Copy(pat1, buf, l1 , char);
18234     Copy(pat2, buf + l1, l2 , char);
18235     buf[l1 + l2] = '\n';
18236     buf[l1 + l2 + 1] = '\0';
18237     va_start(args, pat2);
18238     msv = vmess(buf, &args);
18239     va_end(args);
18240     message = SvPV_const(msv,l1);
18241     if (l1 > 512)
18242         l1 = 512;
18243     Copy(message, buf, l1 , char);
18244     /* l1-1 to avoid \n */
18245     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
18246 }
18247
18248 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
18249
18250 #ifndef PERL_IN_XSUB_RE
18251 void
18252 Perl_save_re_context(pTHX)
18253 {
18254     I32 nparens = -1;
18255     I32 i;
18256
18257     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
18258
18259     if (PL_curpm) {
18260         const REGEXP * const rx = PM_GETRE(PL_curpm);
18261         if (rx)
18262             nparens = RX_NPARENS(rx);
18263     }
18264
18265     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
18266      * that PL_curpm will be null, but that utf8.pm and the modules it
18267      * loads will only use $1..$3.
18268      * The t/porting/re_context.t test file checks this assumption.
18269      */
18270     if (nparens == -1)
18271         nparens = 3;
18272
18273     for (i = 1; i <= nparens; i++) {
18274         char digits[TYPE_CHARS(long)];
18275         const STRLEN len = my_snprintf(digits, sizeof(digits),
18276                                        "%lu", (long)i);
18277         GV *const *const gvp
18278             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
18279
18280         if (gvp) {
18281             GV * const gv = *gvp;
18282             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
18283                 save_scalar(gv);
18284         }
18285     }
18286 }
18287 #endif
18288
18289 #ifdef DEBUGGING
18290
18291 STATIC void
18292 S_put_code_point(pTHX_ SV *sv, UV c)
18293 {
18294     PERL_ARGS_ASSERT_PUT_CODE_POINT;
18295
18296     if (c > 255) {
18297         Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
18298     }
18299     else if (isPRINT(c)) {
18300         const char string = (char) c;
18301         if (isBACKSLASHED_PUNCT(c))
18302             sv_catpvs(sv, "\\");
18303         sv_catpvn(sv, &string, 1);
18304     }
18305     else {
18306         const char * const mnemonic = cntrl_to_mnemonic((char) c);
18307         if (mnemonic) {
18308             Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
18309         }
18310         else {
18311             Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
18312         }
18313     }
18314 }
18315
18316 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
18317
18318 STATIC void
18319 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
18320 {
18321     /* Appends to 'sv' a displayable version of the range of code points from
18322      * 'start' to 'end'.  It assumes that only ASCII printables are displayable
18323      * as-is (though some of these will be escaped by put_code_point()). */
18324
18325     const unsigned int min_range_count = 3;
18326
18327     assert(start <= end);
18328
18329     PERL_ARGS_ASSERT_PUT_RANGE;
18330
18331     while (start <= end) {
18332         UV this_end;
18333         const char * format;
18334
18335         if (end - start < min_range_count) {
18336
18337             /* Individual chars in short ranges */
18338             for (; start <= end; start++) {
18339                 put_code_point(sv, start);
18340             }
18341             break;
18342         }
18343
18344         /* If permitted by the input options, and there is a possibility that
18345          * this range contains a printable literal, look to see if there is
18346          * one.  */
18347         if (allow_literals && start <= MAX_PRINT_A) {
18348
18349             /* If the range begin isn't an ASCII printable, effectively split
18350              * the range into two parts:
18351              *  1) the portion before the first such printable,
18352              *  2) the rest
18353              * and output them separately. */
18354             if (! isPRINT_A(start)) {
18355                 UV temp_end = start + 1;
18356
18357                 /* There is no point looking beyond the final possible
18358                  * printable, in MAX_PRINT_A */
18359                 UV max = MIN(end, MAX_PRINT_A);
18360
18361                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
18362                     temp_end++;
18363                 }
18364
18365                 /* Here, temp_end points to one beyond the first printable if
18366                  * found, or to one beyond 'max' if not.  If none found, make
18367                  * sure that we use the entire range */
18368                 if (temp_end > MAX_PRINT_A) {
18369                     temp_end = end + 1;
18370                 }
18371
18372                 /* Output the first part of the split range, the part that
18373                  * doesn't have printables, with no looking for literals
18374                  * (otherwise we would infinitely recurse) */
18375                 put_range(sv, start, temp_end - 1, FALSE);
18376
18377                 /* The 2nd part of the range (if any) starts here. */
18378                 start = temp_end;
18379
18380                 /* We continue instead of dropping down because even if the 2nd
18381                  * part is non-empty, it could be so short that we want to
18382                  * output it specially, as tested for at the top of this loop.
18383                  * */
18384                 continue;
18385             }
18386
18387             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
18388              * output a sub-range of just the digits or letters, then process
18389              * the remaining portion as usual. */
18390             if (isALPHANUMERIC_A(start)) {
18391                 UV mask = (isDIGIT_A(start))
18392                            ? _CC_DIGIT
18393                              : isUPPER_A(start)
18394                                ? _CC_UPPER
18395                                : _CC_LOWER;
18396                 UV temp_end = start + 1;
18397
18398                 /* Find the end of the sub-range that includes just the
18399                  * characters in the same class as the first character in it */
18400                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
18401                     temp_end++;
18402                 }
18403                 temp_end--;
18404
18405                 /* For short ranges, don't duplicate the code above to output
18406                  * them; just call recursively */
18407                 if (temp_end - start < min_range_count) {
18408                     put_range(sv, start, temp_end, FALSE);
18409                 }
18410                 else {  /* Output as a range */
18411                     put_code_point(sv, start);
18412                     sv_catpvs(sv, "-");
18413                     put_code_point(sv, temp_end);
18414                 }
18415                 start = temp_end + 1;
18416                 continue;
18417             }
18418
18419             /* We output any other printables as individual characters */
18420             if (isPUNCT_A(start) || isSPACE_A(start)) {
18421                 while (start <= end && (isPUNCT_A(start)
18422                                         || isSPACE_A(start)))
18423                 {
18424                     put_code_point(sv, start);
18425                     start++;
18426                 }
18427                 continue;
18428             }
18429         } /* End of looking for literals */
18430
18431         /* Here is not to output as a literal.  Some control characters have
18432          * mnemonic names.  Split off any of those at the beginning and end of
18433          * the range to print mnemonically.  It isn't possible for many of
18434          * these to be in a row, so this won't overwhelm with output */
18435         while (isMNEMONIC_CNTRL(start) && start <= end) {
18436             put_code_point(sv, start);
18437             start++;
18438         }
18439         if (start < end && isMNEMONIC_CNTRL(end)) {
18440
18441             /* Here, the final character in the range has a mnemonic name.
18442              * Work backwards from the end to find the final non-mnemonic */
18443             UV temp_end = end - 1;
18444             while (isMNEMONIC_CNTRL(temp_end)) {
18445                 temp_end--;
18446             }
18447
18448             /* And separately output the range that doesn't have mnemonics */
18449             put_range(sv, start, temp_end, FALSE);
18450
18451             /* Then output the mnemonic trailing controls */
18452             start = temp_end + 1;
18453             while (start <= end) {
18454                 put_code_point(sv, start);
18455                 start++;
18456             }
18457             break;
18458         }
18459
18460         /* As a final resort, output the range or subrange as hex. */
18461
18462         this_end = (end < NUM_ANYOF_CODE_POINTS)
18463                     ? end
18464                     : NUM_ANYOF_CODE_POINTS - 1;
18465 #if NUM_ANYOF_CODE_POINTS > 256
18466         format = (this_end < 256)
18467                  ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
18468                  : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
18469 #else
18470         format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}";
18471 #endif
18472         GCC_DIAG_IGNORE(-Wformat-nonliteral);
18473         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
18474         GCC_DIAG_RESTORE;
18475         break;
18476     }
18477 }
18478
18479 STATIC bool
18480 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
18481 {
18482     /* Appends to 'sv' a displayable version of the innards of the bracketed
18483      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
18484      * output anything, and bitmap_invlist, if not NULL, will point to an
18485      * inversion list of what is in the bit map */
18486
18487     int i;
18488     UV start, end;
18489     unsigned int punct_count = 0;
18490     SV* invlist;
18491     bool allow_literals = TRUE;
18492     bool inverted_for_output = FALSE;
18493
18494     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
18495
18496     /* Worst case is exactly every-other code point is in the list */
18497     invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
18498
18499     /* Convert the bit map to an inversion list, keeping track of how many
18500      * ASCII puncts are set, including an extra amount for the backslashed
18501      * ones.  */
18502     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
18503         if (BITMAP_TEST(bitmap, i)) {
18504             invlist = add_cp_to_invlist(invlist, i);
18505             if (isPUNCT_A(i)) {
18506                 punct_count++;
18507                 if isBACKSLASHED_PUNCT(i) {
18508                     punct_count++;
18509                 }
18510             }
18511         }
18512     }
18513
18514     /* Nothing to output */
18515     if (_invlist_len(invlist) == 0) {
18516         SvREFCNT_dec_NN(invlist);
18517         return FALSE;
18518     }
18519
18520     /* Generally, it is more readable if printable characters are output as
18521      * literals, but if a range (nearly) spans all of them, it's best to output
18522      * it as a single range.  This code will use a single range if all but 2
18523      * printables are in it */
18524     invlist_iterinit(invlist);
18525     while (invlist_iternext(invlist, &start, &end)) {
18526
18527         /* If range starts beyond final printable, it doesn't have any in it */
18528         if (start > MAX_PRINT_A) {
18529             break;
18530         }
18531
18532         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
18533          * all but two, the range must start and end no later than 2 from
18534          * either end */
18535         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
18536             if (end > MAX_PRINT_A) {
18537                 end = MAX_PRINT_A;
18538             }
18539             if (start < ' ') {
18540                 start = ' ';
18541             }
18542             if (end - start >= MAX_PRINT_A - ' ' - 2) {
18543                 allow_literals = FALSE;
18544             }
18545             break;
18546         }
18547     }
18548     invlist_iterfinish(invlist);
18549
18550     /* The legibility of the output depends mostly on how many punctuation
18551      * characters are output.  There are 32 possible ASCII ones, and some have
18552      * an additional backslash, bringing it to currently 36, so if any more
18553      * than 18 are to be output, we can instead output it as its complement,
18554      * yielding fewer puncts, and making it more legible.  But give some weight
18555      * to the fact that outputting it as a complement is less legible than a
18556      * straight output, so don't complement unless we are somewhat over the 18
18557      * mark */
18558     if (allow_literals && punct_count > 22) {
18559         sv_catpvs(sv, "^");
18560
18561         /* Add everything remaining to the list, so when we invert it just
18562          * below, it will be excluded */
18563         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
18564         _invlist_invert(invlist);
18565         inverted_for_output = TRUE;
18566     }
18567
18568     /* Here we have figured things out.  Output each range */
18569     invlist_iterinit(invlist);
18570     while (invlist_iternext(invlist, &start, &end)) {
18571         if (start >= NUM_ANYOF_CODE_POINTS) {
18572             break;
18573         }
18574         put_range(sv, start, end, allow_literals);
18575     }
18576     invlist_iterfinish(invlist);
18577
18578     if (bitmap_invlist) {
18579
18580         /* Here, wants the inversion list returned.  If we inverted it, we have
18581          * to restore it to the original */
18582         if (inverted_for_output) {
18583             _invlist_invert(invlist);
18584             _invlist_intersection(invlist, PL_InBitmap, &invlist);
18585         }
18586
18587         *bitmap_invlist = invlist;
18588     }
18589     else {
18590         SvREFCNT_dec_NN(invlist);
18591     }
18592
18593     return TRUE;
18594 }
18595
18596 #define CLEAR_OPTSTART \
18597     if (optstart) STMT_START {                                               \
18598         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
18599                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
18600         optstart=NULL;                                                       \
18601     } STMT_END
18602
18603 #define DUMPUNTIL(b,e)                                                       \
18604                     CLEAR_OPTSTART;                                          \
18605                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
18606
18607 STATIC const regnode *
18608 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
18609             const regnode *last, const regnode *plast,
18610             SV* sv, I32 indent, U32 depth)
18611 {
18612     U8 op = PSEUDO;     /* Arbitrary non-END op. */
18613     const regnode *next;
18614     const regnode *optstart= NULL;
18615
18616     RXi_GET_DECL(r,ri);
18617     GET_RE_DEBUG_FLAGS_DECL;
18618
18619     PERL_ARGS_ASSERT_DUMPUNTIL;
18620
18621 #ifdef DEBUG_DUMPUNTIL
18622     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
18623         last ? last-start : 0,plast ? plast-start : 0);
18624 #endif
18625
18626     if (plast && plast < last)
18627         last= plast;
18628
18629     while (PL_regkind[op] != END && (!last || node < last)) {
18630         assert(node);
18631         /* While that wasn't END last time... */
18632         NODE_ALIGN(node);
18633         op = OP(node);
18634         if (op == CLOSE || op == WHILEM)
18635             indent--;
18636         next = regnext((regnode *)node);
18637
18638         /* Where, what. */
18639         if (OP(node) == OPTIMIZED) {
18640             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
18641                 optstart = node;
18642             else
18643                 goto after_print;
18644         } else
18645             CLEAR_OPTSTART;
18646
18647         regprop(r, sv, node, NULL, NULL);
18648         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
18649                       (int)(2*indent + 1), "", SvPVX_const(sv));
18650
18651         if (OP(node) != OPTIMIZED) {
18652             if (next == NULL)           /* Next ptr. */
18653                 PerlIO_printf(Perl_debug_log, " (0)");
18654             else if (PL_regkind[(U8)op] == BRANCH
18655                      && PL_regkind[OP(next)] != BRANCH )
18656                 PerlIO_printf(Perl_debug_log, " (FAIL)");
18657             else
18658                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
18659             (void)PerlIO_putc(Perl_debug_log, '\n');
18660         }
18661
18662       after_print:
18663         if (PL_regkind[(U8)op] == BRANCHJ) {
18664             assert(next);
18665             {
18666                 const regnode *nnode = (OP(next) == LONGJMP
18667                                        ? regnext((regnode *)next)
18668                                        : next);
18669                 if (last && nnode > last)
18670                     nnode = last;
18671                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
18672             }
18673         }
18674         else if (PL_regkind[(U8)op] == BRANCH) {
18675             assert(next);
18676             DUMPUNTIL(NEXTOPER(node), next);
18677         }
18678         else if ( PL_regkind[(U8)op]  == TRIE ) {
18679             const regnode *this_trie = node;
18680             const char op = OP(node);
18681             const U32 n = ARG(node);
18682             const reg_ac_data * const ac = op>=AHOCORASICK ?
18683                (reg_ac_data *)ri->data->data[n] :
18684                NULL;
18685             const reg_trie_data * const trie =
18686                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
18687 #ifdef DEBUGGING
18688             AV *const trie_words
18689                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
18690 #endif
18691             const regnode *nextbranch= NULL;
18692             I32 word_idx;
18693             sv_setpvs(sv, "");
18694             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
18695                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
18696
18697                 PerlIO_printf(Perl_debug_log, "%*s%s ",
18698                    (int)(2*(indent+3)), "",
18699                     elem_ptr
18700                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
18701                                 SvCUR(*elem_ptr), 60,
18702                                 PL_colors[0], PL_colors[1],
18703                                 (SvUTF8(*elem_ptr)
18704                                  ? PERL_PV_ESCAPE_UNI
18705                                  : 0)
18706                                 | PERL_PV_PRETTY_ELLIPSES
18707                                 | PERL_PV_PRETTY_LTGT
18708                             )
18709                     : "???"
18710                 );
18711                 if (trie->jump) {
18712                     U16 dist= trie->jump[word_idx+1];
18713                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
18714                                (UV)((dist ? this_trie + dist : next) - start));
18715                     if (dist) {
18716                         if (!nextbranch)
18717                             nextbranch= this_trie + trie->jump[0];
18718                         DUMPUNTIL(this_trie + dist, nextbranch);
18719                     }
18720                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
18721                         nextbranch= regnext((regnode *)nextbranch);
18722                 } else {
18723                     PerlIO_printf(Perl_debug_log, "\n");
18724                 }
18725             }
18726             if (last && next > last)
18727                 node= last;
18728             else
18729                 node= next;
18730         }
18731         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
18732             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
18733                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
18734         }
18735         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
18736             assert(next);
18737             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
18738         }
18739         else if ( op == PLUS || op == STAR) {
18740             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
18741         }
18742         else if (PL_regkind[(U8)op] == ANYOF) {
18743             /* arglen 1 + class block */
18744             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
18745                           ? ANYOF_POSIXL_SKIP
18746                           : ANYOF_SKIP);
18747             node = NEXTOPER(node);
18748         }
18749         else if (PL_regkind[(U8)op] == EXACT) {
18750             /* Literal string, where present. */
18751             node += NODE_SZ_STR(node) - 1;
18752             node = NEXTOPER(node);
18753         }
18754         else {
18755             node = NEXTOPER(node);
18756             node += regarglen[(U8)op];
18757         }
18758         if (op == CURLYX || op == OPEN)
18759             indent++;
18760     }
18761     CLEAR_OPTSTART;
18762 #ifdef DEBUG_DUMPUNTIL
18763     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
18764 #endif
18765     return node;
18766 }
18767
18768 #endif  /* DEBUGGING */
18769
18770 /*
18771  * ex: set ts=8 sts=4 sw=4 et:
18772  */