regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC  static
 103 #endif
 104
 105 #ifndef MIN
 106 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 107 #endif
 108
 109 /* this is a chain of data about sub patterns we are processing that
 110    need to be handled separately/specially in study_chunk. Its so
 111    we can simulate recursion without losing state.  */
 112 struct scan_frame;
 113 typedef struct scan_frame {
 114     regnode *last_regnode;      /* last node to process in this frame */
 115     regnode *next_regnode;      /* next node to process when last is reached */
 116     U32 prev_recursed_depth;
 117     I32 stopparen;              /* what stopparen do we use */
 118     U32 is_top_frame;           /* what flags do we use? */
 119
 120     struct scan_frame *this_prev_frame; /* this previous frame */
 121     struct scan_frame *prev_frame;      /* previous frame */
 122     struct scan_frame *next_frame;      /* next frame */
 123 } scan_frame;
 124
 125 struct RExC_state_t {
 126     U32         flags;                  /* RXf_* are we folding, multilining? */
 127     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 128     char        *precomp;               /* uncompiled string. */
 129     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 130     regexp      *rx;                    /* perl core regexp structure */
 131     regexp_internal     *rxi;           /* internal data for regexp object
 132                                            pprivate field */
 133     char        *start;                 /* Start of input for compile */
 134     char        *end;                   /* End of input for compile */
 135     char        *parse;                 /* Input-scan pointer. */
 136     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 137     regnode     *emit_start;            /* Start of emitted-code area */
 138     regnode     *emit_bound;            /* First regnode outside of the
 139                                            allocated space */
 140     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 141                                            implies compiling, so don't emit */
 142     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 143                                            large enough for the largest
 144                                            non-EXACTish node, so can use it as
 145                                            scratch in pass1 */
 146     I32         naughty;                /* How bad is this pattern? */
 147     I32         sawback;                /* Did we see \1, ...? */
 148     U32         seen;
 149     SSize_t     size;                   /* Code size. */
 150     I32                npar;            /* Capture buffer count, (OPEN) plus
 151                                            one. ("par" 0 is the whole
 152                                            pattern)*/
 153     I32         nestroot;               /* root parens we are in - used by
 154                                            accept */
 155     I32         extralen;
 156     I32         seen_zerolen;
 157     regnode     **open_parens;          /* pointers to open parens */
 158     regnode     **close_parens;         /* pointers to close parens */
 159     regnode     *opend;                 /* END node in program */
 160     I32         utf8;           /* whether the pattern is utf8 or not */
 161     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 162                                 /* XXX use this for future optimisation of case
 163                                  * where pattern must be upgraded to utf8. */
 164     I32         uni_semantics;  /* If a d charset modifier should use unicode
 165                                    rules, even if the pattern is not in
 166                                    utf8 */
 167     HV          *paren_names;           /* Paren names */
 168
 169     regnode     **recurse;              /* Recurse regops */
 170     I32         recurse_count;          /* Number of recurse regops */
 171     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 172                                            through */
 173     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 174     I32         in_lookbehind;
 175     I32         contains_locale;
 176     I32         contains_i;
 177     I32         override_recoding;
 178     I32         in_multi_char_class;
 179     struct reg_code_block *code_blocks; /* positions of literal (?{})
 180                                             within pattern */
 181     int         num_code_blocks;        /* size of code_blocks[] */
 182     int         code_index;             /* next code_blocks[] slot */
 183     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 184     scan_frame *frame_head;
 185     scan_frame *frame_last;
 186     U32         frame_count;
 187 #ifdef ADD_TO_REGEXEC
 188     char        *starttry;              /* -Dr: where regtry was called. */
 189 #define RExC_starttry   (pRExC_state->starttry)
 190 #endif
 191     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 192 #ifdef DEBUGGING
 193     const char  *lastparse;
 194     I32         lastnum;
 195     AV          *paren_name_list;       /* idx -> name */
 196     U32         study_chunk_recursed_count;
 197     SV          *mysv1;
 198     SV          *mysv2;
 199 #define RExC_lastparse  (pRExC_state->lastparse)
 200 #define RExC_lastnum    (pRExC_state->lastnum)
 201 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 202 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 203 #define RExC_mysv       (pRExC_state->mysv1)
 204 #define RExC_mysv1      (pRExC_state->mysv1)
 205 #define RExC_mysv2      (pRExC_state->mysv2)
 206
 207 #endif
 208 };
 209
 210 #define RExC_flags      (pRExC_state->flags)
 211 #define RExC_pm_flags   (pRExC_state->pm_flags)
 212 #define RExC_precomp    (pRExC_state->precomp)
 213 #define RExC_rx_sv      (pRExC_state->rx_sv)
 214 #define RExC_rx         (pRExC_state->rx)
 215 #define RExC_rxi        (pRExC_state->rxi)
 216 #define RExC_start      (pRExC_state->start)
 217 #define RExC_end        (pRExC_state->end)
 218 #define RExC_parse      (pRExC_state->parse)
 219 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 220 #ifdef RE_TRACK_PATTERN_OFFSETS
 221 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 222                                                          others */
 223 #endif
 224 #define RExC_emit       (pRExC_state->emit)
 225 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 226 #define RExC_emit_start (pRExC_state->emit_start)
 227 #define RExC_emit_bound (pRExC_state->emit_bound)
 228 #define RExC_sawback    (pRExC_state->sawback)
 229 #define RExC_seen       (pRExC_state->seen)
 230 #define RExC_size       (pRExC_state->size)
 231 #define RExC_maxlen        (pRExC_state->maxlen)
 232 #define RExC_npar       (pRExC_state->npar)
 233 #define RExC_nestroot   (pRExC_state->nestroot)
 234 #define RExC_extralen   (pRExC_state->extralen)
 235 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 236 #define RExC_utf8       (pRExC_state->utf8)
 237 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 238 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 239 #define RExC_open_parens        (pRExC_state->open_parens)
 240 #define RExC_close_parens       (pRExC_state->close_parens)
 241 #define RExC_opend      (pRExC_state->opend)
 242 #define RExC_paren_names        (pRExC_state->paren_names)
 243 #define RExC_recurse    (pRExC_state->recurse)
 244 #define RExC_recurse_count      (pRExC_state->recurse_count)
 245 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 246 #define RExC_study_chunk_recursed_bytes  \
 247                                    (pRExC_state->study_chunk_recursed_bytes)
 248 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 249 #define RExC_contains_locale    (pRExC_state->contains_locale)
 250 #define RExC_contains_i (pRExC_state->contains_i)
 251 #define RExC_override_recoding (pRExC_state->override_recoding)
 252 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 253 #define RExC_frame_head (pRExC_state->frame_head)
 254 #define RExC_frame_last (pRExC_state->frame_last)
 255 #define RExC_frame_count (pRExC_state->frame_count)
 256
 257 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 258  * a flag to disable back-off on the fixed/floating substrings - if it's
 259  * a high complexity pattern we assume the benefit of avoiding a full match
 260  * is worth the cost of checking for the substrings even if they rarely help.
 261  */
 262 #define RExC_naughty    (pRExC_state->naughty)
 263 #define TOO_NAUGHTY (10)
 264 #define MARK_NAUGHTY(add) \
 265     if (RExC_naughty < TOO_NAUGHTY) \
 266         RExC_naughty += (add)
 267 #define MARK_NAUGHTY_EXP(exp, add) \
 268     if (RExC_naughty < TOO_NAUGHTY) \
 269         RExC_naughty += RExC_naughty / (exp) + (add)
 270
 271 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 272 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 273         ((*s) == '{' && regcurly(s)))
 274
 275 /*
 276  * Flags to be passed up and down.
 277  */
 278 #define WORST           0       /* Worst case. */
 279 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 280
 281 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 282  * character.  (There needs to be a case: in the switch statement in regexec.c
 283  * for any node marked SIMPLE.)  Note that this is not the same thing as
 284  * REGNODE_SIMPLE */
 285 #define SIMPLE          0x02
 286 #define SPSTART         0x04    /* Starts with * or + */
 287 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 288 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 289 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 290
 291 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 292
 293 /* whether trie related optimizations are enabled */
 294 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 295 #define TRIE_STUDY_OPT
 296 #define FULL_TRIE_STUDY
 297 #define TRIE_STCLASS
 298 #endif
 299
 300
 301
 302 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 303 #define PBITVAL(paren) (1 << ((paren) & 7))
 304 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 305 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 306 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 307
 308 #define REQUIRE_UTF8    STMT_START {                                       \
 309                                      if (!UTF) {                           \
 310                                          *flagp = RESTART_UTF8;            \
 311                                          return NULL;                      \
 312                                      }                                     \
 313                         } STMT_END
 314
 315 /* This converts the named class defined in regcomp.h to its equivalent class
 316  * number defined in handy.h. */
 317 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 318 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 319
 320 #define _invlist_union_complement_2nd(a, b, output) \
 321                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 322 #define _invlist_intersection_complement_2nd(a, b, output) \
 323                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 324
 325 /* About scan_data_t.
 326
 327   During optimisation we recurse through the regexp program performing
 328   various inplace (keyhole style) optimisations. In addition study_chunk
 329   and scan_commit populate this data structure with information about
 330   what strings MUST appear in the pattern. We look for the longest
 331   string that must appear at a fixed location, and we look for the
 332   longest string that may appear at a floating location. So for instance
 333   in the pattern:
 334
 335     /FOO[xX]A.*B[xX]BAR/
 336
 337   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 338   strings (because they follow a .* construct). study_chunk will identify
 339   both FOO and BAR as being the longest fixed and floating strings respectively.
 340
 341   The strings can be composites, for instance
 342
 343      /(f)(o)(o)/
 344
 345   will result in a composite fixed substring 'foo'.
 346
 347   For each string some basic information is maintained:
 348
 349   - offset or min_offset
 350     This is the position the string must appear at, or not before.
 351     It also implicitly (when combined with minlenp) tells us how many
 352     characters must match before the string we are searching for.
 353     Likewise when combined with minlenp and the length of the string it
 354     tells us how many characters must appear after the string we have
 355     found.
 356
 357   - max_offset
 358     Only used for floating strings. This is the rightmost point that
 359     the string can appear at. If set to SSize_t_MAX it indicates that the
 360     string can occur infinitely far to the right.
 361
 362   - minlenp
 363     A pointer to the minimum number of characters of the pattern that the
 364     string was found inside. This is important as in the case of positive
 365     lookahead or positive lookbehind we can have multiple patterns
 366     involved. Consider
 367
 368     /(?=FOO).*F/
 369
 370     The minimum length of the pattern overall is 3, the minimum length
 371     of the lookahead part is 3, but the minimum length of the part that
 372     will actually match is 1. So 'FOO's minimum length is 3, but the
 373     minimum length for the F is 1. This is important as the minimum length
 374     is used to determine offsets in front of and behind the string being
 375     looked for.  Since strings can be composites this is the length of the
 376     pattern at the time it was committed with a scan_commit. Note that
 377     the length is calculated by study_chunk, so that the minimum lengths
 378     are not known until the full pattern has been compiled, thus the
 379     pointer to the value.
 380
 381   - lookbehind
 382
 383     In the case of lookbehind the string being searched for can be
 384     offset past the start point of the final matching string.
 385     If this value was just blithely removed from the min_offset it would
 386     invalidate some of the calculations for how many chars must match
 387     before or after (as they are derived from min_offset and minlen and
 388     the length of the string being searched for).
 389     When the final pattern is compiled and the data is moved from the
 390     scan_data_t structure into the regexp structure the information
 391     about lookbehind is factored in, with the information that would
 392     have been lost precalculated in the end_shift field for the
 393     associated string.
 394
 395   The fields pos_min and pos_delta are used to store the minimum offset
 396   and the delta to the maximum offset at the current point in the pattern.
 397
 398 */
 399
 400 typedef struct scan_data_t {
 401     /*I32 len_min;      unused */
 402     /*I32 len_delta;    unused */
 403     SSize_t pos_min;
 404     SSize_t pos_delta;
 405     SV *last_found;
 406     SSize_t last_end;       /* min value, <0 unless valid. */
 407     SSize_t last_start_min;
 408     SSize_t last_start_max;
 409     SV **longest;           /* Either &l_fixed, or &l_float. */
 410     SV *longest_fixed;      /* longest fixed string found in pattern */
 411     SSize_t offset_fixed;   /* offset where it starts */
 412     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 413     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 414     SV *longest_float;      /* longest floating string found in pattern */
 415     SSize_t offset_float_min; /* earliest point in string it can appear */
 416     SSize_t offset_float_max; /* latest point in string it can appear */
 417     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 418     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 419     I32 flags;
 420     I32 whilem_c;
 421     SSize_t *last_closep;
 422     regnode_ssc *start_class;
 423 } scan_data_t;
 424
 425 /*
 426  * Forward declarations for pregcomp()'s friends.
 427  */
 428
 429 static const scan_data_t zero_scan_data =
 430   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 431
 432 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 433 #define SF_BEFORE_SEOL          0x0001
 434 #define SF_BEFORE_MEOL          0x0002
 435 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 436 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 437
 438 #define SF_FIX_SHIFT_EOL        (+2)
 439 #define SF_FL_SHIFT_EOL         (+4)
 440
 441 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 442 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 443
 444 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 445 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 446 #define SF_IS_INF               0x0040
 447 #define SF_HAS_PAR              0x0080
 448 #define SF_IN_PAR               0x0100
 449 #define SF_HAS_EVAL             0x0200
 450 #define SCF_DO_SUBSTR           0x0400
 451 #define SCF_DO_STCLASS_AND      0x0800
 452 #define SCF_DO_STCLASS_OR       0x1000
 453 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 454 #define SCF_WHILEM_VISITED_POS  0x2000
 455
 456 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 457 #define SCF_SEEN_ACCEPT         0x8000
 458 #define SCF_TRIE_DOING_RESTUDY 0x10000
 459 #define SCF_IN_DEFINE          0x20000
 460
 461
 462
 463
 464 #define UTF cBOOL(RExC_utf8)
 465
 466 /* The enums for all these are ordered so things work out correctly */
 467 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 468 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 469                                                      == REGEX_DEPENDS_CHARSET)
 470 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 471 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 472                                                      >= REGEX_UNICODE_CHARSET)
 473 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 474                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 475 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 476                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 477 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 478                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 479
 480 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 481
 482 /* For programs that want to be strictly Unicode compatible by dying if any
 483  * attempt is made to match a non-Unicode code point against a Unicode
 484  * property.  */
 485 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 486
 487 #define OOB_NAMEDCLASS          -1
 488
 489 /* There is no code point that is out-of-bounds, so this is problematic.  But
 490  * its only current use is to initialize a variable that is always set before
 491  * looked at. */
 492 #define OOB_UNICODE             0xDEADBEEF
 493
 494 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 495 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 496
 497
 498 /* length of regex to show in messages that don't mark a position within */
 499 #define RegexLengthToShowInErrorMessages 127
 500
 501 /*
 502  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 503  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 504  * op/pragma/warn/regcomp.
 505  */
 506 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 507 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 508
 509 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 510                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 511
 512 #define REPORT_LOCATION_ARGS(offset)            \
 513                 UTF8fARG(UTF, offset, RExC_precomp), \
 514                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 515
 516 /*
 517  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 518  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 519  * "...".
 520  */
 521 #define _FAIL(code) STMT_START {                                        \
 522     const char *ellipses = "";                                          \
 523     IV len = RExC_end - RExC_precomp;                                   \
 524                                                                         \
 525     if (!SIZE_ONLY)                                                     \
 526         SAVEFREESV(RExC_rx_sv);                                         \
 527     if (len > RegexLengthToShowInErrorMessages) {                       \
 528         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 529         len = RegexLengthToShowInErrorMessages - 10;                    \
 530         ellipses = "...";                                               \
 531     }                                                                   \
 532     code;                                                               \
 533 } STMT_END
 534
 535 #define FAIL(msg) _FAIL(                            \
 536     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 537             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 538
 539 #define FAIL2(msg,arg) _FAIL(                       \
 540     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 541             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 542
 543 /*
 544  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 545  */
 546 #define Simple_vFAIL(m) STMT_START {                                    \
 547     const IV offset =                                                   \
 548         (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
 549     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 550             m, REPORT_LOCATION_ARGS(offset));   \
 551 } STMT_END
 552
 553 /*
 554  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 555  */
 556 #define vFAIL(m) STMT_START {                           \
 557     if (!SIZE_ONLY)                                     \
 558         SAVEFREESV(RExC_rx_sv);                         \
 559     Simple_vFAIL(m);                                    \
 560 } STMT_END
 561
 562 /*
 563  * Like Simple_vFAIL(), but accepts two arguments.
 564  */
 565 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 566     const IV offset = RExC_parse - RExC_precomp;                        \
 567     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 568                       REPORT_LOCATION_ARGS(offset));    \
 569 } STMT_END
 570
 571 /*
 572  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 573  */
 574 #define vFAIL2(m,a1) STMT_START {                       \
 575     if (!SIZE_ONLY)                                     \
 576         SAVEFREESV(RExC_rx_sv);                         \
 577     Simple_vFAIL2(m, a1);                               \
 578 } STMT_END
 579
 580
 581 /*
 582  * Like Simple_vFAIL(), but accepts three arguments.
 583  */
 584 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 585     const IV offset = RExC_parse - RExC_precomp;                \
 586     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 587             REPORT_LOCATION_ARGS(offset));      \
 588 } STMT_END
 589
 590 /*
 591  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 592  */
 593 #define vFAIL3(m,a1,a2) STMT_START {                    \
 594     if (!SIZE_ONLY)                                     \
 595         SAVEFREESV(RExC_rx_sv);                         \
 596     Simple_vFAIL3(m, a1, a2);                           \
 597 } STMT_END
 598
 599 /*
 600  * Like Simple_vFAIL(), but accepts four arguments.
 601  */
 602 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 603     const IV offset = RExC_parse - RExC_precomp;                \
 604     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 605             REPORT_LOCATION_ARGS(offset));      \
 606 } STMT_END
 607
 608 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 609     if (!SIZE_ONLY)                                     \
 610         SAVEFREESV(RExC_rx_sv);                         \
 611     Simple_vFAIL4(m, a1, a2, a3);                       \
 612 } STMT_END
 613
 614 /* A specialized version of vFAIL2 that works with UTF8f */
 615 #define vFAIL2utf8f(m, a1) STMT_START { \
 616     const IV offset = RExC_parse - RExC_precomp;   \
 617     if (!SIZE_ONLY)                                \
 618         SAVEFREESV(RExC_rx_sv);                    \
 619     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 620             REPORT_LOCATION_ARGS(offset));         \
 621 } STMT_END
 622
 623 /* These have asserts in them because of [perl #122671] Many warnings in
 624  * regcomp.c can occur twice.  If they get output in pass1 and later in that
 625  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
 626  * would get output again.  So they should be output in pass2, and these
 627  * asserts make sure new warnings follow that paradigm. */
 628
 629 /* m is not necessarily a "literal string", in this macro */
 630 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 631     const IV offset = loc - RExC_precomp;                               \
 632     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 633             m, REPORT_LOCATION_ARGS(offset));       \
 634 } STMT_END
 635
 636 #define ckWARNreg(loc,m) STMT_START {                                   \
 637     const IV offset = loc - RExC_precomp;                               \
 638     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 639             REPORT_LOCATION_ARGS(offset));              \
 640 } STMT_END
 641
 642 #define vWARN_dep(loc, m) STMT_START {                                  \
 643     const IV offset = loc - RExC_precomp;                               \
 644     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,    \
 645             REPORT_LOCATION_ARGS(offset));              \
 646 } STMT_END
 647
 648 #define ckWARNdep(loc,m) STMT_START {                                   \
 649     const IV offset = loc - RExC_precomp;                               \
 650     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                  \
 651             m REPORT_LOCATION,                                          \
 652             REPORT_LOCATION_ARGS(offset));              \
 653 } STMT_END
 654
 655 #define ckWARNregdep(loc,m) STMT_START {                                \
 656     const IV offset = loc - RExC_precomp;                               \
 657     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),    \
 658             m REPORT_LOCATION,                                          \
 659             REPORT_LOCATION_ARGS(offset));              \
 660 } STMT_END
 661
 662 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 663     const IV offset = loc - RExC_precomp;                               \
 664     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                      \
 665             m REPORT_LOCATION,                                          \
 666             a1, REPORT_LOCATION_ARGS(offset));  \
 667 } STMT_END
 668
 669 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 670     const IV offset = loc - RExC_precomp;                               \
 671     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 672             a1, REPORT_LOCATION_ARGS(offset));  \
 673 } STMT_END
 674
 675 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 676     const IV offset = loc - RExC_precomp;                               \
 677     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 678             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 679 } STMT_END
 680
 681 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 682     const IV offset = loc - RExC_precomp;                               \
 683     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 684             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 685 } STMT_END
 686
 687 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 688     const IV offset = loc - RExC_precomp;                               \
 689     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 690             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 691 } STMT_END
 692
 693 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 694     const IV offset = loc - RExC_precomp;                               \
 695     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
 696             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 697 } STMT_END
 698
 699 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 700     const IV offset = loc - RExC_precomp;                               \
 701     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
 702             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 703 } STMT_END
 704
 705
 706 /* Allow for side effects in s */
 707 #define REGC(c,s) STMT_START {                  \
 708     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 709 } STMT_END
 710
 711 /* Macros for recording node offsets.   20001227 mjd@plover.com
 712  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 713  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 714  * Element 0 holds the number n.
 715  * Position is 1 indexed.
 716  */
 717 #ifndef RE_TRACK_PATTERN_OFFSETS
 718 #define Set_Node_Offset_To_R(node,byte)
 719 #define Set_Node_Offset(node,byte)
 720 #define Set_Cur_Node_Offset
 721 #define Set_Node_Length_To_R(node,len)
 722 #define Set_Node_Length(node,len)
 723 #define Set_Node_Cur_Length(node,start)
 724 #define Node_Offset(n)
 725 #define Node_Length(n)
 726 #define Set_Node_Offset_Length(node,offset,len)
 727 #define ProgLen(ri) ri->u.proglen
 728 #define SetProgLen(ri,x) ri->u.proglen = x
 729 #else
 730 #define ProgLen(ri) ri->u.offsets[0]
 731 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 732 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 733     if (! SIZE_ONLY) {                                                  \
 734         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 735                     __LINE__, (int)(node), (int)(byte)));               \
 736         if((node) < 0) {                                                \
 737             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 738                                          (int)(node));                  \
 739         } else {                                                        \
 740             RExC_offsets[2*(node)-1] = (byte);                          \
 741         }                                                               \
 742     }                                                                   \
 743 } STMT_END
 744
 745 #define Set_Node_Offset(node,byte) \
 746     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 747 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 748
 749 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 750     if (! SIZE_ONLY) {                                                  \
 751         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 752                 __LINE__, (int)(node), (int)(len)));                    \
 753         if((node) < 0) {                                                \
 754             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 755                                          (int)(node));                  \
 756         } else {                                                        \
 757             RExC_offsets[2*(node)] = (len);                             \
 758         }                                                               \
 759     }                                                                   \
 760 } STMT_END
 761
 762 #define Set_Node_Length(node,len) \
 763     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 764 #define Set_Node_Cur_Length(node, start)                \
 765     Set_Node_Length(node, RExC_parse - start)
 766
 767 /* Get offsets and lengths */
 768 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 769 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 770
 771 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 772     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 773     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 774 } STMT_END
 775 #endif
 776
 777 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 778 #define EXPERIMENTAL_INPLACESCAN
 779 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 780
 781 #define DEBUG_RExC_seen() \
 782         DEBUG_OPTIMISE_MORE_r({                                             \
 783             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 784                                                                             \
 785             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 786                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 787                                                                             \
 788             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 789                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 790                                                                             \
 791             if (RExC_seen & REG_GPOS_SEEN)                                  \
 792                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 793                                                                             \
 794             if (RExC_seen & REG_CANY_SEEN)                                  \
 795                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 796                                                                             \
 797             if (RExC_seen & REG_RECURSE_SEEN)                               \
 798                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 799                                                                             \
 800             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 801                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 802                                                                             \
 803             if (RExC_seen & REG_VERBARG_SEEN)                               \
 804                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 805                                                                             \
 806             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 807                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 808                                                                             \
 809             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 810                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 811                                                                             \
 812             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 813                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 814                                                                             \
 815             if (RExC_seen & REG_GOSTART_SEEN)                               \
 816                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 817                                                                             \
 818             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 819                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 820                                                                             \
 821             PerlIO_printf(Perl_debug_log,"\n");                             \
 822         });
 823
 824 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 825   if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
 826
 827 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str)                    \
 828     if ( ( flags ) ) {                                                      \
 829         PerlIO_printf(Perl_debug_log, "%s", open_str);                      \
 830         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL);                     \
 831         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL);                     \
 832         DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF);                             \
 833         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR);                            \
 834         DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR);                             \
 835         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL);                           \
 836         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR);                         \
 837         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND);                    \
 838         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR);                     \
 839         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS);                        \
 840         DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS);                \
 841         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY);                      \
 842         DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT);                       \
 843         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY);                \
 844         DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE);                         \
 845         PerlIO_printf(Perl_debug_log, "%s", close_str);                     \
 846     }
 847
 848
 849 #define DEBUG_STUDYDATA(str,data,depth)                              \
 850 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 851     PerlIO_printf(Perl_debug_log,                                    \
 852         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 853         " Flags: 0x%"UVXf,                                           \
 854         (int)(depth)*2, "",                                          \
 855         (IV)((data)->pos_min),                                       \
 856         (IV)((data)->pos_delta),                                     \
 857         (UV)((data)->flags)                                          \
 858     );                                                               \
 859     DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]");                 \
 860     PerlIO_printf(Perl_debug_log,                                    \
 861         " Whilem_c: %"IVdf" Lcp: %"IVdf" %s",                        \
 862         (IV)((data)->whilem_c),                                      \
 863         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 864         is_inf ? "INF " : ""                                         \
 865     );                                                               \
 866     if ((data)->last_found)                                          \
 867         PerlIO_printf(Perl_debug_log,                                \
 868             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 869             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 870             SvPVX_const((data)->last_found),                         \
 871             (IV)((data)->last_end),                                  \
 872             (IV)((data)->last_start_min),                            \
 873             (IV)((data)->last_start_max),                            \
 874             ((data)->longest &&                                      \
 875              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 876             SvPVX_const((data)->longest_fixed),                      \
 877             (IV)((data)->offset_fixed),                              \
 878             ((data)->longest &&                                      \
 879              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 880             SvPVX_const((data)->longest_float),                      \
 881             (IV)((data)->offset_float_min),                          \
 882             (IV)((data)->offset_float_max)                           \
 883         );                                                           \
 884     PerlIO_printf(Perl_debug_log,"\n");                              \
 885 });
 886
 887 #ifdef DEBUGGING
 888
 889 /* is c a control character for which we have a mnemonic? */
 890 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 891
 892 STATIC const char *
 893 S_cntrl_to_mnemonic(const U8 c)
 894 {
 895     /* Returns the mnemonic string that represents character 'c', if one
 896      * exists; NULL otherwise.  The only ones that exist for the purposes of
 897      * this routine are a few control characters */
 898
 899     switch (c) {
 900         case '\a':       return "\\a";
 901         case '\b':       return "\\b";
 902         case ESC_NATIVE: return "\\e";
 903         case '\f':       return "\\f";
 904         case '\n':       return "\\n";
 905         case '\r':       return "\\r";
 906         case '\t':       return "\\t";
 907     }
 908
 909     return NULL;
 910 }
 911
 912 #endif
 913
 914 /* Mark that we cannot extend a found fixed substring at this point.
 915    Update the longest found anchored substring and the longest found
 916    floating substrings if needed. */
 917
 918 STATIC void
 919 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 920                     SSize_t *minlenp, int is_inf)
 921 {
 922     const STRLEN l = CHR_SVLEN(data->last_found);
 923     const STRLEN old_l = CHR_SVLEN(*data->longest);
 924     GET_RE_DEBUG_FLAGS_DECL;
 925
 926     PERL_ARGS_ASSERT_SCAN_COMMIT;
 927
 928     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 929         SvSetMagicSV(*data->longest, data->last_found);
 930         if (*data->longest == data->longest_fixed) {
 931             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 932             if (data->flags & SF_BEFORE_EOL)
 933                 data->flags
 934                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 935             else
 936                 data->flags &= ~SF_FIX_BEFORE_EOL;
 937             data->minlen_fixed=minlenp;
 938             data->lookbehind_fixed=0;
 939         }
 940         else { /* *data->longest == data->longest_float */
 941             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 942             data->offset_float_max = (l
 943                           ? data->last_start_max
 944                           : (data->pos_delta > SSize_t_MAX - data->pos_min
 945                                          ? SSize_t_MAX
 946                                          : data->pos_min + data->pos_delta));
 947             if (is_inf
 948                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 949                 data->offset_float_max = SSize_t_MAX;
 950             if (data->flags & SF_BEFORE_EOL)
 951                 data->flags
 952                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 953             else
 954                 data->flags &= ~SF_FL_BEFORE_EOL;
 955             data->minlen_float=minlenp;
 956             data->lookbehind_float=0;
 957         }
 958     }
 959     SvCUR_set(data->last_found, 0);
 960     {
 961         SV * const sv = data->last_found;
 962         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 963             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 964             if (mg)
 965                 mg->mg_len = 0;
 966         }
 967     }
 968     data->last_end = -1;
 969     data->flags &= ~SF_BEFORE_EOL;
 970     DEBUG_STUDYDATA("commit: ",data,0);
 971 }
 972
 973 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 974  * list that describes which code points it matches */
 975
 976 STATIC void
 977 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 978 {
 979     /* Set the SSC 'ssc' to match an empty string or any code point */
 980
 981     PERL_ARGS_ASSERT_SSC_ANYTHING;
 982
 983     assert(is_ANYOF_SYNTHETIC(ssc));
 984
 985     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 986     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 987     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
 988 }
 989
 990 STATIC int
 991 S_ssc_is_anything(const regnode_ssc *ssc)
 992 {
 993     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 994      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 995      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 996      * in any way, so there's no point in using it */
 997
 998     UV start, end;
 999     bool ret;
1000
1001     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1002
1003     assert(is_ANYOF_SYNTHETIC(ssc));
1004
1005     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1006         return FALSE;
1007     }
1008
1009     /* See if the list consists solely of the range 0 - Infinity */
1010     invlist_iterinit(ssc->invlist);
1011     ret = invlist_iternext(ssc->invlist, &start, &end)
1012           && start == 0
1013           && end == UV_MAX;
1014
1015     invlist_iterfinish(ssc->invlist);
1016
1017     if (ret) {
1018         return TRUE;
1019     }
1020
1021     /* If e.g., both \w and \W are set, matches everything */
1022     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1023         int i;
1024         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1025             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1026                 return TRUE;
1027             }
1028         }
1029     }
1030
1031     return FALSE;
1032 }
1033
1034 STATIC void
1035 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1036 {
1037     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1038      * string, any code point, or any posix class under locale */
1039
1040     PERL_ARGS_ASSERT_SSC_INIT;
1041
1042     Zero(ssc, 1, regnode_ssc);
1043     set_ANYOF_SYNTHETIC(ssc);
1044     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1045     ssc_anything(ssc);
1046
1047     /* If any portion of the regex is to operate under locale rules that aren't
1048      * fully known at compile time, initialization includes it.  The reason
1049      * this isn't done for all regexes is that the optimizer was written under
1050      * the assumption that locale was all-or-nothing.  Given the complexity and
1051      * lack of documentation in the optimizer, and that there are inadequate
1052      * test cases for locale, many parts of it may not work properly, it is
1053      * safest to avoid locale unless necessary. */
1054     if (RExC_contains_locale) {
1055         ANYOF_POSIXL_SETALL(ssc);
1056     }
1057     else {
1058         ANYOF_POSIXL_ZERO(ssc);
1059     }
1060 }
1061
1062 STATIC int
1063 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1064                         const regnode_ssc *ssc)
1065 {
1066     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1067      * to the list of code points matched, and locale posix classes; hence does
1068      * not check its flags) */
1069
1070     UV start, end;
1071     bool ret;
1072
1073     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1074
1075     assert(is_ANYOF_SYNTHETIC(ssc));
1076
1077     invlist_iterinit(ssc->invlist);
1078     ret = invlist_iternext(ssc->invlist, &start, &end)
1079           && start == 0
1080           && end == UV_MAX;
1081
1082     invlist_iterfinish(ssc->invlist);
1083
1084     if (! ret) {
1085         return FALSE;
1086     }
1087
1088     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1089         return FALSE;
1090     }
1091
1092     return TRUE;
1093 }
1094
1095 STATIC SV*
1096 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1097                                const regnode_charclass* const node)
1098 {
1099     /* Returns a mortal inversion list defining which code points are matched
1100      * by 'node', which is of type ANYOF.  Handles complementing the result if
1101      * appropriate.  If some code points aren't knowable at this time, the
1102      * returned list must, and will, contain every code point that is a
1103      * possibility. */
1104
1105     SV* invlist = sv_2mortal(_new_invlist(0));
1106     SV* only_utf8_locale_invlist = NULL;
1107     unsigned int i;
1108     const U32 n = ARG(node);
1109     bool new_node_has_latin1 = FALSE;
1110
1111     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1112
1113     /* Look at the data structure created by S_set_ANYOF_arg() */
1114     if (n != ANYOF_ONLY_HAS_BITMAP) {
1115         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1116         AV * const av = MUTABLE_AV(SvRV(rv));
1117         SV **const ary = AvARRAY(av);
1118         assert(RExC_rxi->data->what[n] == 's');
1119
1120         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1121             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1122         }
1123         else if (ary[0] && ary[0] != &PL_sv_undef) {
1124
1125             /* Here, no compile-time swash, and there are things that won't be
1126              * known until runtime -- we have to assume it could be anything */
1127             return _add_range_to_invlist(invlist, 0, UV_MAX);
1128         }
1129         else if (ary[3] && ary[3] != &PL_sv_undef) {
1130
1131             /* Here no compile-time swash, and no run-time only data.  Use the
1132              * node's inversion list */
1133             invlist = sv_2mortal(invlist_clone(ary[3]));
1134         }
1135
1136         /* Get the code points valid only under UTF-8 locales */
1137         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1138             && ary[2] && ary[2] != &PL_sv_undef)
1139         {
1140             only_utf8_locale_invlist = ary[2];
1141         }
1142     }
1143
1144     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1145      * code points, and an inversion list for the others, but if there are code
1146      * points that should match only conditionally on the target string being
1147      * UTF-8, those are placed in the inversion list, and not the bitmap.
1148      * Since there are circumstances under which they could match, they are
1149      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1150      * to exclude them here, so that when we invert below, the end result
1151      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1152      * have to do this here before we add the unconditionally matched code
1153      * points */
1154     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1155         _invlist_intersection_complement_2nd(invlist,
1156                                              PL_UpperLatin1,
1157                                              &invlist);
1158     }
1159
1160     /* Add in the points from the bit map */
1161     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1162         if (ANYOF_BITMAP_TEST(node, i)) {
1163             invlist = add_cp_to_invlist(invlist, i);
1164             new_node_has_latin1 = TRUE;
1165         }
1166     }
1167
1168     /* If this can match all upper Latin1 code points, have to add them
1169      * as well */
1170     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1171         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1172     }
1173
1174     /* Similarly for these */
1175     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1176         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1177     }
1178
1179     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1180         _invlist_invert(invlist);
1181     }
1182     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1183
1184         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1185          * locale.  We can skip this if there are no 0-255 at all. */
1186         _invlist_union(invlist, PL_Latin1, &invlist);
1187     }
1188
1189     /* Similarly add the UTF-8 locale possible matches.  These have to be
1190      * deferred until after the non-UTF-8 locale ones are taken care of just
1191      * above, or it leads to wrong results under ANYOF_INVERT */
1192     if (only_utf8_locale_invlist) {
1193         _invlist_union_maybe_complement_2nd(invlist,
1194                                             only_utf8_locale_invlist,
1195                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1196                                             &invlist);
1197     }
1198
1199     return invlist;
1200 }
1201
1202 /* These two functions currently do the exact same thing */
1203 #define ssc_init_zero           ssc_init
1204
1205 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1206 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1207
1208 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1209  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1210  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1211
1212 STATIC void
1213 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1214                 const regnode_charclass *and_with)
1215 {
1216     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1217      * another SSC or a regular ANYOF class.  Can create false positives. */
1218
1219     SV* anded_cp_list;
1220     U8  anded_flags;
1221
1222     PERL_ARGS_ASSERT_SSC_AND;
1223
1224     assert(is_ANYOF_SYNTHETIC(ssc));
1225
1226     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1227      * the code point inversion list and just the relevant flags */
1228     if (is_ANYOF_SYNTHETIC(and_with)) {
1229         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1230         anded_flags = ANYOF_FLAGS(and_with);
1231
1232         /* XXX This is a kludge around what appears to be deficiencies in the
1233          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1234          * there are paths through the optimizer where it doesn't get weeded
1235          * out when it should.  And if we don't make some extra provision for
1236          * it like the code just below, it doesn't get added when it should.
1237          * This solution is to add it only when AND'ing, which is here, and
1238          * only when what is being AND'ed is the pristine, original node
1239          * matching anything.  Thus it is like adding it to ssc_anything() but
1240          * only when the result is to be AND'ed.  Probably the same solution
1241          * could be adopted for the same problem we have with /l matching,
1242          * which is solved differently in S_ssc_init(), and that would lead to
1243          * fewer false positives than that solution has.  But if this solution
1244          * creates bugs, the consequences are only that a warning isn't raised
1245          * that should be; while the consequences for having /l bugs is
1246          * incorrect matches */
1247         if (ssc_is_anything((regnode_ssc *)and_with)) {
1248             anded_flags |= ANYOF_WARN_SUPER;
1249         }
1250     }
1251     else {
1252         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1253         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1254     }
1255
1256     ANYOF_FLAGS(ssc) &= anded_flags;
1257
1258     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1259      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1260      * 'and_with' may be inverted.  When not inverted, we have the situation of
1261      * computing:
1262      *  (C1 | P1) & (C2 | P2)
1263      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1264      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1265      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1266      *                    <=  ((C1 & C2) | P1 | P2)
1267      * Alternatively, the last few steps could be:
1268      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1269      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1270      *                    <=  (C1 | C2 | (P1 & P2))
1271      * We favor the second approach if either P1 or P2 is non-empty.  This is
1272      * because these components are a barrier to doing optimizations, as what
1273      * they match cannot be known until the moment of matching as they are
1274      * dependent on the current locale, 'AND"ing them likely will reduce or
1275      * eliminate them.
1276      * But we can do better if we know that C1,P1 are in their initial state (a
1277      * frequent occurrence), each matching everything:
1278      *  (<everything>) & (C2 | P2) =  C2 | P2
1279      * Similarly, if C2,P2 are in their initial state (again a frequent
1280      * occurrence), the result is a no-op
1281      *  (C1 | P1) & (<everything>) =  C1 | P1
1282      *
1283      * Inverted, we have
1284      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1285      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1286      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1287      * */
1288
1289     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1290         && ! is_ANYOF_SYNTHETIC(and_with))
1291     {
1292         unsigned int i;
1293
1294         ssc_intersection(ssc,
1295                          anded_cp_list,
1296                          FALSE /* Has already been inverted */
1297                          );
1298
1299         /* If either P1 or P2 is empty, the intersection will be also; can skip
1300          * the loop */
1301         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1302             ANYOF_POSIXL_ZERO(ssc);
1303         }
1304         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1305
1306             /* Note that the Posix class component P from 'and_with' actually
1307              * looks like:
1308              *      P = Pa | Pb | ... | Pn
1309              * where each component is one posix class, such as in [\w\s].
1310              * Thus
1311              *      ~P = ~(Pa | Pb | ... | Pn)
1312              *         = ~Pa & ~Pb & ... & ~Pn
1313              *        <= ~Pa | ~Pb | ... | ~Pn
1314              * The last is something we can easily calculate, but unfortunately
1315              * is likely to have many false positives.  We could do better
1316              * in some (but certainly not all) instances if two classes in
1317              * P have known relationships.  For example
1318              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1319              * So
1320              *      :lower: & :print: = :lower:
1321              * And similarly for classes that must be disjoint.  For example,
1322              * since \s and \w can have no elements in common based on rules in
1323              * the POSIX standard,
1324              *      \w & ^\S = nothing
1325              * Unfortunately, some vendor locales do not meet the Posix
1326              * standard, in particular almost everything by Microsoft.
1327              * The loop below just changes e.g., \w into \W and vice versa */
1328
1329             regnode_charclass_posixl temp;
1330             int add = 1;    /* To calculate the index of the complement */
1331
1332             ANYOF_POSIXL_ZERO(&temp);
1333             for (i = 0; i < ANYOF_MAX; i++) {
1334                 assert(i % 2 != 0
1335                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1336                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1337
1338                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1339                     ANYOF_POSIXL_SET(&temp, i + add);
1340                 }
1341                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1342             }
1343             ANYOF_POSIXL_AND(&temp, ssc);
1344
1345         } /* else ssc already has no posixes */
1346     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1347          in its initial state */
1348     else if (! is_ANYOF_SYNTHETIC(and_with)
1349              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1350     {
1351         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1352          * copy it over 'ssc' */
1353         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1354             if (is_ANYOF_SYNTHETIC(and_with)) {
1355                 StructCopy(and_with, ssc, regnode_ssc);
1356             }
1357             else {
1358                 ssc->invlist = anded_cp_list;
1359                 ANYOF_POSIXL_ZERO(ssc);
1360                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1361                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1362                 }
1363             }
1364         }
1365         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1366                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1367         {
1368             /* One or the other of P1, P2 is non-empty. */
1369             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1370                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1371             }
1372             ssc_union(ssc, anded_cp_list, FALSE);
1373         }
1374         else { /* P1 = P2 = empty */
1375             ssc_intersection(ssc, anded_cp_list, FALSE);
1376         }
1377     }
1378 }
1379
1380 STATIC void
1381 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1382                const regnode_charclass *or_with)
1383 {
1384     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1385      * another SSC or a regular ANYOF class.  Can create false positives if
1386      * 'or_with' is to be inverted. */
1387
1388     SV* ored_cp_list;
1389     U8 ored_flags;
1390
1391     PERL_ARGS_ASSERT_SSC_OR;
1392
1393     assert(is_ANYOF_SYNTHETIC(ssc));
1394
1395     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1396      * the code point inversion list and just the relevant flags */
1397     if (is_ANYOF_SYNTHETIC(or_with)) {
1398         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1399         ored_flags = ANYOF_FLAGS(or_with);
1400     }
1401     else {
1402         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1403         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1404     }
1405
1406     ANYOF_FLAGS(ssc) |= ored_flags;
1407
1408     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1409      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1410      * 'or_with' may be inverted.  When not inverted, we have the simple
1411      * situation of computing:
1412      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1413      * If P1|P2 yields a situation with both a class and its complement are
1414      * set, like having both \w and \W, this matches all code points, and we
1415      * can delete these from the P component of the ssc going forward.  XXX We
1416      * might be able to delete all the P components, but I (khw) am not certain
1417      * about this, and it is better to be safe.
1418      *
1419      * Inverted, we have
1420      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1421      *                         <=  (C1 | P1) | ~C2
1422      *                         <=  (C1 | ~C2) | P1
1423      * (which results in actually simpler code than the non-inverted case)
1424      * */
1425
1426     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1427         && ! is_ANYOF_SYNTHETIC(or_with))
1428     {
1429         /* We ignore P2, leaving P1 going forward */
1430     }   /* else  Not inverted */
1431     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1432         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1433         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1434             unsigned int i;
1435             for (i = 0; i < ANYOF_MAX; i += 2) {
1436                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1437                 {
1438                     ssc_match_all_cp(ssc);
1439                     ANYOF_POSIXL_CLEAR(ssc, i);
1440                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1441                 }
1442             }
1443         }
1444     }
1445
1446     ssc_union(ssc,
1447               ored_cp_list,
1448               FALSE /* Already has been inverted */
1449               );
1450 }
1451
1452 PERL_STATIC_INLINE void
1453 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1454 {
1455     PERL_ARGS_ASSERT_SSC_UNION;
1456
1457     assert(is_ANYOF_SYNTHETIC(ssc));
1458
1459     _invlist_union_maybe_complement_2nd(ssc->invlist,
1460                                         invlist,
1461                                         invert2nd,
1462                                         &ssc->invlist);
1463 }
1464
1465 PERL_STATIC_INLINE void
1466 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1467                          SV* const invlist,
1468                          const bool invert2nd)
1469 {
1470     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1471
1472     assert(is_ANYOF_SYNTHETIC(ssc));
1473
1474     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1475                                                invlist,
1476                                                invert2nd,
1477                                                &ssc->invlist);
1478 }
1479
1480 PERL_STATIC_INLINE void
1481 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1482 {
1483     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1484
1485     assert(is_ANYOF_SYNTHETIC(ssc));
1486
1487     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1488 }
1489
1490 PERL_STATIC_INLINE void
1491 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1492 {
1493     /* AND just the single code point 'cp' into the SSC 'ssc' */
1494
1495     SV* cp_list = _new_invlist(2);
1496
1497     PERL_ARGS_ASSERT_SSC_CP_AND;
1498
1499     assert(is_ANYOF_SYNTHETIC(ssc));
1500
1501     cp_list = add_cp_to_invlist(cp_list, cp);
1502     ssc_intersection(ssc, cp_list,
1503                      FALSE /* Not inverted */
1504                      );
1505     SvREFCNT_dec_NN(cp_list);
1506 }
1507
1508 PERL_STATIC_INLINE void
1509 S_ssc_clear_locale(regnode_ssc *ssc)
1510 {
1511     /* Set the SSC 'ssc' to not match any locale things */
1512     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1513
1514     assert(is_ANYOF_SYNTHETIC(ssc));
1515
1516     ANYOF_POSIXL_ZERO(ssc);
1517     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1518 }
1519
1520 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1521
1522 STATIC bool
1523 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1524 {
1525     /* The synthetic start class is used to hopefully quickly winnow down
1526      * places where a pattern could start a match in the target string.  If it
1527      * doesn't really narrow things down that much, there isn't much point to
1528      * having the overhead of using it.  This function uses some very crude
1529      * heuristics to decide if to use the ssc or not.
1530      *
1531      * It returns TRUE if 'ssc' rules out more than half what it considers to
1532      * be the "likely" possible matches, but of course it doesn't know what the
1533      * actual things being matched are going to be; these are only guesses
1534      *
1535      * For /l matches, it assumes that the only likely matches are going to be
1536      *      in the 0-255 range, uniformly distributed, so half of that is 127
1537      * For /a and /d matches, it assumes that the likely matches will be just
1538      *      the ASCII range, so half of that is 63
1539      * For /u and there isn't anything matching above the Latin1 range, it
1540      *      assumes that that is the only range likely to be matched, and uses
1541      *      half that as the cut-off: 127.  If anything matches above Latin1,
1542      *      it assumes that all of Unicode could match (uniformly), except for
1543      *      non-Unicode code points and things in the General Category "Other"
1544      *      (unassigned, private use, surrogates, controls and formats).  This
1545      *      is a much large number. */
1546
1547     const U32 max_match = (LOC)
1548                           ? 127
1549                           : (! UNI_SEMANTICS)
1550                             ? 63
1551                             : (invlist_highest(ssc->invlist) < 256)
1552                               ? 127
1553                               : ((NON_OTHER_COUNT + 1) / 2) - 1;
1554     U32 count = 0;      /* Running total of number of code points matched by
1555                            'ssc' */
1556     UV start, end;      /* Start and end points of current range in inversion
1557                            list */
1558
1559     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1560
1561     invlist_iterinit(ssc->invlist);
1562     while (invlist_iternext(ssc->invlist, &start, &end)) {
1563
1564         /* /u is the only thing that we expect to match above 255; so if not /u
1565          * and even if there are matches above 255, ignore them.  This catches
1566          * things like \d under /d which does match the digits above 255, but
1567          * since the pattern is /d, it is not likely to be expecting them */
1568         if (! UNI_SEMANTICS) {
1569             if (start > 255) {
1570                 break;
1571             }
1572             end = MIN(end, 255);
1573         }
1574         count += end - start + 1;
1575         if (count > max_match) {
1576             invlist_iterfinish(ssc->invlist);
1577             return FALSE;
1578         }
1579     }
1580
1581     return TRUE;
1582 }
1583
1584
1585 STATIC void
1586 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1587 {
1588     /* The inversion list in the SSC is marked mortal; now we need a more
1589      * permanent copy, which is stored the same way that is done in a regular
1590      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1591      * map */
1592
1593     SV* invlist = invlist_clone(ssc->invlist);
1594
1595     PERL_ARGS_ASSERT_SSC_FINALIZE;
1596
1597     assert(is_ANYOF_SYNTHETIC(ssc));
1598
1599     /* The code in this file assumes that all but these flags aren't relevant
1600      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1601      * by the time we reach here */
1602     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1603
1604     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1605
1606     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1607                                 NULL, NULL, NULL, FALSE);
1608
1609     /* Make sure is clone-safe */
1610     ssc->invlist = NULL;
1611
1612     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1613         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1614     }
1615
1616     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1617 }
1618
1619 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1620 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1621 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1622 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1623                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1624                                : 0 )
1625
1626
1627 #ifdef DEBUGGING
1628 /*
1629    dump_trie(trie,widecharmap,revcharmap)
1630    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1631    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1632
1633    These routines dump out a trie in a somewhat readable format.
1634    The _interim_ variants are used for debugging the interim
1635    tables that are used to generate the final compressed
1636    representation which is what dump_trie expects.
1637
1638    Part of the reason for their existence is to provide a form
1639    of documentation as to how the different representations function.
1640
1641 */
1642
1643 /*
1644   Dumps the final compressed table form of the trie to Perl_debug_log.
1645   Used for debugging make_trie().
1646 */
1647
1648 STATIC void
1649 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1650             AV *revcharmap, U32 depth)
1651 {
1652     U32 state;
1653     SV *sv=sv_newmortal();
1654     int colwidth= widecharmap ? 6 : 4;
1655     U16 word;
1656     GET_RE_DEBUG_FLAGS_DECL;
1657
1658     PERL_ARGS_ASSERT_DUMP_TRIE;
1659
1660     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1661         (int)depth * 2 + 2,"",
1662         "Match","Base","Ofs" );
1663
1664     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1665         SV ** const tmp = av_fetch( revcharmap, state, 0);
1666         if ( tmp ) {
1667             PerlIO_printf( Perl_debug_log, "%*s",
1668                 colwidth,
1669                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1670                             PL_colors[0], PL_colors[1],
1671                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1672                             PERL_PV_ESCAPE_FIRSTCHAR
1673                 )
1674             );
1675         }
1676     }
1677     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1678         (int)depth * 2 + 2,"");
1679
1680     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1681         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1682     PerlIO_printf( Perl_debug_log, "\n");
1683
1684     for( state = 1 ; state < trie->statecount ; state++ ) {
1685         const U32 base = trie->states[ state ].trans.base;
1686
1687         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1688                                        (int)depth * 2 + 2,"", (UV)state);
1689
1690         if ( trie->states[ state ].wordnum ) {
1691             PerlIO_printf( Perl_debug_log, " W%4X",
1692                                            trie->states[ state ].wordnum );
1693         } else {
1694             PerlIO_printf( Perl_debug_log, "%6s", "" );
1695         }
1696
1697         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1698
1699         if ( base ) {
1700             U32 ofs = 0;
1701
1702             while( ( base + ofs  < trie->uniquecharcount ) ||
1703                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1704                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1705                                                                     != state))
1706                     ofs++;
1707
1708             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1709
1710             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1711                 if ( ( base + ofs >= trie->uniquecharcount )
1712                         && ( base + ofs - trie->uniquecharcount
1713                                                         < trie->lasttrans )
1714                         && trie->trans[ base + ofs
1715                                     - trie->uniquecharcount ].check == state )
1716                 {
1717                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1718                     colwidth,
1719                     (UV)trie->trans[ base + ofs
1720                                              - trie->uniquecharcount ].next );
1721                 } else {
1722                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1723                 }
1724             }
1725
1726             PerlIO_printf( Perl_debug_log, "]");
1727
1728         }
1729         PerlIO_printf( Perl_debug_log, "\n" );
1730     }
1731     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1732                                 (int)depth*2, "");
1733     for (word=1; word <= trie->wordcount; word++) {
1734         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1735             (int)word, (int)(trie->wordinfo[word].prev),
1736             (int)(trie->wordinfo[word].len));
1737     }
1738     PerlIO_printf(Perl_debug_log, "\n" );
1739 }
1740 /*
1741   Dumps a fully constructed but uncompressed trie in list form.
1742   List tries normally only are used for construction when the number of
1743   possible chars (trie->uniquecharcount) is very high.
1744   Used for debugging make_trie().
1745 */
1746 STATIC void
1747 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1748                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1749                          U32 depth)
1750 {
1751     U32 state;
1752     SV *sv=sv_newmortal();
1753     int colwidth= widecharmap ? 6 : 4;
1754     GET_RE_DEBUG_FLAGS_DECL;
1755
1756     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1757
1758     /* print out the table precompression.  */
1759     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1760         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1761         "------:-----+-----------------\n" );
1762
1763     for( state=1 ; state < next_alloc ; state ++ ) {
1764         U16 charid;
1765
1766         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1767             (int)depth * 2 + 2,"", (UV)state  );
1768         if ( ! trie->states[ state ].wordnum ) {
1769             PerlIO_printf( Perl_debug_log, "%5s| ","");
1770         } else {
1771             PerlIO_printf( Perl_debug_log, "W%4x| ",
1772                 trie->states[ state ].wordnum
1773             );
1774         }
1775         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1776             SV ** const tmp = av_fetch( revcharmap,
1777                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1778             if ( tmp ) {
1779                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1780                     colwidth,
1781                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1782                               colwidth,
1783                               PL_colors[0], PL_colors[1],
1784                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1785                               | PERL_PV_ESCAPE_FIRSTCHAR
1786                     ) ,
1787                     TRIE_LIST_ITEM(state,charid).forid,
1788                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1789                 );
1790                 if (!(charid % 10))
1791                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1792                         (int)((depth * 2) + 14), "");
1793             }
1794         }
1795         PerlIO_printf( Perl_debug_log, "\n");
1796     }
1797 }
1798
1799 /*
1800   Dumps a fully constructed but uncompressed trie in table form.
1801   This is the normal DFA style state transition table, with a few
1802   twists to facilitate compression later.
1803   Used for debugging make_trie().
1804 */
1805 STATIC void
1806 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1807                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1808                           U32 depth)
1809 {
1810     U32 state;
1811     U16 charid;
1812     SV *sv=sv_newmortal();
1813     int colwidth= widecharmap ? 6 : 4;
1814     GET_RE_DEBUG_FLAGS_DECL;
1815
1816     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1817
1818     /*
1819        print out the table precompression so that we can do a visual check
1820        that they are identical.
1821      */
1822
1823     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1824
1825     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1826         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1827         if ( tmp ) {
1828             PerlIO_printf( Perl_debug_log, "%*s",
1829                 colwidth,
1830                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1831                             PL_colors[0], PL_colors[1],
1832                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1833                             PERL_PV_ESCAPE_FIRSTCHAR
1834                 )
1835             );
1836         }
1837     }
1838
1839     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1840
1841     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1842         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1843     }
1844
1845     PerlIO_printf( Perl_debug_log, "\n" );
1846
1847     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1848
1849         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1850             (int)depth * 2 + 2,"",
1851             (UV)TRIE_NODENUM( state ) );
1852
1853         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1854             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1855             if (v)
1856                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1857             else
1858                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1859         }
1860         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1861             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1862                                             (UV)trie->trans[ state ].check );
1863         } else {
1864             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1865                                             (UV)trie->trans[ state ].check,
1866             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1867         }
1868     }
1869 }
1870
1871 #endif
1872
1873
1874 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1875   startbranch: the first branch in the whole branch sequence
1876   first      : start branch of sequence of branch-exact nodes.
1877                May be the same as startbranch
1878   last       : Thing following the last branch.
1879                May be the same as tail.
1880   tail       : item following the branch sequence
1881   count      : words in the sequence
1882   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
1883   depth      : indent depth
1884
1885 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1886
1887 A trie is an N'ary tree where the branches are determined by digital
1888 decomposition of the key. IE, at the root node you look up the 1st character and
1889 follow that branch repeat until you find the end of the branches. Nodes can be
1890 marked as "accepting" meaning they represent a complete word. Eg:
1891
1892   /he|she|his|hers/
1893
1894 would convert into the following structure. Numbers represent states, letters
1895 following numbers represent valid transitions on the letter from that state, if
1896 the number is in square brackets it represents an accepting state, otherwise it
1897 will be in parenthesis.
1898
1899       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1900       |    |
1901       |   (2)
1902       |    |
1903      (1)   +-i->(6)-+-s->[7]
1904       |
1905       +-s->(3)-+-h->(4)-+-e->[5]
1906
1907       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1908
1909 This shows that when matching against the string 'hers' we will begin at state 1
1910 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1911 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1912 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1913 single traverse. We store a mapping from accepting to state to which word was
1914 matched, and then when we have multiple possibilities we try to complete the
1915 rest of the regex in the order in which they occured in the alternation.
1916
1917 The only prior NFA like behaviour that would be changed by the TRIE support is
1918 the silent ignoring of duplicate alternations which are of the form:
1919
1920  / (DUPE|DUPE) X? (?{ ... }) Y /x
1921
1922 Thus EVAL blocks following a trie may be called a different number of times with
1923 and without the optimisation. With the optimisations dupes will be silently
1924 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1925 the following demonstrates:
1926
1927  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1928
1929 which prints out 'word' three times, but
1930
1931  'words'=~/(word|word|word)(?{ print $1 })S/
1932
1933 which doesnt print it out at all. This is due to other optimisations kicking in.
1934
1935 Example of what happens on a structural level:
1936
1937 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1938
1939    1: CURLYM[1] {1,32767}(18)
1940    5:   BRANCH(8)
1941    6:     EXACT <ac>(16)
1942    8:   BRANCH(11)
1943    9:     EXACT <ad>(16)
1944   11:   BRANCH(14)
1945   12:     EXACT <ab>(16)
1946   16:   SUCCEED(0)
1947   17:   NOTHING(18)
1948   18: END(0)
1949
1950 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1951 and should turn into:
1952
1953    1: CURLYM[1] {1,32767}(18)
1954    5:   TRIE(16)
1955         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1956           <ac>
1957           <ad>
1958           <ab>
1959   16:   SUCCEED(0)
1960   17:   NOTHING(18)
1961   18: END(0)
1962
1963 Cases where tail != last would be like /(?foo|bar)baz/:
1964
1965    1: BRANCH(4)
1966    2:   EXACT <foo>(8)
1967    4: BRANCH(7)
1968    5:   EXACT <bar>(8)
1969    7: TAIL(8)
1970    8: EXACT <baz>(10)
1971   10: END(0)
1972
1973 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1974 and would end up looking like:
1975
1976     1: TRIE(8)
1977       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1978         <foo>
1979         <bar>
1980    7: TAIL(8)
1981    8: EXACT <baz>(10)
1982   10: END(0)
1983
1984     d = uvchr_to_utf8_flags(d, uv, 0);
1985
1986 is the recommended Unicode-aware way of saying
1987
1988     *(d++) = uv;
1989 */
1990
1991 #define TRIE_STORE_REVCHAR(val)                                            \
1992     STMT_START {                                                           \
1993         if (UTF) {                                                         \
1994             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1995             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1996             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1997             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1998             SvPOK_on(zlopp);                                               \
1999             SvUTF8_on(zlopp);                                              \
2000             av_push(revcharmap, zlopp);                                    \
2001         } else {                                                           \
2002             char ooooff = (char)val;                                           \
2003             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2004         }                                                                  \
2005         } STMT_END
2006
2007 /* This gets the next character from the input, folding it if not already
2008  * folded. */
2009 #define TRIE_READ_CHAR STMT_START {                                           \
2010     wordlen++;                                                                \
2011     if ( UTF ) {                                                              \
2012         /* if it is UTF then it is either already folded, or does not need    \
2013          * folding */                                                         \
2014         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2015     }                                                                         \
2016     else if (folder == PL_fold_latin1) {                                      \
2017         /* This folder implies Unicode rules, which in the range expressible  \
2018          *  by not UTF is the lower case, with the two exceptions, one of     \
2019          *  which should have been taken care of before calling this */       \
2020         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2021         uvc = toLOWER_L1(*uc);                                                \
2022         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2023         len = 1;                                                              \
2024     } else {                                                                  \
2025         /* raw data, will be folded later if needed */                        \
2026         uvc = (U32)*uc;                                                       \
2027         len = 1;                                                              \
2028     }                                                                         \
2029 } STMT_END
2030
2031
2032
2033 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2034     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2035         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
2036         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2037     }                                                           \
2038     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2039     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2040     TRIE_LIST_CUR( state )++;                                   \
2041 } STMT_END
2042
2043 #define TRIE_LIST_NEW(state) STMT_START {                       \
2044     Newxz( trie->states[ state ].trans.list,               \
2045         4, reg_trie_trans_le );                                 \
2046      TRIE_LIST_CUR( state ) = 1;                                \
2047      TRIE_LIST_LEN( state ) = 4;                                \
2048 } STMT_END
2049
2050 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2051     U16 dupe= trie->states[ state ].wordnum;                    \
2052     regnode * const noper_next = regnext( noper );              \
2053                                                                 \
2054     DEBUG_r({                                                   \
2055         /* store the word for dumping */                        \
2056         SV* tmp;                                                \
2057         if (OP(noper) != NOTHING)                               \
2058             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2059         else                                                    \
2060             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2061         av_push( trie_words, tmp );                             \
2062     });                                                         \
2063                                                                 \
2064     curword++;                                                  \
2065     trie->wordinfo[curword].prev   = 0;                         \
2066     trie->wordinfo[curword].len    = wordlen;                   \
2067     trie->wordinfo[curword].accept = state;                     \
2068                                                                 \
2069     if ( noper_next < tail ) {                                  \
2070         if (!trie->jump)                                        \
2071             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2072                                                  sizeof(U16) ); \
2073         trie->jump[curword] = (U16)(noper_next - convert);      \
2074         if (!jumper)                                            \
2075             jumper = noper_next;                                \
2076         if (!nextbranch)                                        \
2077             nextbranch= regnext(cur);                           \
2078     }                                                           \
2079                                                                 \
2080     if ( dupe ) {                                               \
2081         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2082         /* chain, so that when the bits of chain are later    */\
2083         /* linked together, the dups appear in the chain      */\
2084         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2085         trie->wordinfo[dupe].prev = curword;                    \
2086     } else {                                                    \
2087         /* we haven't inserted this word yet.                */ \
2088         trie->states[ state ].wordnum = curword;                \
2089     }                                                           \
2090 } STMT_END
2091
2092
2093 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2094      ( ( base + charid >=  ucharcount                                   \
2095          && base + charid < ubound                                      \
2096          && state == trie->trans[ base - ucharcount + charid ].check    \
2097          && trie->trans[ base - ucharcount + charid ].next )            \
2098            ? trie->trans[ base - ucharcount + charid ].next             \
2099            : ( state==1 ? special : 0 )                                 \
2100       )
2101
2102 #define MADE_TRIE       1
2103 #define MADE_JUMP_TRIE  2
2104 #define MADE_EXACT_TRIE 4
2105
2106 STATIC I32
2107 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2108                   regnode *first, regnode *last, regnode *tail,
2109                   U32 word_count, U32 flags, U32 depth)
2110 {
2111     /* first pass, loop through and scan words */
2112     reg_trie_data *trie;
2113     HV *widecharmap = NULL;
2114     AV *revcharmap = newAV();
2115     regnode *cur;
2116     STRLEN len = 0;
2117     UV uvc = 0;
2118     U16 curword = 0;
2119     U32 next_alloc = 0;
2120     regnode *jumper = NULL;
2121     regnode *nextbranch = NULL;
2122     regnode *convert = NULL;
2123     U32 *prev_states; /* temp array mapping each state to previous one */
2124     /* we just use folder as a flag in utf8 */
2125     const U8 * folder = NULL;
2126
2127 #ifdef DEBUGGING
2128     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2129     AV *trie_words = NULL;
2130     /* along with revcharmap, this only used during construction but both are
2131      * useful during debugging so we store them in the struct when debugging.
2132      */
2133 #else
2134     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2135     STRLEN trie_charcount=0;
2136 #endif
2137     SV *re_trie_maxbuff;
2138     GET_RE_DEBUG_FLAGS_DECL;
2139
2140     PERL_ARGS_ASSERT_MAKE_TRIE;
2141 #ifndef DEBUGGING
2142     PERL_UNUSED_ARG(depth);
2143 #endif
2144
2145     switch (flags) {
2146         case EXACT: case EXACTL: break;
2147         case EXACTFA:
2148         case EXACTFU_SS:
2149         case EXACTFU:
2150         case EXACTFLU8: folder = PL_fold_latin1; break;
2151         case EXACTF:  folder = PL_fold; break;
2152         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2153     }
2154
2155     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2156     trie->refcount = 1;
2157     trie->startstate = 1;
2158     trie->wordcount = word_count;
2159     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2160     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2161     if (flags == EXACT || flags == EXACTL)
2162         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2163     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2164                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2165
2166     DEBUG_r({
2167         trie_words = newAV();
2168     });
2169
2170     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2171     assert(re_trie_maxbuff);
2172     if (!SvIOK(re_trie_maxbuff)) {
2173         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2174     }
2175     DEBUG_TRIE_COMPILE_r({
2176         PerlIO_printf( Perl_debug_log,
2177           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2178           (int)depth * 2 + 2, "",
2179           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2180           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2181     });
2182
2183    /* Find the node we are going to overwrite */
2184     if ( first == startbranch && OP( last ) != BRANCH ) {
2185         /* whole branch chain */
2186         convert = first;
2187     } else {
2188         /* branch sub-chain */
2189         convert = NEXTOPER( first );
2190     }
2191
2192     /*  -- First loop and Setup --
2193
2194        We first traverse the branches and scan each word to determine if it
2195        contains widechars, and how many unique chars there are, this is
2196        important as we have to build a table with at least as many columns as we
2197        have unique chars.
2198
2199        We use an array of integers to represent the character codes 0..255
2200        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2201        the native representation of the character value as the key and IV's for
2202        the coded index.
2203
2204        *TODO* If we keep track of how many times each character is used we can
2205        remap the columns so that the table compression later on is more
2206        efficient in terms of memory by ensuring the most common value is in the
2207        middle and the least common are on the outside.  IMO this would be better
2208        than a most to least common mapping as theres a decent chance the most
2209        common letter will share a node with the least common, meaning the node
2210        will not be compressible. With a middle is most common approach the worst
2211        case is when we have the least common nodes twice.
2212
2213      */
2214
2215     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2216         regnode *noper = NEXTOPER( cur );
2217         const U8 *uc = (U8*)STRING( noper );
2218         const U8 *e  = uc + STR_LEN( noper );
2219         int foldlen = 0;
2220         U32 wordlen      = 0;         /* required init */
2221         STRLEN minchars = 0;
2222         STRLEN maxchars = 0;
2223         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2224                                                bitmap?*/
2225
2226         if (OP(noper) == NOTHING) {
2227             regnode *noper_next= regnext(noper);
2228             if (noper_next != tail && OP(noper_next) == flags) {
2229                 noper = noper_next;
2230                 uc= (U8*)STRING(noper);
2231                 e= uc + STR_LEN(noper);
2232                 trie->minlen= STR_LEN(noper);
2233             } else {
2234                 trie->minlen= 0;
2235                 continue;
2236             }
2237         }
2238
2239         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2240             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2241                                           regardless of encoding */
2242             if (OP( noper ) == EXACTFU_SS) {
2243                 /* false positives are ok, so just set this */
2244                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2245             }
2246         }
2247         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2248                                            branch */
2249             TRIE_CHARCOUNT(trie)++;
2250             TRIE_READ_CHAR;
2251
2252             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2253              * is in effect.  Under /i, this character can match itself, or
2254              * anything that folds to it.  If not under /i, it can match just
2255              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2256              * all fold to k, and all are single characters.   But some folds
2257              * expand to more than one character, so for example LATIN SMALL
2258              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2259              * the string beginning at 'uc' is 'ffi', it could be matched by
2260              * three characters, or just by the one ligature character. (It
2261              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2262              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2263              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2264              * match.)  The trie needs to know the minimum and maximum number
2265              * of characters that could match so that it can use size alone to
2266              * quickly reject many match attempts.  The max is simple: it is
2267              * the number of folded characters in this branch (since a fold is
2268              * never shorter than what folds to it. */
2269
2270             maxchars++;
2271
2272             /* And the min is equal to the max if not under /i (indicated by
2273              * 'folder' being NULL), or there are no multi-character folds.  If
2274              * there is a multi-character fold, the min is incremented just
2275              * once, for the character that folds to the sequence.  Each
2276              * character in the sequence needs to be added to the list below of
2277              * characters in the trie, but we count only the first towards the
2278              * min number of characters needed.  This is done through the
2279              * variable 'foldlen', which is returned by the macros that look
2280              * for these sequences as the number of bytes the sequence
2281              * occupies.  Each time through the loop, we decrement 'foldlen' by
2282              * how many bytes the current char occupies.  Only when it reaches
2283              * 0 do we increment 'minchars' or look for another multi-character
2284              * sequence. */
2285             if (folder == NULL) {
2286                 minchars++;
2287             }
2288             else if (foldlen > 0) {
2289                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2290             }
2291             else {
2292                 minchars++;
2293
2294                 /* See if *uc is the beginning of a multi-character fold.  If
2295                  * so, we decrement the length remaining to look at, to account
2296                  * for the current character this iteration.  (We can use 'uc'
2297                  * instead of the fold returned by TRIE_READ_CHAR because for
2298                  * non-UTF, the latin1_safe macro is smart enough to account
2299                  * for all the unfolded characters, and because for UTF, the
2300                  * string will already have been folded earlier in the
2301                  * compilation process */
2302                 if (UTF) {
2303                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2304                         foldlen -= UTF8SKIP(uc);
2305                     }
2306                 }
2307                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2308                     foldlen--;
2309                 }
2310             }
2311
2312             /* The current character (and any potential folds) should be added
2313              * to the possible matching characters for this position in this
2314              * branch */
2315             if ( uvc < 256 ) {
2316                 if ( folder ) {
2317                     U8 folded= folder[ (U8) uvc ];
2318                     if ( !trie->charmap[ folded ] ) {
2319                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2320                         TRIE_STORE_REVCHAR( folded );
2321                     }
2322                 }
2323                 if ( !trie->charmap[ uvc ] ) {
2324                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2325                     TRIE_STORE_REVCHAR( uvc );
2326                 }
2327                 if ( set_bit ) {
2328                     /* store the codepoint in the bitmap, and its folded
2329                      * equivalent. */
2330                     TRIE_BITMAP_SET(trie, uvc);
2331
2332                     /* store the folded codepoint */
2333                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2334
2335                     if ( !UTF ) {
2336                         /* store first byte of utf8 representation of
2337                            variant codepoints */
2338                         if (! UVCHR_IS_INVARIANT(uvc)) {
2339                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2340                         }
2341                     }
2342                     set_bit = 0; /* We've done our bit :-) */
2343                 }
2344             } else {
2345
2346                 /* XXX We could come up with the list of code points that fold
2347                  * to this using PL_utf8_foldclosures, except not for
2348                  * multi-char folds, as there may be multiple combinations
2349                  * there that could work, which needs to wait until runtime to
2350                  * resolve (The comment about LIGATURE FFI above is such an
2351                  * example */
2352
2353                 SV** svpp;
2354                 if ( !widecharmap )
2355                     widecharmap = newHV();
2356
2357                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2358
2359                 if ( !svpp )
2360                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2361
2362                 if ( !SvTRUE( *svpp ) ) {
2363                     sv_setiv( *svpp, ++trie->uniquecharcount );
2364                     TRIE_STORE_REVCHAR(uvc);
2365                 }
2366             }
2367         } /* end loop through characters in this branch of the trie */
2368
2369         /* We take the min and max for this branch and combine to find the min
2370          * and max for all branches processed so far */
2371         if( cur == first ) {
2372             trie->minlen = minchars;
2373             trie->maxlen = maxchars;
2374         } else if (minchars < trie->minlen) {
2375             trie->minlen = minchars;
2376         } else if (maxchars > trie->maxlen) {
2377             trie->maxlen = maxchars;
2378         }
2379     } /* end first pass */
2380     DEBUG_TRIE_COMPILE_r(
2381         PerlIO_printf( Perl_debug_log,
2382                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2383                 (int)depth * 2 + 2,"",
2384                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2385                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2386                 (int)trie->minlen, (int)trie->maxlen )
2387     );
2388
2389     /*
2390         We now know what we are dealing with in terms of unique chars and
2391         string sizes so we can calculate how much memory a naive
2392         representation using a flat table  will take. If it's over a reasonable
2393         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2394         conservative but potentially much slower representation using an array
2395         of lists.
2396
2397         At the end we convert both representations into the same compressed
2398         form that will be used in regexec.c for matching with. The latter
2399         is a form that cannot be used to construct with but has memory
2400         properties similar to the list form and access properties similar
2401         to the table form making it both suitable for fast searches and
2402         small enough that its feasable to store for the duration of a program.
2403
2404         See the comment in the code where the compressed table is produced
2405         inplace from the flat tabe representation for an explanation of how
2406         the compression works.
2407
2408     */
2409
2410
2411     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2412     prev_states[1] = 0;
2413
2414     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2415                                                     > SvIV(re_trie_maxbuff) )
2416     {
2417         /*
2418             Second Pass -- Array Of Lists Representation
2419
2420             Each state will be represented by a list of charid:state records
2421             (reg_trie_trans_le) the first such element holds the CUR and LEN
2422             points of the allocated array. (See defines above).
2423
2424             We build the initial structure using the lists, and then convert
2425             it into the compressed table form which allows faster lookups
2426             (but cant be modified once converted).
2427         */
2428
2429         STRLEN transcount = 1;
2430
2431         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2432             "%*sCompiling trie using list compiler\n",
2433             (int)depth * 2 + 2, ""));
2434
2435         trie->states = (reg_trie_state *)
2436             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2437                                   sizeof(reg_trie_state) );
2438         TRIE_LIST_NEW(1);
2439         next_alloc = 2;
2440
2441         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2442
2443             regnode *noper   = NEXTOPER( cur );
2444             U8 *uc           = (U8*)STRING( noper );
2445             const U8 *e      = uc + STR_LEN( noper );
2446             U32 state        = 1;         /* required init */
2447             U16 charid       = 0;         /* sanity init */
2448             U32 wordlen      = 0;         /* required init */
2449
2450             if (OP(noper) == NOTHING) {
2451                 regnode *noper_next= regnext(noper);
2452                 if (noper_next != tail && OP(noper_next) == flags) {
2453                     noper = noper_next;
2454                     uc= (U8*)STRING(noper);
2455                     e= uc + STR_LEN(noper);
2456                 }
2457             }
2458
2459             if (OP(noper) != NOTHING) {
2460                 for ( ; uc < e ; uc += len ) {
2461
2462                     TRIE_READ_CHAR;
2463
2464                     if ( uvc < 256 ) {
2465                         charid = trie->charmap[ uvc ];
2466                     } else {
2467                         SV** const svpp = hv_fetch( widecharmap,
2468                                                     (char*)&uvc,
2469                                                     sizeof( UV ),
2470                                                     0);
2471                         if ( !svpp ) {
2472                             charid = 0;
2473                         } else {
2474                             charid=(U16)SvIV( *svpp );
2475                         }
2476                     }
2477                     /* charid is now 0 if we dont know the char read, or
2478                      * nonzero if we do */
2479                     if ( charid ) {
2480
2481                         U16 check;
2482                         U32 newstate = 0;
2483
2484                         charid--;
2485                         if ( !trie->states[ state ].trans.list ) {
2486                             TRIE_LIST_NEW( state );
2487                         }
2488                         for ( check = 1;
2489                               check <= TRIE_LIST_USED( state );
2490                               check++ )
2491                         {
2492                             if ( TRIE_LIST_ITEM( state, check ).forid
2493                                                                     == charid )
2494                             {
2495                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2496                                 break;
2497                             }
2498                         }
2499                         if ( ! newstate ) {
2500                             newstate = next_alloc++;
2501                             prev_states[newstate] = state;
2502                             TRIE_LIST_PUSH( state, charid, newstate );
2503                             transcount++;
2504                         }
2505                         state = newstate;
2506                     } else {
2507                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2508                     }
2509                 }
2510             }
2511             TRIE_HANDLE_WORD(state);
2512
2513         } /* end second pass */
2514
2515         /* next alloc is the NEXT state to be allocated */
2516         trie->statecount = next_alloc;
2517         trie->states = (reg_trie_state *)
2518             PerlMemShared_realloc( trie->states,
2519                                    next_alloc
2520                                    * sizeof(reg_trie_state) );
2521
2522         /* and now dump it out before we compress it */
2523         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2524                                                          revcharmap, next_alloc,
2525                                                          depth+1)
2526         );
2527
2528         trie->trans = (reg_trie_trans *)
2529             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2530         {
2531             U32 state;
2532             U32 tp = 0;
2533             U32 zp = 0;
2534
2535
2536             for( state=1 ; state < next_alloc ; state ++ ) {
2537                 U32 base=0;
2538
2539                 /*
2540                 DEBUG_TRIE_COMPILE_MORE_r(
2541                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2542                 );
2543                 */
2544
2545                 if (trie->states[state].trans.list) {
2546                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2547                     U16 maxid=minid;
2548                     U16 idx;
2549
2550                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2551                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2552                         if ( forid < minid ) {
2553                             minid=forid;
2554                         } else if ( forid > maxid ) {
2555                             maxid=forid;
2556                         }
2557                     }
2558                     if ( transcount < tp + maxid - minid + 1) {
2559                         transcount *= 2;
2560                         trie->trans = (reg_trie_trans *)
2561                             PerlMemShared_realloc( trie->trans,
2562                                                      transcount
2563                                                      * sizeof(reg_trie_trans) );
2564                         Zero( trie->trans + (transcount / 2),
2565                               transcount / 2,
2566                               reg_trie_trans );
2567                     }
2568                     base = trie->uniquecharcount + tp - minid;
2569                     if ( maxid == minid ) {
2570                         U32 set = 0;
2571                         for ( ; zp < tp ; zp++ ) {
2572                             if ( ! trie->trans[ zp ].next ) {
2573                                 base = trie->uniquecharcount + zp - minid;
2574                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2575                                                                    1).newstate;
2576                                 trie->trans[ zp ].check = state;
2577                                 set = 1;
2578                                 break;
2579                             }
2580                         }
2581                         if ( !set ) {
2582                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2583                                                                    1).newstate;
2584                             trie->trans[ tp ].check = state;
2585                             tp++;
2586                             zp = tp;
2587                         }
2588                     } else {
2589                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2590                             const U32 tid = base
2591                                            - trie->uniquecharcount
2592                                            + TRIE_LIST_ITEM( state, idx ).forid;
2593                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2594                                                                 idx ).newstate;
2595                             trie->trans[ tid ].check = state;
2596                         }
2597                         tp += ( maxid - minid + 1 );
2598                     }
2599                     Safefree(trie->states[ state ].trans.list);
2600                 }
2601                 /*
2602                 DEBUG_TRIE_COMPILE_MORE_r(
2603                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2604                 );
2605                 */
2606                 trie->states[ state ].trans.base=base;
2607             }
2608             trie->lasttrans = tp + 1;
2609         }
2610     } else {
2611         /*
2612            Second Pass -- Flat Table Representation.
2613
2614            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2615            each.  We know that we will need Charcount+1 trans at most to store
2616            the data (one row per char at worst case) So we preallocate both
2617            structures assuming worst case.
2618
2619            We then construct the trie using only the .next slots of the entry
2620            structs.
2621
2622            We use the .check field of the first entry of the node temporarily
2623            to make compression both faster and easier by keeping track of how
2624            many non zero fields are in the node.
2625
2626            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2627            transition.
2628
2629            There are two terms at use here: state as a TRIE_NODEIDX() which is
2630            a number representing the first entry of the node, and state as a
2631            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2632            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2633            if there are 2 entrys per node. eg:
2634
2635              A B       A B
2636           1. 2 4    1. 3 7
2637           2. 0 3    3. 0 5
2638           3. 0 0    5. 0 0
2639           4. 0 0    7. 0 0
2640
2641            The table is internally in the right hand, idx form. However as we
2642            also have to deal with the states array which is indexed by nodenum
2643            we have to use TRIE_NODENUM() to convert.
2644
2645         */
2646         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2647             "%*sCompiling trie using table compiler\n",
2648             (int)depth * 2 + 2, ""));
2649
2650         trie->trans = (reg_trie_trans *)
2651             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2652                                   * trie->uniquecharcount + 1,
2653                                   sizeof(reg_trie_trans) );
2654         trie->states = (reg_trie_state *)
2655             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2656                                   sizeof(reg_trie_state) );
2657         next_alloc = trie->uniquecharcount + 1;
2658
2659
2660         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2661
2662             regnode *noper   = NEXTOPER( cur );
2663             const U8 *uc     = (U8*)STRING( noper );
2664             const U8 *e      = uc + STR_LEN( noper );
2665
2666             U32 state        = 1;         /* required init */
2667
2668             U16 charid       = 0;         /* sanity init */
2669             U32 accept_state = 0;         /* sanity init */
2670
2671             U32 wordlen      = 0;         /* required init */
2672
2673             if (OP(noper) == NOTHING) {
2674                 regnode *noper_next= regnext(noper);
2675                 if (noper_next != tail && OP(noper_next) == flags) {
2676                     noper = noper_next;
2677                     uc= (U8*)STRING(noper);
2678                     e= uc + STR_LEN(noper);
2679                 }
2680             }
2681
2682             if ( OP(noper) != NOTHING ) {
2683                 for ( ; uc < e ; uc += len ) {
2684
2685                     TRIE_READ_CHAR;
2686
2687                     if ( uvc < 256 ) {
2688                         charid = trie->charmap[ uvc ];
2689                     } else {
2690                         SV* const * const svpp = hv_fetch( widecharmap,
2691                                                            (char*)&uvc,
2692                                                            sizeof( UV ),
2693                                                            0);
2694                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2695                     }
2696                     if ( charid ) {
2697                         charid--;
2698                         if ( !trie->trans[ state + charid ].next ) {
2699                             trie->trans[ state + charid ].next = next_alloc;
2700                             trie->trans[ state ].check++;
2701                             prev_states[TRIE_NODENUM(next_alloc)]
2702                                     = TRIE_NODENUM(state);
2703                             next_alloc += trie->uniquecharcount;
2704                         }
2705                         state = trie->trans[ state + charid ].next;
2706                     } else {
2707                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2708                     }
2709                     /* charid is now 0 if we dont know the char read, or
2710                      * nonzero if we do */
2711                 }
2712             }
2713             accept_state = TRIE_NODENUM( state );
2714             TRIE_HANDLE_WORD(accept_state);
2715
2716         } /* end second pass */
2717
2718         /* and now dump it out before we compress it */
2719         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2720                                                           revcharmap,
2721                                                           next_alloc, depth+1));
2722
2723         {
2724         /*
2725            * Inplace compress the table.*
2726
2727            For sparse data sets the table constructed by the trie algorithm will
2728            be mostly 0/FAIL transitions or to put it another way mostly empty.
2729            (Note that leaf nodes will not contain any transitions.)
2730
2731            This algorithm compresses the tables by eliminating most such
2732            transitions, at the cost of a modest bit of extra work during lookup:
2733
2734            - Each states[] entry contains a .base field which indicates the
2735            index in the state[] array wheres its transition data is stored.
2736
2737            - If .base is 0 there are no valid transitions from that node.
2738
2739            - If .base is nonzero then charid is added to it to find an entry in
2740            the trans array.
2741
2742            -If trans[states[state].base+charid].check!=state then the
2743            transition is taken to be a 0/Fail transition. Thus if there are fail
2744            transitions at the front of the node then the .base offset will point
2745            somewhere inside the previous nodes data (or maybe even into a node
2746            even earlier), but the .check field determines if the transition is
2747            valid.
2748
2749            XXX - wrong maybe?
2750            The following process inplace converts the table to the compressed
2751            table: We first do not compress the root node 1,and mark all its
2752            .check pointers as 1 and set its .base pointer as 1 as well. This
2753            allows us to do a DFA construction from the compressed table later,
2754            and ensures that any .base pointers we calculate later are greater
2755            than 0.
2756
2757            - We set 'pos' to indicate the first entry of the second node.
2758
2759            - We then iterate over the columns of the node, finding the first and
2760            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2761            and set the .check pointers accordingly, and advance pos
2762            appropriately and repreat for the next node. Note that when we copy
2763            the next pointers we have to convert them from the original
2764            NODEIDX form to NODENUM form as the former is not valid post
2765            compression.
2766
2767            - If a node has no transitions used we mark its base as 0 and do not
2768            advance the pos pointer.
2769
2770            - If a node only has one transition we use a second pointer into the
2771            structure to fill in allocated fail transitions from other states.
2772            This pointer is independent of the main pointer and scans forward
2773            looking for null transitions that are allocated to a state. When it
2774            finds one it writes the single transition into the "hole".  If the
2775            pointer doesnt find one the single transition is appended as normal.
2776
2777            - Once compressed we can Renew/realloc the structures to release the
2778            excess space.
2779
2780            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2781            specifically Fig 3.47 and the associated pseudocode.
2782
2783            demq
2784         */
2785         const U32 laststate = TRIE_NODENUM( next_alloc );
2786         U32 state, charid;
2787         U32 pos = 0, zp=0;
2788         trie->statecount = laststate;
2789
2790         for ( state = 1 ; state < laststate ; state++ ) {
2791             U8 flag = 0;
2792             const U32 stateidx = TRIE_NODEIDX( state );
2793             const U32 o_used = trie->trans[ stateidx ].check;
2794             U32 used = trie->trans[ stateidx ].check;
2795             trie->trans[ stateidx ].check = 0;
2796
2797             for ( charid = 0;
2798                   used && charid < trie->uniquecharcount;
2799                   charid++ )
2800             {
2801                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2802                     if ( trie->trans[ stateidx + charid ].next ) {
2803                         if (o_used == 1) {
2804                             for ( ; zp < pos ; zp++ ) {
2805                                 if ( ! trie->trans[ zp ].next ) {
2806                                     break;
2807                                 }
2808                             }
2809                             trie->states[ state ].trans.base
2810                                                     = zp
2811                                                       + trie->uniquecharcount
2812                                                       - charid ;
2813                             trie->trans[ zp ].next
2814                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2815                                                              + charid ].next );
2816                             trie->trans[ zp ].check = state;
2817                             if ( ++zp > pos ) pos = zp;
2818                             break;
2819                         }
2820                         used--;
2821                     }
2822                     if ( !flag ) {
2823                         flag = 1;
2824                         trie->states[ state ].trans.base
2825                                        = pos + trie->uniquecharcount - charid ;
2826                     }
2827                     trie->trans[ pos ].next
2828                         = SAFE_TRIE_NODENUM(
2829                                        trie->trans[ stateidx + charid ].next );
2830                     trie->trans[ pos ].check = state;
2831                     pos++;
2832                 }
2833             }
2834         }
2835         trie->lasttrans = pos + 1;
2836         trie->states = (reg_trie_state *)
2837             PerlMemShared_realloc( trie->states, laststate
2838                                    * sizeof(reg_trie_state) );
2839         DEBUG_TRIE_COMPILE_MORE_r(
2840             PerlIO_printf( Perl_debug_log,
2841                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2842                 (int)depth * 2 + 2,"",
2843                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2844                        + 1 ),
2845                 (IV)next_alloc,
2846                 (IV)pos,
2847                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2848             );
2849
2850         } /* end table compress */
2851     }
2852     DEBUG_TRIE_COMPILE_MORE_r(
2853             PerlIO_printf(Perl_debug_log,
2854                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2855                 (int)depth * 2 + 2, "",
2856                 (UV)trie->statecount,
2857                 (UV)trie->lasttrans)
2858     );
2859     /* resize the trans array to remove unused space */
2860     trie->trans = (reg_trie_trans *)
2861         PerlMemShared_realloc( trie->trans, trie->lasttrans
2862                                * sizeof(reg_trie_trans) );
2863
2864     {   /* Modify the program and insert the new TRIE node */
2865         U8 nodetype =(U8)(flags & 0xFF);
2866         char *str=NULL;
2867
2868 #ifdef DEBUGGING
2869         regnode *optimize = NULL;
2870 #ifdef RE_TRACK_PATTERN_OFFSETS
2871
2872         U32 mjd_offset = 0;
2873         U32 mjd_nodelen = 0;
2874 #endif /* RE_TRACK_PATTERN_OFFSETS */
2875 #endif /* DEBUGGING */
2876         /*
2877            This means we convert either the first branch or the first Exact,
2878            depending on whether the thing following (in 'last') is a branch
2879            or not and whther first is the startbranch (ie is it a sub part of
2880            the alternation or is it the whole thing.)
2881            Assuming its a sub part we convert the EXACT otherwise we convert
2882            the whole branch sequence, including the first.
2883          */
2884         /* Find the node we are going to overwrite */
2885         if ( first != startbranch || OP( last ) == BRANCH ) {
2886             /* branch sub-chain */
2887             NEXT_OFF( first ) = (U16)(last - first);
2888 #ifdef RE_TRACK_PATTERN_OFFSETS
2889             DEBUG_r({
2890                 mjd_offset= Node_Offset((convert));
2891                 mjd_nodelen= Node_Length((convert));
2892             });
2893 #endif
2894             /* whole branch chain */
2895         }
2896 #ifdef RE_TRACK_PATTERN_OFFSETS
2897         else {
2898             DEBUG_r({
2899                 const  regnode *nop = NEXTOPER( convert );
2900                 mjd_offset= Node_Offset((nop));
2901                 mjd_nodelen= Node_Length((nop));
2902             });
2903         }
2904         DEBUG_OPTIMISE_r(
2905             PerlIO_printf(Perl_debug_log,
2906                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2907                 (int)depth * 2 + 2, "",
2908                 (UV)mjd_offset, (UV)mjd_nodelen)
2909         );
2910 #endif
2911         /* But first we check to see if there is a common prefix we can
2912            split out as an EXACT and put in front of the TRIE node.  */
2913         trie->startstate= 1;
2914         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2915             U32 state;
2916             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2917                 U32 ofs = 0;
2918                 I32 idx = -1;
2919                 U32 count = 0;
2920                 const U32 base = trie->states[ state ].trans.base;
2921
2922                 if ( trie->states[state].wordnum )
2923                         count = 1;
2924
2925                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2926                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2927                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2928                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2929                     {
2930                         if ( ++count > 1 ) {
2931                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2932                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2933                             if ( state == 1 ) break;
2934                             if ( count == 2 ) {
2935                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2936                                 DEBUG_OPTIMISE_r(
2937                                     PerlIO_printf(Perl_debug_log,
2938                                         "%*sNew Start State=%"UVuf" Class: [",
2939                                         (int)depth * 2 + 2, "",
2940                                         (UV)state));
2941                                 if (idx >= 0) {
2942                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2943                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2944
2945                                     TRIE_BITMAP_SET(trie,*ch);
2946                                     if ( folder )
2947                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2948                                     DEBUG_OPTIMISE_r(
2949                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2950                                     );
2951                                 }
2952                             }
2953                             TRIE_BITMAP_SET(trie,*ch);
2954                             if ( folder )
2955                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2956                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2957                         }
2958                         idx = ofs;
2959                     }
2960                 }
2961                 if ( count == 1 ) {
2962                     SV **tmp = av_fetch( revcharmap, idx, 0);
2963                     STRLEN len;
2964                     char *ch = SvPV( *tmp, len );
2965                     DEBUG_OPTIMISE_r({
2966                         SV *sv=sv_newmortal();
2967                         PerlIO_printf( Perl_debug_log,
2968                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2969                             (int)depth * 2 + 2, "",
2970                             (UV)state, (UV)idx,
2971                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2972                                 PL_colors[0], PL_colors[1],
2973                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2974                                 PERL_PV_ESCAPE_FIRSTCHAR
2975                             )
2976                         );
2977                     });
2978                     if ( state==1 ) {
2979                         OP( convert ) = nodetype;
2980                         str=STRING(convert);
2981                         STR_LEN(convert)=0;
2982                     }
2983                     STR_LEN(convert) += len;
2984                     while (len--)
2985                         *str++ = *ch++;
2986                 } else {
2987 #ifdef DEBUGGING
2988                     if (state>1)
2989                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2990 #endif
2991                     break;
2992                 }
2993             }
2994             trie->prefixlen = (state-1);
2995             if (str) {
2996                 regnode *n = convert+NODE_SZ_STR(convert);
2997                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2998                 trie->startstate = state;
2999                 trie->minlen -= (state - 1);
3000                 trie->maxlen -= (state - 1);
3001 #ifdef DEBUGGING
3002                /* At least the UNICOS C compiler choked on this
3003                 * being argument to DEBUG_r(), so let's just have
3004                 * it right here. */
3005                if (
3006 #ifdef PERL_EXT_RE_BUILD
3007                    1
3008 #else
3009                    DEBUG_r_TEST
3010 #endif
3011                    ) {
3012                    regnode *fix = convert;
3013                    U32 word = trie->wordcount;
3014                    mjd_nodelen++;
3015                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3016                    while( ++fix < n ) {
3017                        Set_Node_Offset_Length(fix, 0, 0);
3018                    }
3019                    while (word--) {
3020                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3021                        if (tmp) {
3022                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3023                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3024                            else
3025                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3026                        }
3027                    }
3028                }
3029 #endif
3030                 if (trie->maxlen) {
3031                     convert = n;
3032                 } else {
3033                     NEXT_OFF(convert) = (U16)(tail - convert);
3034                     DEBUG_r(optimize= n);
3035                 }
3036             }
3037         }
3038         if (!jumper)
3039             jumper = last;
3040         if ( trie->maxlen ) {
3041             NEXT_OFF( convert ) = (U16)(tail - convert);
3042             ARG_SET( convert, data_slot );
3043             /* Store the offset to the first unabsorbed branch in
3044                jump[0], which is otherwise unused by the jump logic.
3045                We use this when dumping a trie and during optimisation. */
3046             if (trie->jump)
3047                 trie->jump[0] = (U16)(nextbranch - convert);
3048
3049             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3050              *   and there is a bitmap
3051              *   and the first "jump target" node we found leaves enough room
3052              * then convert the TRIE node into a TRIEC node, with the bitmap
3053              * embedded inline in the opcode - this is hypothetically faster.
3054              */
3055             if ( !trie->states[trie->startstate].wordnum
3056                  && trie->bitmap
3057                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3058             {
3059                 OP( convert ) = TRIEC;
3060                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3061                 PerlMemShared_free(trie->bitmap);
3062                 trie->bitmap= NULL;
3063             } else
3064                 OP( convert ) = TRIE;
3065
3066             /* store the type in the flags */
3067             convert->flags = nodetype;
3068             DEBUG_r({
3069             optimize = convert
3070                       + NODE_STEP_REGNODE
3071                       + regarglen[ OP( convert ) ];
3072             });
3073             /* XXX We really should free up the resource in trie now,
3074                    as we won't use them - (which resources?) dmq */
3075         }
3076         /* needed for dumping*/
3077         DEBUG_r(if (optimize) {
3078             regnode *opt = convert;
3079
3080             while ( ++opt < optimize) {
3081                 Set_Node_Offset_Length(opt,0,0);
3082             }
3083             /*
3084                 Try to clean up some of the debris left after the
3085                 optimisation.
3086              */
3087             while( optimize < jumper ) {
3088                 mjd_nodelen += Node_Length((optimize));
3089                 OP( optimize ) = OPTIMIZED;
3090                 Set_Node_Offset_Length(optimize,0,0);
3091                 optimize++;
3092             }
3093             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3094         });
3095     } /* end node insert */
3096
3097     /*  Finish populating the prev field of the wordinfo array.  Walk back
3098      *  from each accept state until we find another accept state, and if
3099      *  so, point the first word's .prev field at the second word. If the
3100      *  second already has a .prev field set, stop now. This will be the
3101      *  case either if we've already processed that word's accept state,
3102      *  or that state had multiple words, and the overspill words were
3103      *  already linked up earlier.
3104      */
3105     {
3106         U16 word;
3107         U32 state;
3108         U16 prev;
3109
3110         for (word=1; word <= trie->wordcount; word++) {
3111             prev = 0;
3112             if (trie->wordinfo[word].prev)
3113                 continue;
3114             state = trie->wordinfo[word].accept;
3115             while (state) {
3116                 state = prev_states[state];
3117                 if (!state)
3118                     break;
3119                 prev = trie->states[state].wordnum;
3120                 if (prev)
3121                     break;
3122             }
3123             trie->wordinfo[word].prev = prev;
3124         }
3125         Safefree(prev_states);
3126     }
3127
3128
3129     /* and now dump out the compressed format */
3130     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3131
3132     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3133 #ifdef DEBUGGING
3134     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3135     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3136 #else
3137     SvREFCNT_dec_NN(revcharmap);
3138 #endif
3139     return trie->jump
3140            ? MADE_JUMP_TRIE
3141            : trie->startstate>1
3142              ? MADE_EXACT_TRIE
3143              : MADE_TRIE;
3144 }
3145
3146 STATIC regnode *
3147 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3148 {
3149 /* The Trie is constructed and compressed now so we can build a fail array if
3150  * it's needed
3151
3152    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3153    3.32 in the
3154    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3155    Ullman 1985/88
3156    ISBN 0-201-10088-6
3157
3158    We find the fail state for each state in the trie, this state is the longest
3159    proper suffix of the current state's 'word' that is also a proper prefix of
3160    another word in our trie. State 1 represents the word '' and is thus the
3161    default fail state. This allows the DFA not to have to restart after its
3162    tried and failed a word at a given point, it simply continues as though it
3163    had been matching the other word in the first place.
3164    Consider
3165       'abcdgu'=~/abcdefg|cdgu/
3166    When we get to 'd' we are still matching the first word, we would encounter
3167    'g' which would fail, which would bring us to the state representing 'd' in
3168    the second word where we would try 'g' and succeed, proceeding to match
3169    'cdgu'.
3170  */
3171  /* add a fail transition */
3172     const U32 trie_offset = ARG(source);
3173     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3174     U32 *q;
3175     const U32 ucharcount = trie->uniquecharcount;
3176     const U32 numstates = trie->statecount;
3177     const U32 ubound = trie->lasttrans + ucharcount;
3178     U32 q_read = 0;
3179     U32 q_write = 0;
3180     U32 charid;
3181     U32 base = trie->states[ 1 ].trans.base;
3182     U32 *fail;
3183     reg_ac_data *aho;
3184     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3185     regnode *stclass;
3186     GET_RE_DEBUG_FLAGS_DECL;
3187
3188     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3189     PERL_UNUSED_CONTEXT;
3190 #ifndef DEBUGGING
3191     PERL_UNUSED_ARG(depth);
3192 #endif
3193
3194     if ( OP(source) == TRIE ) {
3195         struct regnode_1 *op = (struct regnode_1 *)
3196             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3197         StructCopy(source,op,struct regnode_1);
3198         stclass = (regnode *)op;
3199     } else {
3200         struct regnode_charclass *op = (struct regnode_charclass *)
3201             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3202         StructCopy(source,op,struct regnode_charclass);
3203         stclass = (regnode *)op;
3204     }
3205     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3206
3207     ARG_SET( stclass, data_slot );
3208     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3209     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3210     aho->trie=trie_offset;
3211     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3212     Copy( trie->states, aho->states, numstates, reg_trie_state );
3213     Newxz( q, numstates, U32);
3214     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3215     aho->refcount = 1;
3216     fail = aho->fail;
3217     /* initialize fail[0..1] to be 1 so that we always have
3218        a valid final fail state */
3219     fail[ 0 ] = fail[ 1 ] = 1;
3220
3221     for ( charid = 0; charid < ucharcount ; charid++ ) {
3222         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3223         if ( newstate ) {
3224             q[ q_write ] = newstate;
3225             /* set to point at the root */
3226             fail[ q[ q_write++ ] ]=1;
3227         }
3228     }
3229     while ( q_read < q_write) {
3230         const U32 cur = q[ q_read++ % numstates ];
3231         base = trie->states[ cur ].trans.base;
3232
3233         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3234             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3235             if (ch_state) {
3236                 U32 fail_state = cur;
3237                 U32 fail_base;
3238                 do {
3239                     fail_state = fail[ fail_state ];
3240                     fail_base = aho->states[ fail_state ].trans.base;
3241                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3242
3243                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3244                 fail[ ch_state ] = fail_state;
3245                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3246                 {
3247                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3248                 }
3249                 q[ q_write++ % numstates] = ch_state;
3250             }
3251         }
3252     }
3253     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3254        when we fail in state 1, this allows us to use the
3255        charclass scan to find a valid start char. This is based on the principle
3256        that theres a good chance the string being searched contains lots of stuff
3257        that cant be a start char.
3258      */
3259     fail[ 0 ] = fail[ 1 ] = 0;
3260     DEBUG_TRIE_COMPILE_r({
3261         PerlIO_printf(Perl_debug_log,
3262                       "%*sStclass Failtable (%"UVuf" states): 0",
3263                       (int)(depth * 2), "", (UV)numstates
3264         );
3265         for( q_read=1; q_read<numstates; q_read++ ) {
3266             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3267         }
3268         PerlIO_printf(Perl_debug_log, "\n");
3269     });
3270     Safefree(q);
3271     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3272     return stclass;
3273 }
3274
3275
3276 #define DEBUG_PEEP(str,scan,depth) \
3277     DEBUG_OPTIMISE_r({if (scan){ \
3278        regnode *Next = regnext(scan); \
3279        regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3280        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3281            (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3282            Next ? (REG_NODE_NUM(Next)) : 0 ); \
3283        DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3284        PerlIO_printf(Perl_debug_log, "\n"); \
3285    }});
3286
3287 /* The below joins as many adjacent EXACTish nodes as possible into a single
3288  * one.  The regop may be changed if the node(s) contain certain sequences that
3289  * require special handling.  The joining is only done if:
3290  * 1) there is room in the current conglomerated node to entirely contain the
3291  *    next one.
3292  * 2) they are the exact same node type
3293  *
3294  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3295  * these get optimized out
3296  *
3297  * If a node is to match under /i (folded), the number of characters it matches
3298  * can be different than its character length if it contains a multi-character
3299  * fold.  *min_subtract is set to the total delta number of characters of the
3300  * input nodes.
3301  *
3302  * And *unfolded_multi_char is set to indicate whether or not the node contains
3303  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3304  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3305  * SMALL LETTER SHARP S, as only if the target string being matched against
3306  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3307  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3308  * whose components are all above the Latin1 range are not run-time locale
3309  * dependent, and have already been folded by the time this function is
3310  * called.)
3311  *
3312  * This is as good a place as any to discuss the design of handling these
3313  * multi-character fold sequences.  It's been wrong in Perl for a very long
3314  * time.  There are three code points in Unicode whose multi-character folds
3315  * were long ago discovered to mess things up.  The previous designs for
3316  * dealing with these involved assigning a special node for them.  This
3317  * approach doesn't always work, as evidenced by this example:
3318  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3319  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3320  * would match just the \xDF, it won't be able to handle the case where a
3321  * successful match would have to cross the node's boundary.  The new approach
3322  * that hopefully generally solves the problem generates an EXACTFU_SS node
3323  * that is "sss" in this case.
3324  *
3325  * It turns out that there are problems with all multi-character folds, and not
3326  * just these three.  Now the code is general, for all such cases.  The
3327  * approach taken is:
3328  * 1)   This routine examines each EXACTFish node that could contain multi-
3329  *      character folded sequences.  Since a single character can fold into
3330  *      such a sequence, the minimum match length for this node is less than
3331  *      the number of characters in the node.  This routine returns in
3332  *      *min_subtract how many characters to subtract from the the actual
3333  *      length of the string to get a real minimum match length; it is 0 if
3334  *      there are no multi-char foldeds.  This delta is used by the caller to
3335  *      adjust the min length of the match, and the delta between min and max,
3336  *      so that the optimizer doesn't reject these possibilities based on size
3337  *      constraints.
3338  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3339  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3340  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3341  *      there is a possible fold length change.  That means that a regular
3342  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3343  *      with length changes, and so can be processed faster.  regexec.c takes
3344  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3345  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3346  *      known until runtime).  This saves effort in regex matching.  However,
3347  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3348  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3349  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3350  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3351  *      possibilities for the non-UTF8 patterns are quite simple, except for
3352  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3353  *      members of a fold-pair, and arrays are set up for all of them so that
3354  *      the other member of the pair can be found quickly.  Code elsewhere in
3355  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3356  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3357  *      described in the next item.
3358  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3359  *      validity of the fold won't be known until runtime, and so must remain
3360  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3361  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3362  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3363  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3364  *      The reason this is a problem is that the optimizer part of regexec.c
3365  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3366  *      that a character in the pattern corresponds to at most a single
3367  *      character in the target string.  (And I do mean character, and not byte
3368  *      here, unlike other parts of the documentation that have never been
3369  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3370  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3371  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3372  *      nodes, violate the assumption, and they are the only instances where it
3373  *      is violated.  I'm reluctant to try to change the assumption, as the
3374  *      code involved is impenetrable to me (khw), so instead the code here
3375  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3376  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3377  *      boolean indicating whether or not the node contains such a fold.  When
3378  *      it is true, the caller sets a flag that later causes the optimizer in
3379  *      this file to not set values for the floating and fixed string lengths,
3380  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3381  *      assumption.  Thus, there is no optimization based on string lengths for
3382  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3383  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3384  *      assumption is wrong only in these cases is that all other non-UTF-8
3385  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3386  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3387  *      EXACTF nodes because we don't know at compile time if it actually
3388  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3389  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3390  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3391  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3392  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3393  *      string would require the pattern to be forced into UTF-8, the overhead
3394  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3395  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3396  *      locale.)
3397  *
3398  *      Similarly, the code that generates tries doesn't currently handle
3399  *      not-already-folded multi-char folds, and it looks like a pain to change
3400  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3401  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3402  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3403  *      using /iaa matching will be doing so almost entirely with ASCII
3404  *      strings, so this should rarely be encountered in practice */
3405
3406 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3407     if (PL_regkind[OP(scan)] == EXACT) \
3408         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3409
3410 STATIC U32
3411 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3412                    UV *min_subtract, bool *unfolded_multi_char,
3413                    U32 flags,regnode *val, U32 depth)
3414 {
3415     /* Merge several consecutive EXACTish nodes into one. */
3416     regnode *n = regnext(scan);
3417     U32 stringok = 1;
3418     regnode *next = scan + NODE_SZ_STR(scan);
3419     U32 merged = 0;
3420     U32 stopnow = 0;
3421 #ifdef DEBUGGING
3422     regnode *stop = scan;
3423     GET_RE_DEBUG_FLAGS_DECL;
3424 #else
3425     PERL_UNUSED_ARG(depth);
3426 #endif
3427
3428     PERL_ARGS_ASSERT_JOIN_EXACT;
3429 #ifndef EXPERIMENTAL_INPLACESCAN
3430     PERL_UNUSED_ARG(flags);
3431     PERL_UNUSED_ARG(val);
3432 #endif
3433     DEBUG_PEEP("join",scan,depth);
3434
3435     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3436      * EXACT ones that are mergeable to the current one. */
3437     while (n
3438            && (PL_regkind[OP(n)] == NOTHING
3439                || (stringok && OP(n) == OP(scan)))
3440            && NEXT_OFF(n)
3441            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3442     {
3443
3444         if (OP(n) == TAIL || n > next)
3445             stringok = 0;
3446         if (PL_regkind[OP(n)] == NOTHING) {
3447             DEBUG_PEEP("skip:",n,depth);
3448             NEXT_OFF(scan) += NEXT_OFF(n);
3449             next = n + NODE_STEP_REGNODE;
3450 #ifdef DEBUGGING
3451             if (stringok)
3452                 stop = n;
3453 #endif
3454             n = regnext(n);
3455         }
3456         else if (stringok) {
3457             const unsigned int oldl = STR_LEN(scan);
3458             regnode * const nnext = regnext(n);
3459
3460             /* XXX I (khw) kind of doubt that this works on platforms (should
3461              * Perl ever run on one) where U8_MAX is above 255 because of lots
3462              * of other assumptions */
3463             /* Don't join if the sum can't fit into a single node */
3464             if (oldl + STR_LEN(n) > U8_MAX)
3465                 break;
3466
3467             DEBUG_PEEP("merg",n,depth);
3468             merged++;
3469
3470             NEXT_OFF(scan) += NEXT_OFF(n);
3471             STR_LEN(scan) += STR_LEN(n);
3472             next = n + NODE_SZ_STR(n);
3473             /* Now we can overwrite *n : */
3474             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3475 #ifdef DEBUGGING
3476             stop = next - 1;
3477 #endif
3478             n = nnext;
3479             if (stopnow) break;
3480         }
3481
3482 #ifdef EXPERIMENTAL_INPLACESCAN
3483         if (flags && !NEXT_OFF(n)) {
3484             DEBUG_PEEP("atch", val, depth);
3485             if (reg_off_by_arg[OP(n)]) {
3486                 ARG_SET(n, val - n);
3487             }
3488             else {
3489                 NEXT_OFF(n) = val - n;
3490             }
3491             stopnow = 1;
3492         }
3493 #endif
3494     }
3495
3496     *min_subtract = 0;
3497     *unfolded_multi_char = FALSE;
3498
3499     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3500      * can now analyze for sequences of problematic code points.  (Prior to
3501      * this final joining, sequences could have been split over boundaries, and
3502      * hence missed).  The sequences only happen in folding, hence for any
3503      * non-EXACT EXACTish node */
3504     if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3505         U8* s0 = (U8*) STRING(scan);
3506         U8* s = s0;
3507         U8* s_end = s0 + STR_LEN(scan);
3508
3509         int total_count_delta = 0;  /* Total delta number of characters that
3510                                        multi-char folds expand to */
3511
3512         /* One pass is made over the node's string looking for all the
3513          * possibilities.  To avoid some tests in the loop, there are two main
3514          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3515          * non-UTF-8 */
3516         if (UTF) {
3517             U8* folded = NULL;
3518
3519             if (OP(scan) == EXACTFL) {
3520                 U8 *d;
3521
3522                 /* An EXACTFL node would already have been changed to another
3523                  * node type unless there is at least one character in it that
3524                  * is problematic; likely a character whose fold definition
3525                  * won't be known until runtime, and so has yet to be folded.
3526                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3527                  * to handle the UTF-8 case, we need to create a temporary
3528                  * folded copy using UTF-8 locale rules in order to analyze it.
3529                  * This is because our macros that look to see if a sequence is
3530                  * a multi-char fold assume everything is folded (otherwise the
3531                  * tests in those macros would be too complicated and slow).
3532                  * Note that here, the non-problematic folds will have already
3533                  * been done, so we can just copy such characters.  We actually
3534                  * don't completely fold the EXACTFL string.  We skip the
3535                  * unfolded multi-char folds, as that would just create work
3536                  * below to figure out the size they already are */
3537
3538                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3539                 d = folded;
3540                 while (s < s_end) {
3541                     STRLEN s_len = UTF8SKIP(s);
3542                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3543                         Copy(s, d, s_len, U8);
3544                         d += s_len;
3545                     }
3546                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3547                         *unfolded_multi_char = TRUE;
3548                         Copy(s, d, s_len, U8);
3549                         d += s_len;
3550                     }
3551                     else if (isASCII(*s)) {
3552                         *(d++) = toFOLD(*s);
3553                     }
3554                     else {
3555                         STRLEN len;
3556                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3557                         d += len;
3558                     }
3559                     s += s_len;
3560                 }
3561
3562                 /* Point the remainder of the routine to look at our temporary
3563                  * folded copy */
3564                 s = folded;
3565                 s_end = d;
3566             } /* End of creating folded copy of EXACTFL string */
3567
3568             /* Examine the string for a multi-character fold sequence.  UTF-8
3569              * patterns have all characters pre-folded by the time this code is
3570              * executed */
3571             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3572                                      length sequence we are looking for is 2 */
3573             {
3574                 int count = 0;  /* How many characters in a multi-char fold */
3575                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3576                 if (! len) {    /* Not a multi-char fold: get next char */
3577                     s += UTF8SKIP(s);
3578                     continue;
3579                 }
3580
3581                 /* Nodes with 'ss' require special handling, except for
3582                  * EXACTFA-ish for which there is no multi-char fold to this */
3583                 if (len == 2 && *s == 's' && *(s+1) == 's'
3584                     && OP(scan) != EXACTFA
3585                     && OP(scan) != EXACTFA_NO_TRIE)
3586                 {
3587                     count = 2;
3588                     if (OP(scan) != EXACTFL) {
3589                         OP(scan) = EXACTFU_SS;
3590                     }
3591                     s += 2;
3592                 }
3593                 else { /* Here is a generic multi-char fold. */
3594                     U8* multi_end  = s + len;
3595
3596                     /* Count how many characters are in it.  In the case of
3597                      * /aa, no folds which contain ASCII code points are
3598                      * allowed, so check for those, and skip if found. */
3599                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3600                         count = utf8_length(s, multi_end);
3601                         s = multi_end;
3602                     }
3603                     else {
3604                         while (s < multi_end) {
3605                             if (isASCII(*s)) {
3606                                 s++;
3607                                 goto next_iteration;
3608                             }
3609                             else {
3610                                 s += UTF8SKIP(s);
3611                             }
3612                             count++;
3613                         }
3614                     }
3615                 }
3616
3617                 /* The delta is how long the sequence is minus 1 (1 is how long
3618                  * the character that folds to the sequence is) */
3619                 total_count_delta += count - 1;
3620               next_iteration: ;
3621             }
3622
3623             /* We created a temporary folded copy of the string in EXACTFL
3624              * nodes.  Therefore we need to be sure it doesn't go below zero,
3625              * as the real string could be shorter */
3626             if (OP(scan) == EXACTFL) {
3627                 int total_chars = utf8_length((U8*) STRING(scan),
3628                                            (U8*) STRING(scan) + STR_LEN(scan));
3629                 if (total_count_delta > total_chars) {
3630                     total_count_delta = total_chars;
3631                 }
3632             }
3633
3634             *min_subtract += total_count_delta;
3635             Safefree(folded);
3636         }
3637         else if (OP(scan) == EXACTFA) {
3638
3639             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3640              * fold to the ASCII range (and there are no existing ones in the
3641              * upper latin1 range).  But, as outlined in the comments preceding
3642              * this function, we need to flag any occurrences of the sharp s.
3643              * This character forbids trie formation (because of added
3644              * complexity) */
3645             while (s < s_end) {
3646                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3647                     OP(scan) = EXACTFA_NO_TRIE;
3648                     *unfolded_multi_char = TRUE;
3649                     break;
3650                 }
3651                 s++;
3652                 continue;
3653             }
3654         }
3655         else {
3656
3657             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3658              * folds that are all Latin1.  As explained in the comments
3659              * preceding this function, we look also for the sharp s in EXACTF
3660              * and EXACTFL nodes; it can be in the final position.  Otherwise
3661              * we can stop looking 1 byte earlier because have to find at least
3662              * two characters for a multi-fold */
3663             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3664                               ? s_end
3665                               : s_end -1;
3666
3667             while (s < upper) {
3668                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3669                 if (! len) {    /* Not a multi-char fold. */
3670                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3671                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3672                     {
3673                         *unfolded_multi_char = TRUE;
3674                     }
3675                     s++;
3676                     continue;
3677                 }
3678
3679                 if (len == 2
3680                     && isALPHA_FOLD_EQ(*s, 's')
3681                     && isALPHA_FOLD_EQ(*(s+1), 's'))
3682                 {
3683
3684                     /* EXACTF nodes need to know that the minimum length
3685                      * changed so that a sharp s in the string can match this
3686                      * ss in the pattern, but they remain EXACTF nodes, as they
3687                      * won't match this unless the target string is is UTF-8,
3688                      * which we don't know until runtime.  EXACTFL nodes can't
3689                      * transform into EXACTFU nodes */
3690                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3691                         OP(scan) = EXACTFU_SS;
3692                     }
3693                 }
3694
3695                 *min_subtract += len - 1;
3696                 s += len;
3697             }
3698         }
3699     }
3700
3701 #ifdef DEBUGGING
3702     /* Allow dumping but overwriting the collection of skipped
3703      * ops and/or strings with fake optimized ops */
3704     n = scan + NODE_SZ_STR(scan);
3705     while (n <= stop) {
3706         OP(n) = OPTIMIZED;
3707         FLAGS(n) = 0;
3708         NEXT_OFF(n) = 0;
3709         n++;
3710     }
3711 #endif
3712     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3713     return stopnow;
3714 }
3715
3716 /* REx optimizer.  Converts nodes into quicker variants "in place".
3717    Finds fixed substrings.  */
3718
3719 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3720    to the position after last scanned or to NULL. */
3721
3722 #define INIT_AND_WITHP \
3723     assert(!and_withp); \
3724     Newx(and_withp,1, regnode_ssc); \
3725     SAVEFREEPV(and_withp)
3726
3727
3728 static void
3729 S_unwind_scan_frames(pTHX_ const void *p)
3730 {
3731     scan_frame *f= (scan_frame *)p;
3732     do {
3733         scan_frame *n= f->next_frame;
3734         Safefree(f);
3735         f= n;
3736     } while (f);
3737 }
3738
3739
3740 STATIC SSize_t
3741 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3742                         SSize_t *minlenp, SSize_t *deltap,
3743                         regnode *last,
3744                         scan_data_t *data,
3745                         I32 stopparen,
3746                         U32 recursed_depth,
3747                         regnode_ssc *and_withp,
3748                         U32 flags, U32 depth)
3749                         /* scanp: Start here (read-write). */
3750                         /* deltap: Write maxlen-minlen here. */
3751                         /* last: Stop before this one. */
3752                         /* data: string data about the pattern */
3753                         /* stopparen: treat close N as END */
3754                         /* recursed: which subroutines have we recursed into */
3755                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3756 {
3757     /* There must be at least this number of characters to match */
3758     SSize_t min = 0;
3759     I32 pars = 0, code;
3760     regnode *scan = *scanp, *next;
3761     SSize_t delta = 0;
3762     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3763     int is_inf_internal = 0;            /* The studied chunk is infinite */
3764     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3765     scan_data_t data_fake;
3766     SV *re_trie_maxbuff = NULL;
3767     regnode *first_non_open = scan;
3768     SSize_t stopmin = SSize_t_MAX;
3769     scan_frame *frame = NULL;
3770     GET_RE_DEBUG_FLAGS_DECL;
3771
3772     PERL_ARGS_ASSERT_STUDY_CHUNK;
3773
3774
3775     if ( depth == 0 ) {
3776         while (first_non_open && OP(first_non_open) == OPEN)
3777             first_non_open=regnext(first_non_open);
3778     }
3779
3780
3781   fake_study_recurse:
3782     DEBUG_r(
3783         RExC_study_chunk_recursed_count++;
3784     );
3785     DEBUG_OPTIMISE_MORE_r(
3786     {
3787         PerlIO_printf(Perl_debug_log,
3788             "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3789             (int)(depth*2), "", (long)stopparen,
3790             (unsigned long)RExC_study_chunk_recursed_count,
3791             (unsigned long)depth, (unsigned long)recursed_depth,
3792             scan,
3793             last);
3794         if (recursed_depth) {
3795             U32 i;
3796             U32 j;
3797             for ( j = 0 ; j < recursed_depth ; j++ ) {
3798                 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3799                     if (
3800                         PAREN_TEST(RExC_study_chunk_recursed +
3801                                    ( j * RExC_study_chunk_recursed_bytes), i )
3802                         && (
3803                             !j ||
3804                             !PAREN_TEST(RExC_study_chunk_recursed +
3805                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3806                         )
3807                     ) {
3808                         PerlIO_printf(Perl_debug_log," %d",(int)i);
3809                         break;
3810                     }
3811                 }
3812                 if ( j + 1 < recursed_depth ) {
3813                     PerlIO_printf(Perl_debug_log, ",");
3814                 }
3815             }
3816         }
3817         PerlIO_printf(Perl_debug_log,"\n");
3818     }
3819     );
3820     while ( scan && OP(scan) != END && scan < last ){
3821         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3822                                    node length to get a real minimum (because
3823                                    the folded version may be shorter) */
3824         bool unfolded_multi_char = FALSE;
3825         /* Peephole optimizer: */
3826         DEBUG_STUDYDATA("Peep:", data, depth);
3827         DEBUG_PEEP("Peep", scan, depth);
3828
3829
3830         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3831          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3832          * by a different invocation of reg() -- Yves
3833          */
3834         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3835
3836         /* Follow the next-chain of the current node and optimize
3837            away all the NOTHINGs from it.  */
3838         if (OP(scan) != CURLYX) {
3839             const int max = (reg_off_by_arg[OP(scan)]
3840                        ? I32_MAX
3841                        /* I32 may be smaller than U16 on CRAYs! */
3842                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3843             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3844             int noff;
3845             regnode *n = scan;
3846
3847             /* Skip NOTHING and LONGJMP. */
3848             while ((n = regnext(n))
3849                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3850                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3851                    && off + noff < max)
3852                 off += noff;
3853             if (reg_off_by_arg[OP(scan)])
3854                 ARG(scan) = off;
3855             else
3856                 NEXT_OFF(scan) = off;
3857         }
3858
3859         /* The principal pseudo-switch.  Cannot be a switch, since we
3860            look into several different things.  */
3861         if ( OP(scan) == DEFINEP ) {
3862             SSize_t minlen = 0;
3863             SSize_t deltanext = 0;
3864             SSize_t fake_last_close = 0;
3865             I32 f = SCF_IN_DEFINE;
3866
3867             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3868             scan = regnext(scan);
3869             assert( OP(scan) == IFTHEN );
3870             DEBUG_PEEP("expect IFTHEN", scan, depth);
3871
3872             data_fake.last_closep= &fake_last_close;
3873             minlen = *minlenp;
3874             next = regnext(scan);
3875             scan = NEXTOPER(NEXTOPER(scan));
3876             DEBUG_PEEP("scan", scan, depth);
3877             DEBUG_PEEP("next", next, depth);
3878
3879             /* we suppose the run is continuous, last=next...
3880              * NOTE we dont use the return here! */
3881             (void)study_chunk(pRExC_state, &scan, &minlen,
3882                               &deltanext, next, &data_fake, stopparen,
3883                               recursed_depth, NULL, f, depth+1);
3884
3885             scan = next;
3886         } else
3887         if (
3888             OP(scan) == BRANCH  ||
3889             OP(scan) == BRANCHJ ||
3890             OP(scan) == IFTHEN
3891         ) {
3892             next = regnext(scan);
3893             code = OP(scan);
3894
3895             /* The op(next)==code check below is to see if we
3896              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3897              * IFTHEN is special as it might not appear in pairs.
3898              * Not sure whether BRANCH-BRANCHJ is possible, regardless
3899              * we dont handle it cleanly. */
3900             if (OP(next) == code || code == IFTHEN) {
3901                 /* NOTE - There is similar code to this block below for
3902                  * handling TRIE nodes on a re-study.  If you change stuff here
3903                  * check there too. */
3904                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3905                 regnode_ssc accum;
3906                 regnode * const startbranch=scan;
3907
3908                 if (flags & SCF_DO_SUBSTR) {
3909                     /* Cannot merge strings after this. */
3910                     scan_commit(pRExC_state, data, minlenp, is_inf);
3911                 }
3912
3913                 if (flags & SCF_DO_STCLASS)
3914                     ssc_init_zero(pRExC_state, &accum);
3915
3916                 while (OP(scan) == code) {
3917                     SSize_t deltanext, minnext, fake;
3918                     I32 f = 0;
3919                     regnode_ssc this_class;
3920
3921                     DEBUG_PEEP("Branch", scan, depth);
3922
3923                     num++;
3924                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3925                     if (data) {
3926                         data_fake.whilem_c = data->whilem_c;
3927                         data_fake.last_closep = data->last_closep;
3928                     }
3929                     else
3930                         data_fake.last_closep = &fake;
3931
3932                     data_fake.pos_delta = delta;
3933                     next = regnext(scan);
3934
3935                     scan = NEXTOPER(scan); /* everything */
3936                     if (code != BRANCH)    /* everything but BRANCH */
3937                         scan = NEXTOPER(scan);
3938
3939                     if (flags & SCF_DO_STCLASS) {
3940                         ssc_init(pRExC_state, &this_class);
3941                         data_fake.start_class = &this_class;
3942                         f = SCF_DO_STCLASS_AND;
3943                     }
3944                     if (flags & SCF_WHILEM_VISITED_POS)
3945                         f |= SCF_WHILEM_VISITED_POS;
3946
3947                     /* we suppose the run is continuous, last=next...*/
3948                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3949                                       &deltanext, next, &data_fake, stopparen,
3950                                       recursed_depth, NULL, f,depth+1);
3951
3952                     if (min1 > minnext)
3953                         min1 = minnext;
3954                     if (deltanext == SSize_t_MAX) {
3955                         is_inf = is_inf_internal = 1;
3956                         max1 = SSize_t_MAX;
3957                     } else if (max1 < minnext + deltanext)
3958                         max1 = minnext + deltanext;
3959                     scan = next;
3960                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3961                         pars++;
3962                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3963                         if ( stopmin > minnext)
3964                             stopmin = min + min1;
3965                         flags &= ~SCF_DO_SUBSTR;
3966                         if (data)
3967                             data->flags |= SCF_SEEN_ACCEPT;
3968                     }
3969                     if (data) {
3970                         if (data_fake.flags & SF_HAS_EVAL)
3971                             data->flags |= SF_HAS_EVAL;
3972                         data->whilem_c = data_fake.whilem_c;
3973                     }
3974                     if (flags & SCF_DO_STCLASS)
3975                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3976                 }
3977                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3978                     min1 = 0;
3979                 if (flags & SCF_DO_SUBSTR) {
3980                     data->pos_min += min1;
3981                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3982                         data->pos_delta = SSize_t_MAX;
3983                     else
3984                         data->pos_delta += max1 - min1;
3985                     if (max1 != min1 || is_inf)
3986                         data->longest = &(data->longest_float);
3987                 }
3988                 min += min1;
3989                 if (delta == SSize_t_MAX
3990                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3991                     delta = SSize_t_MAX;
3992                 else
3993                     delta += max1 - min1;
3994                 if (flags & SCF_DO_STCLASS_OR) {
3995                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3996                     if (min1) {
3997                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3998                         flags &= ~SCF_DO_STCLASS;
3999                     }
4000                 }
4001                 else if (flags & SCF_DO_STCLASS_AND) {
4002                     if (min1) {
4003                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4004                         flags &= ~SCF_DO_STCLASS;
4005                     }
4006                     else {
4007                         /* Switch to OR mode: cache the old value of
4008                          * data->start_class */
4009                         INIT_AND_WITHP;
4010                         StructCopy(data->start_class, and_withp, regnode_ssc);
4011                         flags &= ~SCF_DO_STCLASS_AND;
4012                         StructCopy(&accum, data->start_class, regnode_ssc);
4013                         flags |= SCF_DO_STCLASS_OR;
4014                     }
4015                 }
4016
4017                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4018                         OP( startbranch ) == BRANCH )
4019                 {
4020                 /* demq.
4021
4022                    Assuming this was/is a branch we are dealing with: 'scan'
4023                    now points at the item that follows the branch sequence,
4024                    whatever it is. We now start at the beginning of the
4025                    sequence and look for subsequences of
4026
4027                    BRANCH->EXACT=>x1
4028                    BRANCH->EXACT=>x2
4029                    tail
4030
4031                    which would be constructed from a pattern like
4032                    /A|LIST|OF|WORDS/
4033
4034                    If we can find such a subsequence we need to turn the first
4035                    element into a trie and then add the subsequent branch exact
4036                    strings to the trie.
4037
4038                    We have two cases
4039
4040                      1. patterns where the whole set of branches can be
4041                         converted.
4042
4043                      2. patterns where only a subset can be converted.
4044
4045                    In case 1 we can replace the whole set with a single regop
4046                    for the trie. In case 2 we need to keep the start and end
4047                    branches so
4048
4049                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4050                      becomes BRANCH TRIE; BRANCH X;
4051
4052                   There is an additional case, that being where there is a
4053                   common prefix, which gets split out into an EXACT like node
4054                   preceding the TRIE node.
4055
4056                   If x(1..n)==tail then we can do a simple trie, if not we make
4057                   a "jump" trie, such that when we match the appropriate word
4058                   we "jump" to the appropriate tail node. Essentially we turn
4059                   a nested if into a case structure of sorts.
4060
4061                 */
4062
4063                     int made=0;
4064                     if (!re_trie_maxbuff) {
4065                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4066                         if (!SvIOK(re_trie_maxbuff))
4067                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4068                     }
4069                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4070                         regnode *cur;
4071                         regnode *first = (regnode *)NULL;
4072                         regnode *last = (regnode *)NULL;
4073                         regnode *tail = scan;
4074                         U8 trietype = 0;
4075                         U32 count=0;
4076
4077                         /* var tail is used because there may be a TAIL
4078                            regop in the way. Ie, the exacts will point to the
4079                            thing following the TAIL, but the last branch will
4080                            point at the TAIL. So we advance tail. If we
4081                            have nested (?:) we may have to move through several
4082                            tails.
4083                          */
4084
4085                         while ( OP( tail ) == TAIL ) {
4086                             /* this is the TAIL generated by (?:) */
4087                             tail = regnext( tail );
4088                         }
4089
4090
4091                         DEBUG_TRIE_COMPILE_r({
4092                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4093                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4094                               (int)depth * 2 + 2, "",
4095                               "Looking for TRIE'able sequences. Tail node is: ",
4096                               SvPV_nolen_const( RExC_mysv )
4097                             );
4098                         });
4099
4100                         /*
4101
4102                             Step through the branches
4103                                 cur represents each branch,
4104                                 noper is the first thing to be matched as part
4105                                       of that branch
4106                                 noper_next is the regnext() of that node.
4107
4108                             We normally handle a case like this
4109                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4110                             support building with NOJUMPTRIE, which restricts
4111                             the trie logic to structures like /FOO|BAR/.
4112
4113                             If noper is a trieable nodetype then the branch is
4114                             a possible optimization target. If we are building
4115                             under NOJUMPTRIE then we require that noper_next is
4116                             the same as scan (our current position in the regex
4117                             program).
4118
4119                             Once we have two or more consecutive such branches
4120                             we can create a trie of the EXACT's contents and
4121                             stitch it in place into the program.
4122
4123                             If the sequence represents all of the branches in
4124                             the alternation we replace the entire thing with a
4125                             single TRIE node.
4126
4127                             Otherwise when it is a subsequence we need to
4128                             stitch it in place and replace only the relevant
4129                             branches. This means the first branch has to remain
4130                             as it is used by the alternation logic, and its
4131                             next pointer, and needs to be repointed at the item
4132                             on the branch chain following the last branch we
4133                             have optimized away.
4134
4135                             This could be either a BRANCH, in which case the
4136                             subsequence is internal, or it could be the item
4137                             following the branch sequence in which case the
4138                             subsequence is at the end (which does not
4139                             necessarily mean the first node is the start of the
4140                             alternation).
4141
4142                             TRIE_TYPE(X) is a define which maps the optype to a
4143                             trietype.
4144
4145                                 optype          |  trietype
4146                                 ----------------+-----------
4147                                 NOTHING         | NOTHING
4148                                 EXACT           | EXACT
4149                                 EXACTFU         | EXACTFU
4150                                 EXACTFU_SS      | EXACTFU
4151                                 EXACTFA         | EXACTFA
4152                                 EXACTL          | EXACTL
4153                                 EXACTFLU8       | EXACTFLU8
4154
4155
4156                         */
4157 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4158                        ? NOTHING                                            \
4159                        : ( EXACT == (X) )                                   \
4160                          ? EXACT                                            \
4161                          : ( EXACTFU == (X) || EXACTFU_SS == (X) )          \
4162                            ? EXACTFU                                        \
4163                            : ( EXACTFA == (X) )                             \
4164                              ? EXACTFA                                      \
4165                              : ( EXACTL == (X) )                            \
4166                                ? EXACTL                                     \
4167                                : ( EXACTFLU8 == (X) )                        \
4168                                  ? EXACTFLU8                                 \
4169                                  : 0 )
4170
4171                         /* dont use tail as the end marker for this traverse */
4172                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4173                             regnode * const noper = NEXTOPER( cur );
4174                             U8 noper_type = OP( noper );
4175                             U8 noper_trietype = TRIE_TYPE( noper_type );
4176 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4177                             regnode * const noper_next = regnext( noper );
4178                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4179                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4180 #endif
4181
4182                             DEBUG_TRIE_COMPILE_r({
4183                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4184                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4185                                    (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4186
4187                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4188                                 PerlIO_printf( Perl_debug_log, " -> %s",
4189                                     SvPV_nolen_const(RExC_mysv));
4190
4191                                 if ( noper_next ) {
4192                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4193                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4194                                     SvPV_nolen_const(RExC_mysv));
4195                                 }
4196                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4197                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4198                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4199                                 );
4200                             });
4201
4202                             /* Is noper a trieable nodetype that can be merged
4203                              * with the current trie (if there is one)? */
4204                             if ( noper_trietype
4205                                   &&
4206                                   (
4207                                         ( noper_trietype == NOTHING)
4208                                         || ( trietype == NOTHING )
4209                                         || ( trietype == noper_trietype )
4210                                   )
4211 #ifdef NOJUMPTRIE
4212                                   && noper_next == tail
4213 #endif
4214                                   && count < U16_MAX)
4215                             {
4216                                 /* Handle mergable triable node Either we are
4217                                  * the first node in a new trieable sequence,
4218                                  * in which case we do some bookkeeping,
4219                                  * otherwise we update the end pointer. */
4220                                 if ( !first ) {
4221                                     first = cur;
4222                                     if ( noper_trietype == NOTHING ) {
4223 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4224                                         regnode * const noper_next = regnext( noper );
4225                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4226                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4227 #endif
4228
4229                                         if ( noper_next_trietype ) {
4230                                             trietype = noper_next_trietype;
4231                                         } else if (noper_next_type)  {
4232                                             /* a NOTHING regop is 1 regop wide.
4233                                              * We need at least two for a trie
4234                                              * so we can't merge this in */
4235                                             first = NULL;
4236                                         }
4237                                     } else {
4238                                         trietype = noper_trietype;
4239                                     }
4240                                 } else {
4241                                     if ( trietype == NOTHING )
4242                                         trietype = noper_trietype;
4243                                     last = cur;
4244                                 }
4245                                 if (first)
4246                                     count++;
4247                             } /* end handle mergable triable node */
4248                             else {
4249                                 /* handle unmergable node -
4250                                  * noper may either be a triable node which can
4251                                  * not be tried together with the current trie,
4252                                  * or a non triable node */
4253                                 if ( last ) {
4254                                     /* If last is set and trietype is not
4255                                      * NOTHING then we have found at least two
4256                                      * triable branch sequences in a row of a
4257                                      * similar trietype so we can turn them
4258                                      * into a trie. If/when we allow NOTHING to
4259                                      * start a trie sequence this condition
4260                                      * will be required, and it isn't expensive
4261                                      * so we leave it in for now. */
4262                                     if ( trietype && trietype != NOTHING )
4263                                         make_trie( pRExC_state,
4264                                                 startbranch, first, cur, tail,
4265                                                 count, trietype, depth+1 );
4266                                     last = NULL; /* note: we clear/update
4267                                                     first, trietype etc below,
4268                                                     so we dont do it here */
4269                                 }
4270                                 if ( noper_trietype
4271 #ifdef NOJUMPTRIE
4272                                      && noper_next == tail
4273 #endif
4274                                 ){
4275                                     /* noper is triable, so we can start a new
4276                                      * trie sequence */
4277                                     count = 1;
4278                                     first = cur;
4279                                     trietype = noper_trietype;
4280                                 } else if (first) {
4281                                     /* if we already saw a first but the
4282                                      * current node is not triable then we have
4283                                      * to reset the first information. */
4284                                     count = 0;
4285                                     first = NULL;
4286                                     trietype = 0;
4287                                 }
4288                             } /* end handle unmergable node */
4289                         } /* loop over branches */
4290                         DEBUG_TRIE_COMPILE_r({
4291                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4292                             PerlIO_printf( Perl_debug_log,
4293                               "%*s- %s (%d) <SCAN FINISHED>\n",
4294                               (int)depth * 2 + 2,
4295                               "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4296
4297                         });
4298                         if ( last && trietype ) {
4299                             if ( trietype != NOTHING ) {
4300                                 /* the last branch of the sequence was part of
4301                                  * a trie, so we have to construct it here
4302                                  * outside of the loop */
4303                                 made= make_trie( pRExC_state, startbranch,
4304                                                  first, scan, tail, count,
4305                                                  trietype, depth+1 );
4306 #ifdef TRIE_STUDY_OPT
4307                                 if ( ((made == MADE_EXACT_TRIE &&
4308                                      startbranch == first)
4309                                      || ( first_non_open == first )) &&
4310                                      depth==0 ) {
4311                                     flags |= SCF_TRIE_RESTUDY;
4312                                     if ( startbranch == first
4313                                          && scan == tail )
4314                                     {
4315                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4316                                     }
4317                                 }
4318 #endif
4319                             } else {
4320                                 /* at this point we know whatever we have is a
4321                                  * NOTHING sequence/branch AND if 'startbranch'
4322                                  * is 'first' then we can turn the whole thing
4323                                  * into a NOTHING
4324                                  */
4325                                 if ( startbranch == first ) {
4326                                     regnode *opt;
4327                                     /* the entire thing is a NOTHING sequence,
4328                                      * something like this: (?:|) So we can
4329                                      * turn it into a plain NOTHING op. */
4330                                     DEBUG_TRIE_COMPILE_r({
4331                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4332                                         PerlIO_printf( Perl_debug_log,
4333                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4334                                           "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4335
4336                                     });
4337                                     OP(startbranch)= NOTHING;
4338                                     NEXT_OFF(startbranch)= tail - startbranch;
4339                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4340                                         OP(opt)= OPTIMIZED;
4341                                 }
4342                             }
4343                         } /* end if ( last) */
4344                     } /* TRIE_MAXBUF is non zero */
4345
4346                 } /* do trie */
4347
4348             }
4349             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4350                 scan = NEXTOPER(NEXTOPER(scan));
4351             } else                      /* single branch is optimized. */
4352                 scan = NEXTOPER(scan);
4353             continue;
4354         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4355             I32 paren = 0;
4356             regnode *start = NULL;
4357             regnode *end = NULL;
4358             U32 my_recursed_depth= recursed_depth;
4359
4360
4361             if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4362                 /* Do setup, note this code has side effects beyond
4363                  * the rest of this block. Specifically setting
4364                  * RExC_recurse[] must happen at least once during
4365                  * study_chunk(). */
4366                 if (OP(scan) == GOSUB) {
4367                     paren = ARG(scan);
4368                     RExC_recurse[ARG2L(scan)] = scan;
4369                     start = RExC_open_parens[paren-1];
4370                     end   = RExC_close_parens[paren-1];
4371                 } else {
4372                     start = RExC_rxi->program + 1;
4373                     end   = RExC_opend;
4374                 }
4375                 /* NOTE we MUST always execute the above code, even
4376                  * if we do nothing with a GOSUB/GOSTART */
4377                 if (
4378                     ( flags & SCF_IN_DEFINE )
4379                     ||
4380                     (
4381                         (is_inf_internal || is_inf || data->flags & SF_IS_INF)
4382                         &&
4383                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4384                     )
4385                 ) {
4386                     /* no need to do anything here if we are in a define. */
4387                     /* or we are after some kind of infinite construct
4388                      * so we can skip recursing into this item.
4389                      * Since it is infinite we will not change the maxlen
4390                      * or delta, and if we miss something that might raise
4391                      * the minlen it will merely pessimise a little.
4392                      *
4393                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4394                      * might result in a minlen of 1 and not of 4,
4395                      * but this doesn't make us mismatch, just try a bit
4396                      * harder than we should.
4397                      * */
4398                     scan= regnext(scan);
4399                     continue;
4400                 }
4401
4402                 if (
4403                     !recursed_depth
4404                     ||
4405                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4406                 ) {
4407                     /* it is quite possible that there are more efficient ways
4408                      * to do this. We maintain a bitmap per level of recursion
4409                      * of which patterns we have entered so we can detect if a
4410                      * pattern creates a possible infinite loop. When we
4411                      * recurse down a level we copy the previous levels bitmap
4412                      * down. When we are at recursion level 0 we zero the top
4413                      * level bitmap. It would be nice to implement a different
4414                      * more efficient way of doing this. In particular the top
4415                      * level bitmap may be unnecessary.
4416                      */
4417                     if (!recursed_depth) {
4418                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4419                     } else {
4420                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4421                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4422                              RExC_study_chunk_recursed_bytes, U8);
4423                     }
4424                     /* we havent recursed into this paren yet, so recurse into it */
4425                     DEBUG_STUDYDATA("set:", data,depth);
4426                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4427                     my_recursed_depth= recursed_depth + 1;
4428                 } else {
4429                     DEBUG_STUDYDATA("inf:", data,depth);
4430                     /* some form of infinite recursion, assume infinite length
4431                      * */
4432                     if (flags & SCF_DO_SUBSTR) {
4433                         scan_commit(pRExC_state, data, minlenp, is_inf);
4434                         data->longest = &(data->longest_float);
4435                     }
4436                     is_inf = is_inf_internal = 1;
4437                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4438                         ssc_anything(data->start_class);
4439                     flags &= ~SCF_DO_STCLASS;
4440
4441                     start= NULL; /* reset start so we dont recurse later on. */
4442                 }
4443             } else {
4444                 paren = stopparen;
4445                 start = scan + 2;
4446                 end = regnext(scan);
4447             }
4448             if (start) {
4449                 scan_frame *newframe;
4450                 assert(end);
4451                 if (!RExC_frame_last) {
4452                     Newxz(newframe, 1, scan_frame);
4453                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4454                     RExC_frame_head= newframe;
4455                     RExC_frame_count++;
4456                 } else if (!RExC_frame_last->next_frame) {
4457                     Newxz(newframe,1,scan_frame);
4458                     RExC_frame_last->next_frame= newframe;
4459                     newframe->prev_frame= RExC_frame_last;
4460                     RExC_frame_count++;
4461                 } else {
4462                     newframe= RExC_frame_last->next_frame;
4463                 }
4464                 RExC_frame_last= newframe;
4465
4466                 newframe->next_regnode = regnext(scan);
4467                 newframe->last_regnode = last;
4468                 newframe->stopparen = stopparen;
4469                 newframe->prev_recursed_depth = recursed_depth;
4470                 newframe->this_prev_frame= frame;
4471
4472                 DEBUG_STUDYDATA("frame-new:",data,depth);
4473                 DEBUG_PEEP("fnew", scan, depth);
4474
4475                 frame = newframe;
4476                 scan =  start;
4477                 stopparen = paren;
4478                 last = end;
4479                 depth = depth + 1;
4480                 recursed_depth= my_recursed_depth;
4481
4482                 continue;
4483             }
4484         }
4485         else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4486             SSize_t l = STR_LEN(scan);
4487             UV uc;
4488             if (UTF) {
4489                 const U8 * const s = (U8*)STRING(scan);
4490                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4491                 l = utf8_length(s, s + l);
4492             } else {
4493                 uc = *((U8*)STRING(scan));
4494             }
4495             min += l;
4496             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4497                 /* The code below prefers earlier match for fixed
4498                    offset, later match for variable offset.  */
4499                 if (data->last_end == -1) { /* Update the start info. */
4500                     data->last_start_min = data->pos_min;
4501                     data->last_start_max = is_inf
4502                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4503                 }
4504                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4505                 if (UTF)
4506                     SvUTF8_on(data->last_found);
4507                 {
4508                     SV * const sv = data->last_found;
4509                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4510                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4511                     if (mg && mg->mg_len >= 0)
4512                         mg->mg_len += utf8_length((U8*)STRING(scan),
4513                                               (U8*)STRING(scan)+STR_LEN(scan));
4514                 }
4515                 data->last_end = data->pos_min + l;
4516                 data->pos_min += l; /* As in the first entry. */
4517                 data->flags &= ~SF_BEFORE_EOL;
4518             }
4519
4520             /* ANDing the code point leaves at most it, and not in locale, and
4521              * can't match null string */
4522             if (flags & SCF_DO_STCLASS_AND) {
4523                 ssc_cp_and(data->start_class, uc);
4524                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4525                 ssc_clear_locale(data->start_class);
4526             }
4527             else if (flags & SCF_DO_STCLASS_OR) {
4528                 ssc_add_cp(data->start_class, uc);
4529                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4530
4531                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4532                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4533             }
4534             flags &= ~SCF_DO_STCLASS;
4535         }
4536         else if (PL_regkind[OP(scan)] == EXACT) {
4537             /* But OP != EXACT!, so is EXACTFish */
4538             SSize_t l = STR_LEN(scan);
4539             const U8 * s = (U8*)STRING(scan);
4540
4541             /* Search for fixed substrings supports EXACT only. */
4542             if (flags & SCF_DO_SUBSTR) {
4543                 assert(data);
4544                 scan_commit(pRExC_state, data, minlenp, is_inf);
4545             }
4546             if (UTF) {
4547                 l = utf8_length(s, s + l);
4548             }
4549             if (unfolded_multi_char) {
4550                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4551             }
4552             min += l - min_subtract;
4553             assert (min >= 0);
4554             delta += min_subtract;
4555             if (flags & SCF_DO_SUBSTR) {
4556                 data->pos_min += l - min_subtract;
4557                 if (data->pos_min < 0) {
4558                     data->pos_min = 0;
4559                 }
4560                 data->pos_delta += min_subtract;
4561                 if (min_subtract) {
4562                     data->longest = &(data->longest_float);
4563                 }
4564             }
4565
4566             if (flags & SCF_DO_STCLASS) {
4567                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4568
4569                 assert(EXACTF_invlist);
4570                 if (flags & SCF_DO_STCLASS_AND) {
4571                     if (OP(scan) != EXACTFL)
4572                         ssc_clear_locale(data->start_class);
4573                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4574                     ANYOF_POSIXL_ZERO(data->start_class);
4575                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4576                 }
4577                 else {  /* SCF_DO_STCLASS_OR */
4578                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4579                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4580
4581                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4582                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4583                 }
4584                 flags &= ~SCF_DO_STCLASS;
4585                 SvREFCNT_dec(EXACTF_invlist);
4586             }
4587         }
4588         else if (REGNODE_VARIES(OP(scan))) {
4589             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4590             I32 fl = 0, f = flags;
4591             regnode * const oscan = scan;
4592             regnode_ssc this_class;
4593             regnode_ssc *oclass = NULL;
4594             I32 next_is_eval = 0;
4595
4596             switch (PL_regkind[OP(scan)]) {
4597             case WHILEM:                /* End of (?:...)* . */
4598                 scan = NEXTOPER(scan);
4599                 goto finish;
4600             case PLUS:
4601                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4602                     next = NEXTOPER(scan);
4603                     if (OP(next) == EXACT
4604                         || OP(next) == EXACTL
4605                         || (flags & SCF_DO_STCLASS))
4606                     {
4607                         mincount = 1;
4608                         maxcount = REG_INFTY;
4609                         next = regnext(scan);
4610                         scan = NEXTOPER(scan);
4611                         goto do_curly;
4612                     }
4613                 }
4614                 if (flags & SCF_DO_SUBSTR)
4615                     data->pos_min++;
4616                 min++;
4617                 /* FALLTHROUGH */
4618             case STAR:
4619                 if (flags & SCF_DO_STCLASS) {
4620                     mincount = 0;
4621                     maxcount = REG_INFTY;
4622                     next = regnext(scan);
4623                     scan = NEXTOPER(scan);
4624                     goto do_curly;
4625                 }
4626                 if (flags & SCF_DO_SUBSTR) {
4627                     scan_commit(pRExC_state, data, minlenp, is_inf);
4628                     /* Cannot extend fixed substrings */
4629                     data->longest = &(data->longest_float);
4630                 }
4631                 is_inf = is_inf_internal = 1;
4632                 scan = regnext(scan);
4633                 goto optimize_curly_tail;
4634             case CURLY:
4635                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4636                     && (scan->flags == stopparen))
4637                 {
4638                     mincount = 1;
4639                     maxcount = 1;
4640                 } else {
4641                     mincount = ARG1(scan);
4642                     maxcount = ARG2(scan);
4643                 }
4644                 next = regnext(scan);
4645                 if (OP(scan) == CURLYX) {
4646                     I32 lp = (data ? *(data->last_closep) : 0);
4647                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4648                 }
4649                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4650                 next_is_eval = (OP(scan) == EVAL);
4651               do_curly:
4652                 if (flags & SCF_DO_SUBSTR) {
4653                     if (mincount == 0)
4654                         scan_commit(pRExC_state, data, minlenp, is_inf);
4655                     /* Cannot extend fixed substrings */
4656                     pos_before = data->pos_min;
4657                 }
4658                 if (data) {
4659                     fl = data->flags;
4660                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4661                     if (is_inf)
4662                         data->flags |= SF_IS_INF;
4663                 }
4664                 if (flags & SCF_DO_STCLASS) {
4665                     ssc_init(pRExC_state, &this_class);
4666                     oclass = data->start_class;
4667                     data->start_class = &this_class;
4668                     f |= SCF_DO_STCLASS_AND;
4669                     f &= ~SCF_DO_STCLASS_OR;
4670                 }
4671                 /* Exclude from super-linear cache processing any {n,m}
4672                    regops for which the combination of input pos and regex
4673                    pos is not enough information to determine if a match
4674                    will be possible.
4675
4676                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4677                    regex pos at the \s*, the prospects for a match depend not
4678                    only on the input position but also on how many (bar\s*)
4679                    repeats into the {4,8} we are. */
4680                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4681                     f &= ~SCF_WHILEM_VISITED_POS;
4682
4683                 /* This will finish on WHILEM, setting scan, or on NULL: */
4684                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4685                                   last, data, stopparen, recursed_depth, NULL,
4686                                   (mincount == 0
4687                                    ? (f & ~SCF_DO_SUBSTR)
4688                                    : f)
4689                                   ,depth+1);
4690
4691                 if (flags & SCF_DO_STCLASS)
4692                     data->start_class = oclass;
4693                 if (mincount == 0 || minnext == 0) {
4694                     if (flags & SCF_DO_STCLASS_OR) {
4695                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4696                     }
4697                     else if (flags & SCF_DO_STCLASS_AND) {
4698                         /* Switch to OR mode: cache the old value of
4699                          * data->start_class */
4700                         INIT_AND_WITHP;
4701                         StructCopy(data->start_class, and_withp, regnode_ssc);
4702                         flags &= ~SCF_DO_STCLASS_AND;
4703                         StructCopy(&this_class, data->start_class, regnode_ssc);
4704                         flags |= SCF_DO_STCLASS_OR;
4705                         ANYOF_FLAGS(data->start_class)
4706                                                 |= SSC_MATCHES_EMPTY_STRING;
4707                     }
4708                 } else {                /* Non-zero len */
4709                     if (flags & SCF_DO_STCLASS_OR) {
4710                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4711                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4712                     }
4713                     else if (flags & SCF_DO_STCLASS_AND)
4714                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4715                     flags &= ~SCF_DO_STCLASS;
4716                 }
4717                 if (!scan)              /* It was not CURLYX, but CURLY. */
4718                     scan = next;
4719                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4720                     /* ? quantifier ok, except for (?{ ... }) */
4721                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4722                     && (minnext == 0) && (deltanext == 0)
4723                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4724                     && maxcount <= REG_INFTY/3) /* Complement check for big
4725                                                    count */
4726                 {
4727                     /* Fatal warnings may leak the regexp without this: */
4728                     SAVEFREESV(RExC_rx_sv);
4729                     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4730                         "Quantifier unexpected on zero-length expression "
4731                         "in regex m/%"UTF8f"/",
4732                          UTF8fARG(UTF, RExC_end - RExC_precomp,
4733                                   RExC_precomp));
4734                     (void)ReREFCNT_inc(RExC_rx_sv);
4735                 }
4736
4737                 min += minnext * mincount;
4738                 is_inf_internal |= deltanext == SSize_t_MAX
4739                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4740                 is_inf |= is_inf_internal;
4741                 if (is_inf) {
4742                     delta = SSize_t_MAX;
4743                 } else {
4744                     delta += (minnext + deltanext) * maxcount
4745                              - minnext * mincount;
4746                 }
4747                 /* Try powerful optimization CURLYX => CURLYN. */
4748                 if (  OP(oscan) == CURLYX && data
4749                       && data->flags & SF_IN_PAR
4750                       && !(data->flags & SF_HAS_EVAL)
4751                       && !deltanext && minnext == 1 ) {
4752                     /* Try to optimize to CURLYN.  */
4753                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4754                     regnode * const nxt1 = nxt;
4755 #ifdef DEBUGGING
4756                     regnode *nxt2;
4757 #endif
4758
4759                     /* Skip open. */
4760                     nxt = regnext(nxt);
4761                     if (!REGNODE_SIMPLE(OP(nxt))
4762                         && !(PL_regkind[OP(nxt)] == EXACT
4763                              && STR_LEN(nxt) == 1))
4764                         goto nogo;
4765 #ifdef DEBUGGING
4766                     nxt2 = nxt;
4767 #endif
4768                     nxt = regnext(nxt);
4769                     if (OP(nxt) != CLOSE)
4770                         goto nogo;
4771                     if (RExC_open_parens) {
4772                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4773                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4774                     }
4775                     /* Now we know that nxt2 is the only contents: */
4776                     oscan->flags = (U8)ARG(nxt);
4777                     OP(oscan) = CURLYN;
4778                     OP(nxt1) = NOTHING; /* was OPEN. */
4779
4780 #ifdef DEBUGGING
4781                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4782                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4783                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4784                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4785                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4786                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4787 #endif
4788                 }
4789               nogo:
4790
4791                 /* Try optimization CURLYX => CURLYM. */
4792                 if (  OP(oscan) == CURLYX && data
4793                       && !(data->flags & SF_HAS_PAR)
4794                       && !(data->flags & SF_HAS_EVAL)
4795                       && !deltanext     /* atom is fixed width */
4796                       && minnext != 0   /* CURLYM can't handle zero width */
4797
4798                          /* Nor characters whose fold at run-time may be
4799                           * multi-character */
4800                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4801                 ) {
4802                     /* XXXX How to optimize if data == 0? */
4803                     /* Optimize to a simpler form.  */
4804                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4805                     regnode *nxt2;
4806
4807                     OP(oscan) = CURLYM;
4808                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4809                             && (OP(nxt2) != WHILEM))
4810                         nxt = nxt2;
4811                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4812                     /* Need to optimize away parenths. */
4813                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4814                         /* Set the parenth number.  */
4815                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4816
4817                         oscan->flags = (U8)ARG(nxt);
4818                         if (RExC_open_parens) {
4819                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4820                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4821                         }
4822                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4823                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4824
4825 #ifdef DEBUGGING
4826                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4827                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4828                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4829                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4830 #endif
4831 #if 0
4832                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4833                             regnode *nnxt = regnext(nxt1);
4834                             if (nnxt == nxt) {
4835                                 if (reg_off_by_arg[OP(nxt1)])
4836                                     ARG_SET(nxt1, nxt2 - nxt1);
4837                                 else if (nxt2 - nxt1 < U16_MAX)
4838                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4839                                 else
4840                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4841                             }
4842                             nxt1 = nnxt;
4843                         }
4844 #endif
4845                         /* Optimize again: */
4846                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4847                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4848                     }
4849                     else
4850                         oscan->flags = 0;
4851                 }
4852                 else if ((OP(oscan) == CURLYX)
4853                          && (flags & SCF_WHILEM_VISITED_POS)
4854                          /* See the comment on a similar expression above.
4855                             However, this time it's not a subexpression
4856                             we care about, but the expression itself. */
4857                          && (maxcount == REG_INFTY)
4858                          && data && ++data->whilem_c < 16) {
4859                     /* This stays as CURLYX, we can put the count/of pair. */
4860                     /* Find WHILEM (as in regexec.c) */
4861                     regnode *nxt = oscan + NEXT_OFF(oscan);
4862
4863                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4864                         nxt += ARG(nxt);
4865                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4866                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4867                 }
4868                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4869                     pars++;
4870                 if (flags & SCF_DO_SUBSTR) {
4871                     SV *last_str = NULL;
4872                     STRLEN last_chrs = 0;
4873                     int counted = mincount != 0;
4874
4875                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4876                                                                   string. */
4877                         SSize_t b = pos_before >= data->last_start_min
4878                             ? pos_before : data->last_start_min;
4879                         STRLEN l;
4880                         const char * const s = SvPV_const(data->last_found, l);
4881                         SSize_t old = b - data->last_start_min;
4882
4883                         if (UTF)
4884                             old = utf8_hop((U8*)s, old) - (U8*)s;
4885                         l -= old;
4886                         /* Get the added string: */
4887                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4888                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4889                                             (U8*)(s + old + l)) : l;
4890                         if (deltanext == 0 && pos_before == b) {
4891                             /* What was added is a constant string */
4892                             if (mincount > 1) {
4893
4894                                 SvGROW(last_str, (mincount * l) + 1);
4895                                 repeatcpy(SvPVX(last_str) + l,
4896                                           SvPVX_const(last_str), l,
4897                                           mincount - 1);
4898                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4899                                 /* Add additional parts. */
4900                                 SvCUR_set(data->last_found,
4901                                           SvCUR(data->last_found) - l);
4902                                 sv_catsv(data->last_found, last_str);
4903                                 {
4904                                     SV * sv = data->last_found;
4905                                     MAGIC *mg =
4906                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4907                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4908                                     if (mg && mg->mg_len >= 0)
4909                                         mg->mg_len += last_chrs * (mincount-1);
4910                                 }
4911                                 last_chrs *= mincount;
4912                                 data->last_end += l * (mincount - 1);
4913                             }
4914                         } else {
4915                             /* start offset must point into the last copy */
4916                             data->last_start_min += minnext * (mincount - 1);
4917                             data->last_start_max =
4918                               is_inf
4919                                ? SSize_t_MAX
4920                                : data->last_start_max +
4921                                  (maxcount - 1) * (minnext + data->pos_delta);
4922                         }
4923                     }
4924                     /* It is counted once already... */
4925                     data->pos_min += minnext * (mincount - counted);
4926 #if 0
4927 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4928                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4929                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
4930     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4931     (UV)mincount);
4932 if (deltanext != SSize_t_MAX)
4933 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4934     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4935           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4936 #endif
4937                     if (deltanext == SSize_t_MAX
4938                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4939                         data->pos_delta = SSize_t_MAX;
4940                     else
4941                         data->pos_delta += - counted * deltanext +
4942                         (minnext + deltanext) * maxcount - minnext * mincount;
4943                     if (mincount != maxcount) {
4944                          /* Cannot extend fixed substrings found inside
4945                             the group.  */
4946                         scan_commit(pRExC_state, data, minlenp, is_inf);
4947                         if (mincount && last_str) {
4948                             SV * const sv = data->last_found;
4949                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4950                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4951
4952                             if (mg)
4953                                 mg->mg_len = -1;
4954                             sv_setsv(sv, last_str);
4955                             data->last_end = data->pos_min;
4956                             data->last_start_min = data->pos_min - last_chrs;
4957                             data->last_start_max = is_inf
4958                                 ? SSize_t_MAX
4959                                 : data->pos_min + data->pos_delta - last_chrs;
4960                         }
4961                         data->longest = &(data->longest_float);
4962                     }
4963                     SvREFCNT_dec(last_str);
4964                 }
4965                 if (data && (fl & SF_HAS_EVAL))
4966                     data->flags |= SF_HAS_EVAL;
4967               optimize_curly_tail:
4968                 if (OP(oscan) != CURLYX) {
4969                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4970                            && NEXT_OFF(next))
4971                         NEXT_OFF(oscan) += NEXT_OFF(next);
4972                 }
4973                 continue;
4974
4975             default:
4976 #ifdef DEBUGGING
4977                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4978                                                                     OP(scan));
4979 #endif
4980             case REF:
4981             case CLUMP:
4982                 if (flags & SCF_DO_SUBSTR) {
4983                     /* Cannot expect anything... */
4984                     scan_commit(pRExC_state, data, minlenp, is_inf);
4985                     data->longest = &(data->longest_float);
4986                 }
4987                 is_inf = is_inf_internal = 1;
4988                 if (flags & SCF_DO_STCLASS_OR) {
4989                     if (OP(scan) == CLUMP) {
4990                         /* Actually is any start char, but very few code points
4991                          * aren't start characters */
4992                         ssc_match_all_cp(data->start_class);
4993                     }
4994                     else {
4995                         ssc_anything(data->start_class);
4996                     }
4997                 }
4998                 flags &= ~SCF_DO_STCLASS;
4999                 break;
5000             }
5001         }
5002         else if (OP(scan) == LNBREAK) {
5003             if (flags & SCF_DO_STCLASS) {
5004                 if (flags & SCF_DO_STCLASS_AND) {
5005                     ssc_intersection(data->start_class,
5006                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5007                     ssc_clear_locale(data->start_class);
5008                     ANYOF_FLAGS(data->start_class)
5009                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5010                 }
5011                 else if (flags & SCF_DO_STCLASS_OR) {
5012                     ssc_union(data->start_class,
5013                               PL_XPosix_ptrs[_CC_VERTSPACE],
5014                               FALSE);
5015                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5016
5017                     /* See commit msg for
5018                      * 749e076fceedeb708a624933726e7989f2302f6a */
5019                     ANYOF_FLAGS(data->start_class)
5020                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5021                 }
5022                 flags &= ~SCF_DO_STCLASS;
5023             }
5024             min++;
5025             if (delta != SSize_t_MAX)
5026                 delta++;    /* Because of the 2 char string cr-lf */
5027             if (flags & SCF_DO_SUBSTR) {
5028                 /* Cannot expect anything... */
5029                 scan_commit(pRExC_state, data, minlenp, is_inf);
5030                 data->pos_min += 1;
5031                 data->pos_delta += 1;
5032                 data->longest = &(data->longest_float);
5033             }
5034         }
5035         else if (REGNODE_SIMPLE(OP(scan))) {
5036
5037             if (flags & SCF_DO_SUBSTR) {
5038                 scan_commit(pRExC_state, data, minlenp, is_inf);
5039                 data->pos_min++;
5040             }
5041             min++;
5042             if (flags & SCF_DO_STCLASS) {
5043                 bool invert = 0;
5044                 SV* my_invlist = NULL;
5045                 U8 namedclass;
5046
5047                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5048                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5049
5050                 /* Some of the logic below assumes that switching
5051                    locale on will only add false positives. */
5052                 switch (OP(scan)) {
5053
5054                 default:
5055 #ifdef DEBUGGING
5056                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5057                                                                      OP(scan));
5058 #endif
5059                 case CANY:
5060                 case SANY:
5061                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5062                         ssc_match_all_cp(data->start_class);
5063                     break;
5064
5065                 case REG_ANY:
5066                     {
5067                         SV* REG_ANY_invlist = _new_invlist(2);
5068                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5069                                                             '\n');
5070                         if (flags & SCF_DO_STCLASS_OR) {
5071                             ssc_union(data->start_class,
5072                                       REG_ANY_invlist,
5073                                       TRUE /* TRUE => invert, hence all but \n
5074                                             */
5075                                       );
5076                         }
5077                         else if (flags & SCF_DO_STCLASS_AND) {
5078                             ssc_intersection(data->start_class,
5079                                              REG_ANY_invlist,
5080                                              TRUE  /* TRUE => invert */
5081                                              );
5082                             ssc_clear_locale(data->start_class);
5083                         }
5084                         SvREFCNT_dec_NN(REG_ANY_invlist);
5085                     }
5086                     break;
5087
5088                 case ANYOFL:
5089                 case ANYOF:
5090                     if (flags & SCF_DO_STCLASS_AND)
5091                         ssc_and(pRExC_state, data->start_class,
5092                                 (regnode_charclass *) scan);
5093                     else
5094                         ssc_or(pRExC_state, data->start_class,
5095                                                           (regnode_charclass *) scan);
5096                     break;
5097
5098                 case NPOSIXL:
5099                     invert = 1;
5100                     /* FALLTHROUGH */
5101
5102                 case POSIXL:
5103                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5104                     if (flags & SCF_DO_STCLASS_AND) {
5105                         bool was_there = cBOOL(
5106                                           ANYOF_POSIXL_TEST(data->start_class,
5107                                                                  namedclass));
5108                         ANYOF_POSIXL_ZERO(data->start_class);
5109                         if (was_there) {    /* Do an AND */
5110                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5111                         }
5112                         /* No individual code points can now match */
5113                         data->start_class->invlist
5114                                                 = sv_2mortal(_new_invlist(0));
5115                     }
5116                     else {
5117                         int complement = namedclass + ((invert) ? -1 : 1);
5118
5119                         assert(flags & SCF_DO_STCLASS_OR);
5120
5121                         /* If the complement of this class was already there,
5122                          * the result is that they match all code points,
5123                          * (\d + \D == everything).  Remove the classes from
5124                          * future consideration.  Locale is not relevant in
5125                          * this case */
5126                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5127                             ssc_match_all_cp(data->start_class);
5128                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5129                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5130                         }
5131                         else {  /* The usual case; just add this class to the
5132                                    existing set */
5133                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5134                         }
5135                     }
5136                     break;
5137
5138                 case NPOSIXA:   /* For these, we always know the exact set of
5139                                    what's matched */
5140                     invert = 1;
5141                     /* FALLTHROUGH */
5142                 case POSIXA:
5143                     if (FLAGS(scan) == _CC_ASCII) {
5144                         my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5145                     }
5146                     else {
5147                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5148                                               PL_XPosix_ptrs[_CC_ASCII],
5149                                               &my_invlist);
5150                     }
5151                     goto join_posix;
5152
5153                 case NPOSIXD:
5154                 case NPOSIXU:
5155                     invert = 1;
5156                     /* FALLTHROUGH */
5157                 case POSIXD:
5158                 case POSIXU:
5159                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5160
5161                     /* NPOSIXD matches all upper Latin1 code points unless the
5162                      * target string being matched is UTF-8, which is
5163                      * unknowable until match time.  Since we are going to
5164                      * invert, we want to get rid of all of them so that the
5165                      * inversion will match all */
5166                     if (OP(scan) == NPOSIXD) {
5167                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5168                                           &my_invlist);
5169                     }
5170
5171                   join_posix:
5172
5173                     if (flags & SCF_DO_STCLASS_AND) {
5174                         ssc_intersection(data->start_class, my_invlist, invert);
5175                         ssc_clear_locale(data->start_class);
5176                     }
5177                     else {
5178                         assert(flags & SCF_DO_STCLASS_OR);
5179                         ssc_union(data->start_class, my_invlist, invert);
5180                     }
5181                     SvREFCNT_dec(my_invlist);
5182                 }
5183                 if (flags & SCF_DO_STCLASS_OR)
5184                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5185                 flags &= ~SCF_DO_STCLASS;
5186             }
5187         }
5188         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5189             data->flags |= (OP(scan) == MEOL
5190                             ? SF_BEFORE_MEOL
5191                             : SF_BEFORE_SEOL);
5192             scan_commit(pRExC_state, data, minlenp, is_inf);
5193
5194         }
5195         else if (  PL_regkind[OP(scan)] == BRANCHJ
5196                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5197                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5198                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5199         {
5200             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5201                 || OP(scan) == UNLESSM )
5202             {
5203                 /* Negative Lookahead/lookbehind
5204                    In this case we can't do fixed string optimisation.
5205                 */
5206
5207                 SSize_t deltanext, minnext, fake = 0;
5208                 regnode *nscan;
5209                 regnode_ssc intrnl;
5210                 int f = 0;
5211
5212                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5213                 if (data) {
5214                     data_fake.whilem_c = data->whilem_c;
5215                     data_fake.last_closep = data->last_closep;
5216                 }
5217                 else
5218                     data_fake.last_closep = &fake;
5219                 data_fake.pos_delta = delta;
5220                 if ( flags & SCF_DO_STCLASS && !scan->flags
5221                      && OP(scan) == IFMATCH ) { /* Lookahead */
5222                     ssc_init(pRExC_state, &intrnl);
5223                     data_fake.start_class = &intrnl;
5224                     f |= SCF_DO_STCLASS_AND;
5225                 }
5226                 if (flags & SCF_WHILEM_VISITED_POS)
5227                     f |= SCF_WHILEM_VISITED_POS;
5228                 next = regnext(scan);
5229                 nscan = NEXTOPER(NEXTOPER(scan));
5230                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5231                                       last, &data_fake, stopparen,
5232                                       recursed_depth, NULL, f, depth+1);
5233                 if (scan->flags) {
5234                     if (deltanext) {
5235                         FAIL("Variable length lookbehind not implemented");
5236                     }
5237                     else if (minnext > (I32)U8_MAX) {
5238                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5239                               (UV)U8_MAX);
5240                     }
5241                     scan->flags = (U8)minnext;
5242                 }
5243                 if (data) {
5244                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5245                         pars++;
5246                     if (data_fake.flags & SF_HAS_EVAL)
5247                         data->flags |= SF_HAS_EVAL;
5248                     data->whilem_c = data_fake.whilem_c;
5249                 }
5250                 if (f & SCF_DO_STCLASS_AND) {
5251                     if (flags & SCF_DO_STCLASS_OR) {
5252                         /* OR before, AND after: ideally we would recurse with
5253                          * data_fake to get the AND applied by study of the
5254                          * remainder of the pattern, and then derecurse;
5255                          * *** HACK *** for now just treat as "no information".
5256                          * See [perl #56690].
5257                          */
5258                         ssc_init(pRExC_state, data->start_class);
5259                     }  else {
5260                         /* AND before and after: combine and continue.  These
5261                          * assertions are zero-length, so can match an EMPTY
5262                          * string */
5263                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5264                         ANYOF_FLAGS(data->start_class)
5265                                                    |= SSC_MATCHES_EMPTY_STRING;
5266                     }
5267                 }
5268             }
5269 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5270             else {
5271                 /* Positive Lookahead/lookbehind
5272                    In this case we can do fixed string optimisation,
5273                    but we must be careful about it. Note in the case of
5274                    lookbehind the positions will be offset by the minimum
5275                    length of the pattern, something we won't know about
5276                    until after the recurse.
5277                 */
5278                 SSize_t deltanext, fake = 0;
5279                 regnode *nscan;
5280                 regnode_ssc intrnl;
5281                 int f = 0;
5282                 /* We use SAVEFREEPV so that when the full compile
5283                     is finished perl will clean up the allocated
5284                     minlens when it's all done. This way we don't
5285                     have to worry about freeing them when we know
5286                     they wont be used, which would be a pain.
5287                  */
5288                 SSize_t *minnextp;
5289                 Newx( minnextp, 1, SSize_t );
5290                 SAVEFREEPV(minnextp);
5291
5292                 if (data) {
5293                     StructCopy(data, &data_fake, scan_data_t);
5294                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5295                         f |= SCF_DO_SUBSTR;
5296                         if (scan->flags)
5297                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5298                         data_fake.last_found=newSVsv(data->last_found);
5299                     }
5300                 }
5301                 else
5302                     data_fake.last_closep = &fake;
5303                 data_fake.flags = 0;
5304                 data_fake.pos_delta = delta;
5305                 if (is_inf)
5306                     data_fake.flags |= SF_IS_INF;
5307                 if ( flags & SCF_DO_STCLASS && !scan->flags
5308                      && OP(scan) == IFMATCH ) { /* Lookahead */
5309                     ssc_init(pRExC_state, &intrnl);
5310                     data_fake.start_class = &intrnl;
5311                     f |= SCF_DO_STCLASS_AND;
5312                 }
5313                 if (flags & SCF_WHILEM_VISITED_POS)
5314                     f |= SCF_WHILEM_VISITED_POS;
5315                 next = regnext(scan);
5316                 nscan = NEXTOPER(NEXTOPER(scan));
5317
5318                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5319                                         &deltanext, last, &data_fake,
5320                                         stopparen, recursed_depth, NULL,
5321                                         f,depth+1);
5322                 if (scan->flags) {
5323                     if (deltanext) {
5324                         FAIL("Variable length lookbehind not implemented");
5325                     }
5326                     else if (*minnextp > (I32)U8_MAX) {
5327                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5328                               (UV)U8_MAX);
5329                     }
5330                     scan->flags = (U8)*minnextp;
5331                 }
5332
5333                 *minnextp += min;
5334
5335                 if (f & SCF_DO_STCLASS_AND) {
5336                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5337                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5338                 }
5339                 if (data) {
5340                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5341                         pars++;
5342                     if (data_fake.flags & SF_HAS_EVAL)
5343                         data->flags |= SF_HAS_EVAL;
5344                     data->whilem_c = data_fake.whilem_c;
5345                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5346                         if (RExC_rx->minlen<*minnextp)
5347                             RExC_rx->minlen=*minnextp;
5348                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5349                         SvREFCNT_dec_NN(data_fake.last_found);
5350
5351                         if ( data_fake.minlen_fixed != minlenp )
5352                         {
5353                             data->offset_fixed= data_fake.offset_fixed;
5354                             data->minlen_fixed= data_fake.minlen_fixed;
5355                             data->lookbehind_fixed+= scan->flags;
5356                         }
5357                         if ( data_fake.minlen_float != minlenp )
5358                         {
5359                             data->minlen_float= data_fake.minlen_float;
5360                             data->offset_float_min=data_fake.offset_float_min;
5361                             data->offset_float_max=data_fake.offset_float_max;
5362                             data->lookbehind_float+= scan->flags;
5363                         }
5364                     }
5365                 }
5366             }
5367 #endif
5368         }
5369         else if (OP(scan) == OPEN) {
5370             if (stopparen != (I32)ARG(scan))
5371                 pars++;
5372         }
5373         else if (OP(scan) == CLOSE) {
5374             if (stopparen == (I32)ARG(scan)) {
5375                 break;
5376             }
5377             if ((I32)ARG(scan) == is_par) {
5378                 next = regnext(scan);
5379
5380                 if ( next && (OP(next) != WHILEM) && next < last)
5381                     is_par = 0;         /* Disable optimization */
5382             }
5383             if (data)
5384                 *(data->last_closep) = ARG(scan);
5385         }
5386         else if (OP(scan) == EVAL) {
5387                 if (data)
5388                     data->flags |= SF_HAS_EVAL;
5389         }
5390         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5391             if (flags & SCF_DO_SUBSTR) {
5392                 scan_commit(pRExC_state, data, minlenp, is_inf);
5393                 flags &= ~SCF_DO_SUBSTR;
5394             }
5395             if (data && OP(scan)==ACCEPT) {
5396                 data->flags |= SCF_SEEN_ACCEPT;
5397                 if (stopmin > min)
5398                     stopmin = min;
5399             }
5400         }
5401         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5402         {
5403                 if (flags & SCF_DO_SUBSTR) {
5404                     scan_commit(pRExC_state, data, minlenp, is_inf);
5405                     data->longest = &(data->longest_float);
5406                 }
5407                 is_inf = is_inf_internal = 1;
5408                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5409                     ssc_anything(data->start_class);
5410                 flags &= ~SCF_DO_STCLASS;
5411         }
5412         else if (OP(scan) == GPOS) {
5413             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5414                 !(delta || is_inf || (data && data->pos_delta)))
5415             {
5416                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5417                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5418                 if (RExC_rx->gofs < (STRLEN)min)
5419                     RExC_rx->gofs = min;
5420             } else {
5421                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5422                 RExC_rx->gofs = 0;
5423             }
5424         }
5425 #ifdef TRIE_STUDY_OPT
5426 #ifdef FULL_TRIE_STUDY
5427         else if (PL_regkind[OP(scan)] == TRIE) {
5428             /* NOTE - There is similar code to this block above for handling
5429                BRANCH nodes on the initial study.  If you change stuff here
5430                check there too. */
5431             regnode *trie_node= scan;
5432             regnode *tail= regnext(scan);
5433             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5434             SSize_t max1 = 0, min1 = SSize_t_MAX;
5435             regnode_ssc accum;
5436
5437             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5438                 /* Cannot merge strings after this. */
5439                 scan_commit(pRExC_state, data, minlenp, is_inf);
5440             }
5441             if (flags & SCF_DO_STCLASS)
5442                 ssc_init_zero(pRExC_state, &accum);
5443
5444             if (!trie->jump) {
5445                 min1= trie->minlen;
5446                 max1= trie->maxlen;
5447             } else {
5448                 const regnode *nextbranch= NULL;
5449                 U32 word;
5450
5451                 for ( word=1 ; word <= trie->wordcount ; word++)
5452                 {
5453                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5454                     regnode_ssc this_class;
5455
5456                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5457                     if (data) {
5458                         data_fake.whilem_c = data->whilem_c;
5459                         data_fake.last_closep = data->last_closep;
5460                     }
5461                     else
5462                         data_fake.last_closep = &fake;
5463                     data_fake.pos_delta = delta;
5464                     if (flags & SCF_DO_STCLASS) {
5465                         ssc_init(pRExC_state, &this_class);
5466                         data_fake.start_class = &this_class;
5467                         f = SCF_DO_STCLASS_AND;
5468                     }
5469                     if (flags & SCF_WHILEM_VISITED_POS)
5470                         f |= SCF_WHILEM_VISITED_POS;
5471
5472                     if (trie->jump[word]) {
5473                         if (!nextbranch)
5474                             nextbranch = trie_node + trie->jump[0];
5475                         scan= trie_node + trie->jump[word];
5476                         /* We go from the jump point to the branch that follows
5477                            it. Note this means we need the vestigal unused
5478                            branches even though they arent otherwise used. */
5479                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5480                             &deltanext, (regnode *)nextbranch, &data_fake,
5481                             stopparen, recursed_depth, NULL, f,depth+1);
5482                     }
5483                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5484                         nextbranch= regnext((regnode*)nextbranch);
5485
5486                     if (min1 > (SSize_t)(minnext + trie->minlen))
5487                         min1 = minnext + trie->minlen;
5488                     if (deltanext == SSize_t_MAX) {
5489                         is_inf = is_inf_internal = 1;
5490                         max1 = SSize_t_MAX;
5491                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5492                         max1 = minnext + deltanext + trie->maxlen;
5493
5494                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5495                         pars++;
5496                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5497                         if ( stopmin > min + min1)
5498                             stopmin = min + min1;
5499                         flags &= ~SCF_DO_SUBSTR;
5500                         if (data)
5501                             data->flags |= SCF_SEEN_ACCEPT;
5502                     }
5503                     if (data) {
5504                         if (data_fake.flags & SF_HAS_EVAL)
5505                             data->flags |= SF_HAS_EVAL;
5506                         data->whilem_c = data_fake.whilem_c;
5507                     }
5508                     if (flags & SCF_DO_STCLASS)
5509                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5510                 }
5511             }
5512             if (flags & SCF_DO_SUBSTR) {
5513                 data->pos_min += min1;
5514                 data->pos_delta += max1 - min1;
5515                 if (max1 != min1 || is_inf)
5516                     data->longest = &(data->longest_float);
5517             }
5518             min += min1;
5519             if (delta != SSize_t_MAX)
5520                 delta += max1 - min1;
5521             if (flags & SCF_DO_STCLASS_OR) {
5522                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5523                 if (min1) {
5524                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5525                     flags &= ~SCF_DO_STCLASS;
5526                 }
5527             }
5528             else if (flags & SCF_DO_STCLASS_AND) {
5529                 if (min1) {
5530                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5531                     flags &= ~SCF_DO_STCLASS;
5532                 }
5533                 else {
5534                     /* Switch to OR mode: cache the old value of
5535                      * data->start_class */
5536                     INIT_AND_WITHP;
5537                     StructCopy(data->start_class, and_withp, regnode_ssc);
5538                     flags &= ~SCF_DO_STCLASS_AND;
5539                     StructCopy(&accum, data->start_class, regnode_ssc);
5540                     flags |= SCF_DO_STCLASS_OR;
5541                 }
5542             }
5543             scan= tail;
5544             continue;
5545         }
5546 #else
5547         else if (PL_regkind[OP(scan)] == TRIE) {
5548             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5549             U8*bang=NULL;
5550
5551             min += trie->minlen;
5552             delta += (trie->maxlen - trie->minlen);
5553             flags &= ~SCF_DO_STCLASS; /* xxx */
5554             if (flags & SCF_DO_SUBSTR) {
5555                 /* Cannot expect anything... */
5556                 scan_commit(pRExC_state, data, minlenp, is_inf);
5557                 data->pos_min += trie->minlen;
5558                 data->pos_delta += (trie->maxlen - trie->minlen);
5559                 if (trie->maxlen != trie->minlen)
5560                     data->longest = &(data->longest_float);
5561             }
5562             if (trie->jump) /* no more substrings -- for now /grr*/
5563                flags &= ~SCF_DO_SUBSTR;
5564         }
5565 #endif /* old or new */
5566 #endif /* TRIE_STUDY_OPT */
5567
5568         /* Else: zero-length, ignore. */
5569         scan = regnext(scan);
5570     }
5571     /* If we are exiting a recursion we can unset its recursed bit
5572      * and allow ourselves to enter it again - no danger of an
5573      * infinite loop there.
5574     if (stopparen > -1 && recursed) {
5575         DEBUG_STUDYDATA("unset:", data,depth);
5576         PAREN_UNSET( recursed, stopparen);
5577     }
5578     */
5579     if (frame) {
5580         depth = depth - 1;
5581
5582         DEBUG_STUDYDATA("frame-end:",data,depth);
5583         DEBUG_PEEP("fend", scan, depth);
5584
5585         /* restore previous context */
5586         last = frame->last_regnode;
5587         scan = frame->next_regnode;
5588         stopparen = frame->stopparen;
5589         recursed_depth = frame->prev_recursed_depth;
5590
5591         RExC_frame_last = frame->prev_frame;
5592         frame = frame->this_prev_frame;
5593         goto fake_study_recurse;
5594     }
5595
5596   finish:
5597     assert(!frame);
5598     DEBUG_STUDYDATA("pre-fin:",data,depth);
5599
5600     *scanp = scan;
5601     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5602
5603     if (flags & SCF_DO_SUBSTR && is_inf)
5604         data->pos_delta = SSize_t_MAX - data->pos_min;
5605     if (is_par > (I32)U8_MAX)
5606         is_par = 0;
5607     if (is_par && pars==1 && data) {
5608         data->flags |= SF_IN_PAR;
5609         data->flags &= ~SF_HAS_PAR;
5610     }
5611     else if (pars && data) {
5612         data->flags |= SF_HAS_PAR;
5613         data->flags &= ~SF_IN_PAR;
5614     }
5615     if (flags & SCF_DO_STCLASS_OR)
5616         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5617     if (flags & SCF_TRIE_RESTUDY)
5618         data->flags |=  SCF_TRIE_RESTUDY;
5619
5620     DEBUG_STUDYDATA("post-fin:",data,depth);
5621
5622     {
5623         SSize_t final_minlen= min < stopmin ? min : stopmin;
5624
5625         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5626             if (final_minlen > SSize_t_MAX - delta)
5627                 RExC_maxlen = SSize_t_MAX;
5628             else if (RExC_maxlen < final_minlen + delta)
5629                 RExC_maxlen = final_minlen + delta;
5630         }
5631         return final_minlen;
5632     }
5633     NOT_REACHED;
5634 }
5635
5636 STATIC U32
5637 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5638 {
5639     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5640
5641     PERL_ARGS_ASSERT_ADD_DATA;
5642
5643     Renewc(RExC_rxi->data,
5644            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5645            char, struct reg_data);
5646     if(count)
5647         Renew(RExC_rxi->data->what, count + n, U8);
5648     else
5649         Newx(RExC_rxi->data->what, n, U8);
5650     RExC_rxi->data->count = count + n;
5651     Copy(s, RExC_rxi->data->what + count, n, U8);
5652     return count;
5653 }
5654
5655 /*XXX: todo make this not included in a non debugging perl, but appears to be
5656  * used anyway there, in 'use re' */
5657 #ifndef PERL_IN_XSUB_RE
5658 void
5659 Perl_reginitcolors(pTHX)
5660 {
5661     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5662     if (s) {
5663         char *t = savepv(s);
5664         int i = 0;
5665         PL_colors[0] = t;
5666         while (++i < 6) {
5667             t = strchr(t, '\t');
5668             if (t) {
5669                 *t = '\0';
5670                 PL_colors[i] = ++t;
5671             }
5672             else
5673                 PL_colors[i] = t = (char *)"";
5674         }
5675     } else {
5676         int i = 0;
5677         while (i < 6)
5678             PL_colors[i++] = (char *)"";
5679     }
5680     PL_colorset = 1;
5681 }
5682 #endif
5683
5684
5685 #ifdef TRIE_STUDY_OPT
5686 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5687     STMT_START {                                            \
5688         if (                                                \
5689               (data.flags & SCF_TRIE_RESTUDY)               \
5690               && ! restudied++                              \
5691         ) {                                                 \
5692             dOsomething;                                    \
5693             goto reStudy;                                   \
5694         }                                                   \
5695     } STMT_END
5696 #else
5697 #define CHECK_RESTUDY_GOTO_butfirst
5698 #endif
5699
5700 /*
5701  * pregcomp - compile a regular expression into internal code
5702  *
5703  * Decides which engine's compiler to call based on the hint currently in
5704  * scope
5705  */
5706
5707 #ifndef PERL_IN_XSUB_RE
5708
5709 /* return the currently in-scope regex engine (or the default if none)  */
5710
5711 regexp_engine const *
5712 Perl_current_re_engine(pTHX)
5713 {
5714     if (IN_PERL_COMPILETIME) {
5715         HV * const table = GvHV(PL_hintgv);
5716         SV **ptr;
5717
5718         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5719             return &PL_core_reg_engine;
5720         ptr = hv_fetchs(table, "regcomp", FALSE);
5721         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5722             return &PL_core_reg_engine;
5723         return INT2PTR(regexp_engine*,SvIV(*ptr));
5724     }
5725     else {
5726         SV *ptr;
5727         if (!PL_curcop->cop_hints_hash)
5728             return &PL_core_reg_engine;
5729         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5730         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5731             return &PL_core_reg_engine;
5732         return INT2PTR(regexp_engine*,SvIV(ptr));
5733     }
5734 }
5735
5736
5737 REGEXP *
5738 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5739 {
5740     regexp_engine const *eng = current_re_engine();
5741     GET_RE_DEBUG_FLAGS_DECL;
5742
5743     PERL_ARGS_ASSERT_PREGCOMP;
5744
5745     /* Dispatch a request to compile a regexp to correct regexp engine. */
5746     DEBUG_COMPILE_r({
5747         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5748                         PTR2UV(eng));
5749     });
5750     return CALLREGCOMP_ENG(eng, pattern, flags);
5751 }
5752 #endif
5753
5754 /* public(ish) entry point for the perl core's own regex compiling code.
5755  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5756  * pattern rather than a list of OPs, and uses the internal engine rather
5757  * than the current one */
5758
5759 REGEXP *
5760 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5761 {
5762     SV *pat = pattern; /* defeat constness! */
5763     PERL_ARGS_ASSERT_RE_COMPILE;
5764     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5765 #ifdef PERL_IN_XSUB_RE
5766                                 &my_reg_engine,
5767 #else
5768                                 &PL_core_reg_engine,
5769 #endif
5770                                 NULL, NULL, rx_flags, 0);
5771 }
5772
5773
5774 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5775  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5776  * point to the realloced string and length.
5777  *
5778  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5779  * stuff added */
5780
5781 static void
5782 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5783                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5784 {
5785     U8 *const src = (U8*)*pat_p;
5786     U8 *dst, *d;
5787     int n=0;
5788     STRLEN s = 0;
5789     bool do_end = 0;
5790     GET_RE_DEBUG_FLAGS_DECL;
5791
5792     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5793         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5794
5795     Newx(dst, *plen_p * 2 + 1, U8);
5796     d = dst;
5797
5798     while (s < *plen_p) {
5799         append_utf8_from_native_byte(src[s], &d);
5800         if (n < num_code_blocks) {
5801             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5802                 pRExC_state->code_blocks[n].start = d - dst - 1;
5803                 assert(*(d - 1) == '(');
5804                 do_end = 1;
5805             }
5806             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5807                 pRExC_state->code_blocks[n].end = d - dst - 1;
5808                 assert(*(d - 1) == ')');
5809                 do_end = 0;
5810                 n++;
5811             }
5812         }
5813         s++;
5814     }
5815     *d = '\0';
5816     *plen_p = d - dst;
5817     *pat_p = (char*) dst;
5818     SAVEFREEPV(*pat_p);
5819     RExC_orig_utf8 = RExC_utf8 = 1;
5820 }
5821
5822
5823
5824 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5825  * while recording any code block indices, and handling overloading,
5826  * nested qr// objects etc.  If pat is null, it will allocate a new
5827  * string, or just return the first arg, if there's only one.
5828  *
5829  * Returns the malloced/updated pat.
5830  * patternp and pat_count is the array of SVs to be concatted;
5831  * oplist is the optional list of ops that generated the SVs;
5832  * recompile_p is a pointer to a boolean that will be set if
5833  *   the regex will need to be recompiled.
5834  * delim, if non-null is an SV that will be inserted between each element
5835  */
5836
5837 static SV*
5838 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5839                 SV *pat, SV ** const patternp, int pat_count,
5840                 OP *oplist, bool *recompile_p, SV *delim)
5841 {
5842     SV **svp;
5843     int n = 0;
5844     bool use_delim = FALSE;
5845     bool alloced = FALSE;
5846
5847     /* if we know we have at least two args, create an empty string,
5848      * then concatenate args to that. For no args, return an empty string */
5849     if (!pat && pat_count != 1) {
5850         pat = newSVpvs("");
5851         SAVEFREESV(pat);
5852         alloced = TRUE;
5853     }
5854
5855     for (svp = patternp; svp < patternp + pat_count; svp++) {
5856         SV *sv;
5857         SV *rx  = NULL;
5858         STRLEN orig_patlen = 0;
5859         bool code = 0;
5860         SV *msv = use_delim ? delim : *svp;
5861         if (!msv) msv = &PL_sv_undef;
5862
5863         /* if we've got a delimiter, we go round the loop twice for each
5864          * svp slot (except the last), using the delimiter the second
5865          * time round */
5866         if (use_delim) {
5867             svp--;
5868             use_delim = FALSE;
5869         }
5870         else if (delim)
5871             use_delim = TRUE;
5872
5873         if (SvTYPE(msv) == SVt_PVAV) {
5874             /* we've encountered an interpolated array within
5875              * the pattern, e.g. /...@a..../. Expand the list of elements,
5876              * then recursively append elements.
5877              * The code in this block is based on S_pushav() */
5878
5879             AV *const av = (AV*)msv;
5880             const SSize_t maxarg = AvFILL(av) + 1;
5881             SV **array;
5882
5883             if (oplist) {
5884                 assert(oplist->op_type == OP_PADAV
5885                     || oplist->op_type == OP_RV2AV);
5886                 oplist = OpSIBLING(oplist);
5887             }
5888
5889             if (SvRMAGICAL(av)) {
5890                 SSize_t i;
5891
5892                 Newx(array, maxarg, SV*);
5893                 SAVEFREEPV(array);
5894                 for (i=0; i < maxarg; i++) {
5895                     SV ** const svp = av_fetch(av, i, FALSE);
5896                     array[i] = svp ? *svp : &PL_sv_undef;
5897                 }
5898             }
5899             else
5900                 array = AvARRAY(av);
5901
5902             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5903                                 array, maxarg, NULL, recompile_p,
5904                                 /* $" */
5905                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5906
5907             continue;
5908         }
5909
5910
5911         /* we make the assumption here that each op in the list of
5912          * op_siblings maps to one SV pushed onto the stack,
5913          * except for code blocks, with have both an OP_NULL and
5914          * and OP_CONST.
5915          * This allows us to match up the list of SVs against the
5916          * list of OPs to find the next code block.
5917          *
5918          * Note that       PUSHMARK PADSV PADSV ..
5919          * is optimised to
5920          *                 PADRANGE PADSV  PADSV  ..
5921          * so the alignment still works. */
5922
5923         if (oplist) {
5924             if (oplist->op_type == OP_NULL
5925                 && (oplist->op_flags & OPf_SPECIAL))
5926             {
5927                 assert(n < pRExC_state->num_code_blocks);
5928                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5929                 pRExC_state->code_blocks[n].block = oplist;
5930                 pRExC_state->code_blocks[n].src_regex = NULL;
5931                 n++;
5932                 code = 1;
5933                 oplist = OpSIBLING(oplist); /* skip CONST */
5934                 assert(oplist);
5935             }
5936             oplist = OpSIBLING(oplist);;
5937         }
5938
5939         /* apply magic and QR overloading to arg */
5940
5941         SvGETMAGIC(msv);
5942         if (SvROK(msv) && SvAMAGIC(msv)) {
5943             SV *sv = AMG_CALLunary(msv, regexp_amg);
5944             if (sv) {
5945                 if (SvROK(sv))
5946                     sv = SvRV(sv);
5947                 if (SvTYPE(sv) != SVt_REGEXP)
5948                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5949                 msv = sv;
5950             }
5951         }
5952
5953         /* try concatenation overload ... */
5954         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5955                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5956         {
5957             sv_setsv(pat, sv);
5958             /* overloading involved: all bets are off over literal
5959              * code. Pretend we haven't seen it */
5960             pRExC_state->num_code_blocks -= n;
5961             n = 0;
5962         }
5963         else  {
5964             /* ... or failing that, try "" overload */
5965             while (SvAMAGIC(msv)
5966                     && (sv = AMG_CALLunary(msv, string_amg))
5967                     && sv != msv
5968                     &&  !(   SvROK(msv)
5969                           && SvROK(sv)
5970                           && SvRV(msv) == SvRV(sv))
5971             ) {
5972                 msv = sv;
5973                 SvGETMAGIC(msv);
5974             }
5975             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5976                 msv = SvRV(msv);
5977
5978             if (pat) {
5979                 /* this is a partially unrolled
5980                  *     sv_catsv_nomg(pat, msv);
5981                  * that allows us to adjust code block indices if
5982                  * needed */
5983                 STRLEN dlen;
5984                 char *dst = SvPV_force_nomg(pat, dlen);
5985                 orig_patlen = dlen;
5986                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5987                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5988                     sv_setpvn(pat, dst, dlen);
5989                     SvUTF8_on(pat);
5990                 }
5991                 sv_catsv_nomg(pat, msv);
5992                 rx = msv;
5993             }
5994             else
5995                 pat = msv;
5996
5997             if (code)
5998                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5999         }
6000
6001         /* extract any code blocks within any embedded qr//'s */
6002         if (rx && SvTYPE(rx) == SVt_REGEXP
6003             && RX_ENGINE((REGEXP*)rx)->op_comp)
6004         {
6005
6006             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6007             if (ri->num_code_blocks) {
6008                 int i;
6009                 /* the presence of an embedded qr// with code means
6010                  * we should always recompile: the text of the
6011                  * qr// may not have changed, but it may be a
6012                  * different closure than last time */
6013                 *recompile_p = 1;
6014                 Renew(pRExC_state->code_blocks,
6015                     pRExC_state->num_code_blocks + ri->num_code_blocks,
6016                     struct reg_code_block);
6017                 pRExC_state->num_code_blocks += ri->num_code_blocks;
6018
6019                 for (i=0; i < ri->num_code_blocks; i++) {
6020                     struct reg_code_block *src, *dst;
6021                     STRLEN offset =  orig_patlen
6022                         + ReANY((REGEXP *)rx)->pre_prefix;
6023                     assert(n < pRExC_state->num_code_blocks);
6024                     src = &ri->code_blocks[i];
6025                     dst = &pRExC_state->code_blocks[n];
6026                     dst->start      = src->start + offset;
6027                     dst->end        = src->end   + offset;
6028                     dst->block      = src->block;
6029                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6030                                             src->src_regex
6031                                                 ? src->src_regex
6032                                                 : (REGEXP*)rx);
6033                     n++;
6034                 }
6035             }
6036         }
6037     }
6038     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6039     if (alloced)
6040         SvSETMAGIC(pat);
6041
6042     return pat;
6043 }
6044
6045
6046
6047 /* see if there are any run-time code blocks in the pattern.
6048  * False positives are allowed */
6049
6050 static bool
6051 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6052                     char *pat, STRLEN plen)
6053 {
6054     int n = 0;
6055     STRLEN s;
6056
6057     PERL_UNUSED_CONTEXT;
6058
6059     for (s = 0; s < plen; s++) {
6060         if (n < pRExC_state->num_code_blocks
6061             && s == pRExC_state->code_blocks[n].start)
6062         {
6063             s = pRExC_state->code_blocks[n].end;
6064             n++;
6065             continue;
6066         }
6067         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6068          * positives here */
6069         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6070             (pat[s+2] == '{'
6071                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6072         )
6073             return 1;
6074     }
6075     return 0;
6076 }
6077
6078 /* Handle run-time code blocks. We will already have compiled any direct
6079  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6080  * copy of it, but with any literal code blocks blanked out and
6081  * appropriate chars escaped; then feed it into
6082  *
6083  *    eval "qr'modified_pattern'"
6084  *
6085  * For example,
6086  *
6087  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6088  *
6089  * becomes
6090  *
6091  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6092  *
6093  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6094  * and merge them with any code blocks of the original regexp.
6095  *
6096  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6097  * instead, just save the qr and return FALSE; this tells our caller that
6098  * the original pattern needs upgrading to utf8.
6099  */
6100
6101 static bool
6102 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6103     char *pat, STRLEN plen)
6104 {
6105     SV *qr;
6106
6107     GET_RE_DEBUG_FLAGS_DECL;
6108
6109     if (pRExC_state->runtime_code_qr) {
6110         /* this is the second time we've been called; this should
6111          * only happen if the main pattern got upgraded to utf8
6112          * during compilation; re-use the qr we compiled first time
6113          * round (which should be utf8 too)
6114          */
6115         qr = pRExC_state->runtime_code_qr;
6116         pRExC_state->runtime_code_qr = NULL;
6117         assert(RExC_utf8 && SvUTF8(qr));
6118     }
6119     else {
6120         int n = 0;
6121         STRLEN s;
6122         char *p, *newpat;
6123         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6124         SV *sv, *qr_ref;
6125         dSP;
6126
6127         /* determine how many extra chars we need for ' and \ escaping */
6128         for (s = 0; s < plen; s++) {
6129             if (pat[s] == '\'' || pat[s] == '\\')
6130                 newlen++;
6131         }
6132
6133         Newx(newpat, newlen, char);
6134         p = newpat;
6135         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6136
6137         for (s = 0; s < plen; s++) {
6138             if (n < pRExC_state->num_code_blocks
6139                 && s == pRExC_state->code_blocks[n].start)
6140             {
6141                 /* blank out literal code block */
6142                 assert(pat[s] == '(');
6143                 while (s <= pRExC_state->code_blocks[n].end) {
6144                     *p++ = '_';
6145                     s++;
6146                 }
6147                 s--;
6148                 n++;
6149                 continue;
6150             }
6151             if (pat[s] == '\'' || pat[s] == '\\')
6152                 *p++ = '\\';
6153             *p++ = pat[s];
6154         }
6155         *p++ = '\'';
6156         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6157             *p++ = 'x';
6158         *p++ = '\0';
6159         DEBUG_COMPILE_r({
6160             PerlIO_printf(Perl_debug_log,
6161                 "%sre-parsing pattern for runtime code:%s %s\n",
6162                 PL_colors[4],PL_colors[5],newpat);
6163         });
6164
6165         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6166         Safefree(newpat);
6167
6168         ENTER;
6169         SAVETMPS;
6170         PUSHSTACKi(PERLSI_REQUIRE);
6171         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6172          * parsing qr''; normally only q'' does this. It also alters
6173          * hints handling */
6174         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6175         SvREFCNT_dec_NN(sv);
6176         SPAGAIN;
6177         qr_ref = POPs;
6178         PUTBACK;
6179         {
6180             SV * const errsv = ERRSV;
6181             if (SvTRUE_NN(errsv))
6182             {
6183                 Safefree(pRExC_state->code_blocks);
6184                 /* use croak_sv ? */
6185                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6186             }
6187         }
6188         assert(SvROK(qr_ref));
6189         qr = SvRV(qr_ref);
6190         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6191         /* the leaving below frees the tmp qr_ref.
6192          * Give qr a life of its own */
6193         SvREFCNT_inc(qr);
6194         POPSTACK;
6195         FREETMPS;
6196         LEAVE;
6197
6198     }
6199
6200     if (!RExC_utf8 && SvUTF8(qr)) {
6201         /* first time through; the pattern got upgraded; save the
6202          * qr for the next time through */
6203         assert(!pRExC_state->runtime_code_qr);
6204         pRExC_state->runtime_code_qr = qr;
6205         return 0;
6206     }
6207
6208
6209     /* extract any code blocks within the returned qr//  */
6210
6211
6212     /* merge the main (r1) and run-time (r2) code blocks into one */
6213     {
6214         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6215         struct reg_code_block *new_block, *dst;
6216         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6217         int i1 = 0, i2 = 0;
6218
6219         if (!r2->num_code_blocks) /* we guessed wrong */
6220         {
6221             SvREFCNT_dec_NN(qr);
6222             return 1;
6223         }
6224
6225         Newx(new_block,
6226             r1->num_code_blocks + r2->num_code_blocks,
6227             struct reg_code_block);
6228         dst = new_block;
6229
6230         while (    i1 < r1->num_code_blocks
6231                 || i2 < r2->num_code_blocks)
6232         {
6233             struct reg_code_block *src;
6234             bool is_qr = 0;
6235
6236             if (i1 == r1->num_code_blocks) {
6237                 src = &r2->code_blocks[i2++];
6238                 is_qr = 1;
6239             }
6240             else if (i2 == r2->num_code_blocks)
6241                 src = &r1->code_blocks[i1++];
6242             else if (  r1->code_blocks[i1].start
6243                      < r2->code_blocks[i2].start)
6244             {
6245                 src = &r1->code_blocks[i1++];
6246                 assert(src->end < r2->code_blocks[i2].start);
6247             }
6248             else {
6249                 assert(  r1->code_blocks[i1].start
6250                        > r2->code_blocks[i2].start);
6251                 src = &r2->code_blocks[i2++];
6252                 is_qr = 1;
6253                 assert(src->end < r1->code_blocks[i1].start);
6254             }
6255
6256             assert(pat[src->start] == '(');
6257             assert(pat[src->end]   == ')');
6258             dst->start      = src->start;
6259             dst->end        = src->end;
6260             dst->block      = src->block;
6261             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6262                                     : src->src_regex;
6263             dst++;
6264         }
6265         r1->num_code_blocks += r2->num_code_blocks;
6266         Safefree(r1->code_blocks);
6267         r1->code_blocks = new_block;
6268     }
6269
6270     SvREFCNT_dec_NN(qr);
6271     return 1;
6272 }
6273
6274
6275 STATIC bool
6276 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6277                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6278                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6279                       STRLEN longest_length, bool eol, bool meol)
6280 {
6281     /* This is the common code for setting up the floating and fixed length
6282      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6283      * as to whether succeeded or not */
6284
6285     I32 t;
6286     SSize_t ml;
6287
6288     if (! (longest_length
6289            || (eol /* Can't have SEOL and MULTI */
6290                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6291           )
6292             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6293         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6294     {
6295         return FALSE;
6296     }
6297
6298     /* copy the information about the longest from the reg_scan_data
6299         over to the program. */
6300     if (SvUTF8(sv_longest)) {
6301         *rx_utf8 = sv_longest;
6302         *rx_substr = NULL;
6303     } else {
6304         *rx_substr = sv_longest;
6305         *rx_utf8 = NULL;
6306     }
6307     /* end_shift is how many chars that must be matched that
6308         follow this item. We calculate it ahead of time as once the
6309         lookbehind offset is added in we lose the ability to correctly
6310         calculate it.*/
6311     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6312     *rx_end_shift = ml - offset
6313         - longest_length + (SvTAIL(sv_longest) != 0)
6314         + lookbehind;
6315
6316     t = (eol/* Can't have SEOL and MULTI */
6317          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6318     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6319
6320     return TRUE;
6321 }
6322
6323 /*
6324  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6325  * regular expression into internal code.
6326  * The pattern may be passed either as:
6327  *    a list of SVs (patternp plus pat_count)
6328  *    a list of OPs (expr)
6329  * If both are passed, the SV list is used, but the OP list indicates
6330  * which SVs are actually pre-compiled code blocks
6331  *
6332  * The SVs in the list have magic and qr overloading applied to them (and
6333  * the list may be modified in-place with replacement SVs in the latter
6334  * case).
6335  *
6336  * If the pattern hasn't changed from old_re, then old_re will be
6337  * returned.
6338  *
6339  * eng is the current engine. If that engine has an op_comp method, then
6340  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6341  * do the initial concatenation of arguments and pass on to the external
6342  * engine.
6343  *
6344  * If is_bare_re is not null, set it to a boolean indicating whether the
6345  * arg list reduced (after overloading) to a single bare regex which has
6346  * been returned (i.e. /$qr/).
6347  *
6348  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6349  *
6350  * pm_flags contains the PMf_* flags, typically based on those from the
6351  * pm_flags field of the related PMOP. Currently we're only interested in
6352  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6353  *
6354  * We can't allocate space until we know how big the compiled form will be,
6355  * but we can't compile it (and thus know how big it is) until we've got a
6356  * place to put the code.  So we cheat:  we compile it twice, once with code
6357  * generation turned off and size counting turned on, and once "for real".
6358  * This also means that we don't allocate space until we are sure that the
6359  * thing really will compile successfully, and we never have to move the
6360  * code and thus invalidate pointers into it.  (Note that it has to be in
6361  * one piece because free() must be able to free it all.) [NB: not true in perl]
6362  *
6363  * Beware that the optimization-preparation code in here knows about some
6364  * of the structure of the compiled regexp.  [I'll say.]
6365  */
6366
6367 REGEXP *
6368 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6369                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6370                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6371 {
6372     REGEXP *rx;
6373     struct regexp *r;
6374     regexp_internal *ri;
6375     STRLEN plen;
6376     char *exp;
6377     regnode *scan;
6378     I32 flags;
6379     SSize_t minlen = 0;
6380     U32 rx_flags;
6381     SV *pat;
6382     SV *code_blocksv = NULL;
6383     SV** new_patternp = patternp;
6384
6385     /* these are all flags - maybe they should be turned
6386      * into a single int with different bit masks */
6387     I32 sawlookahead = 0;
6388     I32 sawplus = 0;
6389     I32 sawopen = 0;
6390     I32 sawminmod = 0;
6391
6392     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6393     bool recompile = 0;
6394     bool runtime_code = 0;
6395     scan_data_t data;
6396     RExC_state_t RExC_state;
6397     RExC_state_t * const pRExC_state = &RExC_state;
6398 #ifdef TRIE_STUDY_OPT
6399     int restudied = 0;
6400     RExC_state_t copyRExC_state;
6401 #endif
6402     GET_RE_DEBUG_FLAGS_DECL;
6403
6404     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6405
6406     DEBUG_r(if (!PL_colorset) reginitcolors());
6407
6408 #ifndef PERL_IN_XSUB_RE
6409     /* Initialize these here instead of as-needed, as is quick and avoids
6410      * having to test them each time otherwise */
6411     if (! PL_AboveLatin1) {
6412         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6413         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6414         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6415         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6416         PL_HasMultiCharFold =
6417                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6418
6419         /* This is calculated here, because the Perl program that generates the
6420          * static global ones doesn't currently have access to
6421          * NUM_ANYOF_CODE_POINTS */
6422         PL_InBitmap = _new_invlist(2);
6423         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6424                                                     NUM_ANYOF_CODE_POINTS - 1);
6425     }
6426 #endif
6427
6428     pRExC_state->code_blocks = NULL;
6429     pRExC_state->num_code_blocks = 0;
6430
6431     if (is_bare_re)
6432         *is_bare_re = FALSE;
6433
6434     if (expr && (expr->op_type == OP_LIST ||
6435                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6436         /* allocate code_blocks if needed */
6437         OP *o;
6438         int ncode = 0;
6439
6440         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6441             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6442                 ncode++; /* count of DO blocks */
6443         if (ncode) {
6444             pRExC_state->num_code_blocks = ncode;
6445             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6446         }
6447     }
6448
6449     if (!pat_count) {
6450         /* compile-time pattern with just OP_CONSTs and DO blocks */
6451
6452         int n;
6453         OP *o;
6454
6455         /* find how many CONSTs there are */
6456         assert(expr);
6457         n = 0;
6458         if (expr->op_type == OP_CONST)
6459             n = 1;
6460         else
6461             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6462                 if (o->op_type == OP_CONST)
6463                     n++;
6464             }
6465
6466         /* fake up an SV array */
6467
6468         assert(!new_patternp);
6469         Newx(new_patternp, n, SV*);
6470         SAVEFREEPV(new_patternp);
6471         pat_count = n;
6472
6473         n = 0;
6474         if (expr->op_type == OP_CONST)
6475             new_patternp[n] = cSVOPx_sv(expr);
6476         else
6477             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6478                 if (o->op_type == OP_CONST)
6479                     new_patternp[n++] = cSVOPo_sv;
6480             }
6481
6482     }
6483
6484     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6485         "Assembling pattern from %d elements%s\n", pat_count,
6486             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6487
6488     /* set expr to the first arg op */
6489
6490     if (pRExC_state->num_code_blocks
6491          && expr->op_type != OP_CONST)
6492     {
6493             expr = cLISTOPx(expr)->op_first;
6494             assert(   expr->op_type == OP_PUSHMARK
6495                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6496                    || expr->op_type == OP_PADRANGE);
6497             expr = OpSIBLING(expr);
6498     }
6499
6500     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6501                         expr, &recompile, NULL);
6502
6503     /* handle bare (possibly after overloading) regex: foo =~ $re */
6504     {
6505         SV *re = pat;
6506         if (SvROK(re))
6507             re = SvRV(re);
6508         if (SvTYPE(re) == SVt_REGEXP) {
6509             if (is_bare_re)
6510                 *is_bare_re = TRUE;
6511             SvREFCNT_inc(re);
6512             Safefree(pRExC_state->code_blocks);
6513             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6514                 "Precompiled pattern%s\n",
6515                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6516
6517             return (REGEXP*)re;
6518         }
6519     }
6520
6521     exp = SvPV_nomg(pat, plen);
6522
6523     if (!eng->op_comp) {
6524         if ((SvUTF8(pat) && IN_BYTES)
6525                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6526         {
6527             /* make a temporary copy; either to convert to bytes,
6528              * or to avoid repeating get-magic / overloaded stringify */
6529             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6530                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6531         }
6532         Safefree(pRExC_state->code_blocks);
6533         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6534     }
6535
6536     /* ignore the utf8ness if the pattern is 0 length */
6537     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6538     RExC_uni_semantics = 0;
6539     RExC_contains_locale = 0;
6540     RExC_contains_i = 0;
6541     pRExC_state->runtime_code_qr = NULL;
6542     RExC_frame_head= NULL;
6543     RExC_frame_last= NULL;
6544     RExC_frame_count= 0;
6545
6546     DEBUG_r({
6547         RExC_mysv1= sv_newmortal();
6548         RExC_mysv2= sv_newmortal();
6549     });
6550     DEBUG_COMPILE_r({
6551             SV *dsv= sv_newmortal();
6552             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6553             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6554                           PL_colors[4],PL_colors[5],s);
6555         });
6556
6557   redo_first_pass:
6558     /* we jump here if we upgrade the pattern to utf8 and have to
6559      * recompile */
6560
6561     if ((pm_flags & PMf_USE_RE_EVAL)
6562                 /* this second condition covers the non-regex literal case,
6563                  * i.e.  $foo =~ '(?{})'. */
6564                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6565     )
6566         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6567
6568     /* return old regex if pattern hasn't changed */
6569     /* XXX: note in the below we have to check the flags as well as the
6570      * pattern.
6571      *
6572      * Things get a touch tricky as we have to compare the utf8 flag
6573      * independently from the compile flags.  */
6574
6575     if (   old_re
6576         && !recompile
6577         && !!RX_UTF8(old_re) == !!RExC_utf8
6578         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6579         && RX_PRECOMP(old_re)
6580         && RX_PRELEN(old_re) == plen
6581         && memEQ(RX_PRECOMP(old_re), exp, plen)
6582         && !runtime_code /* with runtime code, always recompile */ )
6583     {
6584         Safefree(pRExC_state->code_blocks);
6585         return old_re;
6586     }
6587
6588     rx_flags = orig_rx_flags;
6589
6590     if (rx_flags & PMf_FOLD) {
6591         RExC_contains_i = 1;
6592     }
6593     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6594
6595         /* Set to use unicode semantics if the pattern is in utf8 and has the
6596          * 'depends' charset specified, as it means unicode when utf8  */
6597         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6598     }
6599
6600     RExC_precomp = exp;
6601     RExC_flags = rx_flags;
6602     RExC_pm_flags = pm_flags;
6603
6604     if (runtime_code) {
6605         if (TAINTING_get && TAINT_get)
6606             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6607
6608         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6609             /* whoops, we have a non-utf8 pattern, whilst run-time code
6610              * got compiled as utf8. Try again with a utf8 pattern */
6611             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6612                                     pRExC_state->num_code_blocks);
6613             goto redo_first_pass;
6614         }
6615     }
6616     assert(!pRExC_state->runtime_code_qr);
6617
6618     RExC_sawback = 0;
6619
6620     RExC_seen = 0;
6621     RExC_maxlen = 0;
6622     RExC_in_lookbehind = 0;
6623     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6624     RExC_extralen = 0;
6625     RExC_override_recoding = 0;
6626     RExC_in_multi_char_class = 0;
6627
6628     /* First pass: determine size, legality. */
6629     RExC_parse = exp;
6630     RExC_start = exp;
6631     RExC_end = exp + plen;
6632     RExC_naughty = 0;
6633     RExC_npar = 1;
6634     RExC_nestroot = 0;
6635     RExC_size = 0L;
6636     RExC_emit = (regnode *) &RExC_emit_dummy;
6637     RExC_whilem_seen = 0;
6638     RExC_open_parens = NULL;
6639     RExC_close_parens = NULL;
6640     RExC_opend = NULL;
6641     RExC_paren_names = NULL;
6642 #ifdef DEBUGGING
6643     RExC_paren_name_list = NULL;
6644 #endif
6645     RExC_recurse = NULL;
6646     RExC_study_chunk_recursed = NULL;
6647     RExC_study_chunk_recursed_bytes= 0;
6648     RExC_recurse_count = 0;
6649     pRExC_state->code_index = 0;
6650
6651 #if 0 /* REGC() is (currently) a NOP at the first pass.
6652        * Clever compilers notice this and complain. --jhi */
6653     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6654 #endif
6655     DEBUG_PARSE_r(
6656         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6657         RExC_lastnum=0;
6658         RExC_lastparse=NULL;
6659     );
6660     /* reg may croak on us, not giving us a chance to free
6661        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6662        need it to survive as long as the regexp (qr/(?{})/).
6663        We must check that code_blocksv is not already set, because we may
6664        have jumped back to restart the sizing pass. */
6665     if (pRExC_state->code_blocks && !code_blocksv) {
6666         code_blocksv = newSV_type(SVt_PV);
6667         SAVEFREESV(code_blocksv);
6668         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6669         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6670     }
6671     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6672         /* It's possible to write a regexp in ascii that represents Unicode
6673         codepoints outside of the byte range, such as via \x{100}. If we
6674         detect such a sequence we have to convert the entire pattern to utf8
6675         and then recompile, as our sizing calculation will have been based
6676         on 1 byte == 1 character, but we will need to use utf8 to encode
6677         at least some part of the pattern, and therefore must convert the whole
6678         thing.
6679         -- dmq */
6680         if (flags & RESTART_UTF8) {
6681             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6682                                     pRExC_state->num_code_blocks);
6683             goto redo_first_pass;
6684         }
6685         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6686     }
6687     if (code_blocksv)
6688         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6689
6690     DEBUG_PARSE_r({
6691         PerlIO_printf(Perl_debug_log,
6692             "Required size %"IVdf" nodes\n"
6693             "Starting second pass (creation)\n",
6694             (IV)RExC_size);
6695         RExC_lastnum=0;
6696         RExC_lastparse=NULL;
6697     });
6698
6699     /* The first pass could have found things that force Unicode semantics */
6700     if ((RExC_utf8 || RExC_uni_semantics)
6701          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6702     {
6703         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6704     }
6705
6706     /* Small enough for pointer-storage convention?
6707        If extralen==0, this means that we will not need long jumps. */
6708     if (RExC_size >= 0x10000L && RExC_extralen)
6709         RExC_size += RExC_extralen;
6710     else
6711         RExC_extralen = 0;
6712     if (RExC_whilem_seen > 15)
6713         RExC_whilem_seen = 15;
6714
6715     /* Allocate space and zero-initialize. Note, the two step process
6716        of zeroing when in debug mode, thus anything assigned has to
6717        happen after that */
6718     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6719     r = ReANY(rx);
6720     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6721          char, regexp_internal);
6722     if ( r == NULL || ri == NULL )
6723         FAIL("Regexp out of space");
6724 #ifdef DEBUGGING
6725     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6726     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6727          char);
6728 #else
6729     /* bulk initialize base fields with 0. */
6730     Zero(ri, sizeof(regexp_internal), char);
6731 #endif
6732
6733     /* non-zero initialization begins here */
6734     RXi_SET( r, ri );
6735     r->engine= eng;
6736     r->extflags = rx_flags;
6737     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6738
6739     if (pm_flags & PMf_IS_QR) {
6740         ri->code_blocks = pRExC_state->code_blocks;
6741         ri->num_code_blocks = pRExC_state->num_code_blocks;
6742     }
6743     else
6744     {
6745         int n;
6746         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6747             if (pRExC_state->code_blocks[n].src_regex)
6748                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6749         SAVEFREEPV(pRExC_state->code_blocks);
6750     }
6751
6752     {
6753         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6754         bool has_charset = (get_regex_charset(r->extflags)
6755                                                     != REGEX_DEPENDS_CHARSET);
6756
6757         /* The caret is output if there are any defaults: if not all the STD
6758          * flags are set, or if no character set specifier is needed */
6759         bool has_default =
6760                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6761                     || ! has_charset);
6762         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6763                                                    == REG_RUN_ON_COMMENT_SEEN);
6764         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6765                             >> RXf_PMf_STD_PMMOD_SHIFT);
6766         const char *fptr = STD_PAT_MODS;        /*"msixn"*/
6767         char *p;
6768         /* Allocate for the worst case, which is all the std flags are turned
6769          * on.  If more precision is desired, we could do a population count of
6770          * the flags set.  This could be done with a small lookup table, or by
6771          * shifting, masking and adding, or even, when available, assembly
6772          * language for a machine-language population count.
6773          * We never output a minus, as all those are defaults, so are
6774          * covered by the caret */
6775         const STRLEN wraplen = plen + has_p + has_runon
6776             + has_default       /* If needs a caret */
6777
6778                 /* If needs a character set specifier */
6779             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6780             + (sizeof(STD_PAT_MODS) - 1)
6781             + (sizeof("(?:)") - 1);
6782
6783         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6784         r->xpv_len_u.xpvlenu_pv = p;
6785         if (RExC_utf8)
6786             SvFLAGS(rx) |= SVf_UTF8;
6787         *p++='('; *p++='?';
6788
6789         /* If a default, cover it using the caret */
6790         if (has_default) {
6791             *p++= DEFAULT_PAT_MOD;
6792         }
6793         if (has_charset) {
6794             STRLEN len;
6795             const char* const name = get_regex_charset_name(r->extflags, &len);
6796             Copy(name, p, len, char);
6797             p += len;
6798         }
6799         if (has_p)
6800             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6801         {
6802             char ch;
6803             while((ch = *fptr++)) {
6804                 if(reganch & 1)
6805                     *p++ = ch;
6806                 reganch >>= 1;
6807             }
6808         }
6809
6810         *p++ = ':';
6811         Copy(RExC_precomp, p, plen, char);
6812         assert ((RX_WRAPPED(rx) - p) < 16);
6813         r->pre_prefix = p - RX_WRAPPED(rx);
6814         p += plen;
6815         if (has_runon)
6816             *p++ = '\n';
6817         *p++ = ')';
6818         *p = 0;
6819         SvCUR_set(rx, p - RX_WRAPPED(rx));
6820     }
6821
6822     r->intflags = 0;
6823     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6824
6825     /* setup various meta data about recursion, this all requires
6826      * RExC_npar to be correctly set, and a bit later on we clear it */
6827     if (RExC_seen & REG_RECURSE_SEEN) {
6828         Newxz(RExC_open_parens, RExC_npar,regnode *);
6829         SAVEFREEPV(RExC_open_parens);
6830         Newxz(RExC_close_parens,RExC_npar,regnode *);
6831         SAVEFREEPV(RExC_close_parens);
6832     }
6833     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6834         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6835          * So its 1 if there are no parens. */
6836         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6837                                          ((RExC_npar & 0x07) != 0);
6838         Newx(RExC_study_chunk_recursed,
6839              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6840         SAVEFREEPV(RExC_study_chunk_recursed);
6841     }
6842
6843     /* Useful during FAIL. */
6844 #ifdef RE_TRACK_PATTERN_OFFSETS
6845     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6846     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6847                           "%s %"UVuf" bytes for offset annotations.\n",
6848                           ri->u.offsets ? "Got" : "Couldn't get",
6849                           (UV)((2*RExC_size+1) * sizeof(U32))));
6850 #endif
6851     SetProgLen(ri,RExC_size);
6852     RExC_rx_sv = rx;
6853     RExC_rx = r;
6854     RExC_rxi = ri;
6855
6856     /* Second pass: emit code. */
6857     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6858     RExC_pm_flags = pm_flags;
6859     RExC_parse = exp;
6860     RExC_end = exp + plen;
6861     RExC_naughty = 0;
6862     RExC_npar = 1;
6863     RExC_emit_start = ri->program;
6864     RExC_emit = ri->program;
6865     RExC_emit_bound = ri->program + RExC_size + 1;
6866     pRExC_state->code_index = 0;
6867
6868     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6869     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6870         ReREFCNT_dec(rx);
6871         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6872     }
6873     /* XXXX To minimize changes to RE engine we always allocate
6874        3-units-long substrs field. */
6875     Newx(r->substrs, 1, struct reg_substr_data);
6876     if (RExC_recurse_count) {
6877         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6878         SAVEFREEPV(RExC_recurse);
6879     }
6880
6881 reStudy:
6882     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6883     DEBUG_r(
6884         RExC_study_chunk_recursed_count= 0;
6885     );
6886     Zero(r->substrs, 1, struct reg_substr_data);
6887     if (RExC_study_chunk_recursed) {
6888         Zero(RExC_study_chunk_recursed,
6889              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6890     }
6891
6892
6893 #ifdef TRIE_STUDY_OPT
6894     if (!restudied) {
6895         StructCopy(&zero_scan_data, &data, scan_data_t);
6896         copyRExC_state = RExC_state;
6897     } else {
6898         U32 seen=RExC_seen;
6899         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6900
6901         RExC_state = copyRExC_state;
6902         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6903             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6904         else
6905             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6906         StructCopy(&zero_scan_data, &data, scan_data_t);
6907     }
6908 #else
6909     StructCopy(&zero_scan_data, &data, scan_data_t);
6910 #endif
6911
6912     /* Dig out information for optimizations. */
6913     r->extflags = RExC_flags; /* was pm_op */
6914     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6915
6916     if (UTF)
6917         SvUTF8_on(rx);  /* Unicode in it? */
6918     ri->regstclass = NULL;
6919     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
6920         r->intflags |= PREGf_NAUGHTY;
6921     scan = ri->program + 1;             /* First BRANCH. */
6922
6923     /* testing for BRANCH here tells us whether there is "must appear"
6924        data in the pattern. If there is then we can use it for optimisations */
6925     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6926                                                   */
6927         SSize_t fake;
6928         STRLEN longest_float_length, longest_fixed_length;
6929         regnode_ssc ch_class; /* pointed to by data */
6930         int stclass_flag;
6931         SSize_t last_close = 0; /* pointed to by data */
6932         regnode *first= scan;
6933         regnode *first_next= regnext(first);
6934         /*
6935          * Skip introductions and multiplicators >= 1
6936          * so that we can extract the 'meat' of the pattern that must
6937          * match in the large if() sequence following.
6938          * NOTE that EXACT is NOT covered here, as it is normally
6939          * picked up by the optimiser separately.
6940          *
6941          * This is unfortunate as the optimiser isnt handling lookahead
6942          * properly currently.
6943          *
6944          */
6945         while ((OP(first) == OPEN && (sawopen = 1)) ||
6946                /* An OR of *one* alternative - should not happen now. */
6947             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6948             /* for now we can't handle lookbehind IFMATCH*/
6949             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6950             (OP(first) == PLUS) ||
6951             (OP(first) == MINMOD) ||
6952                /* An {n,m} with n>0 */
6953             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6954             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6955         {
6956                 /*
6957                  * the only op that could be a regnode is PLUS, all the rest
6958                  * will be regnode_1 or regnode_2.
6959                  *
6960                  * (yves doesn't think this is true)
6961                  */
6962                 if (OP(first) == PLUS)
6963                     sawplus = 1;
6964                 else {
6965                     if (OP(first) == MINMOD)
6966                         sawminmod = 1;
6967                     first += regarglen[OP(first)];
6968                 }
6969                 first = NEXTOPER(first);
6970                 first_next= regnext(first);
6971         }
6972
6973         /* Starting-point info. */
6974       again:
6975         DEBUG_PEEP("first:",first,0);
6976         /* Ignore EXACT as we deal with it later. */
6977         if (PL_regkind[OP(first)] == EXACT) {
6978             if (OP(first) == EXACT || OP(first) == EXACTL)
6979                 NOOP;   /* Empty, get anchored substr later. */
6980             else
6981                 ri->regstclass = first;
6982         }
6983 #ifdef TRIE_STCLASS
6984         else if (PL_regkind[OP(first)] == TRIE &&
6985                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6986         {
6987             /* this can happen only on restudy */
6988             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6989         }
6990 #endif
6991         else if (REGNODE_SIMPLE(OP(first)))
6992             ri->regstclass = first;
6993         else if (PL_regkind[OP(first)] == BOUND ||
6994                  PL_regkind[OP(first)] == NBOUND)
6995             ri->regstclass = first;
6996         else if (PL_regkind[OP(first)] == BOL) {
6997             r->intflags |= (OP(first) == MBOL
6998                            ? PREGf_ANCH_MBOL
6999                            : PREGf_ANCH_SBOL);
7000             first = NEXTOPER(first);
7001             goto again;
7002         }
7003         else if (OP(first) == GPOS) {
7004             r->intflags |= PREGf_ANCH_GPOS;
7005             first = NEXTOPER(first);
7006             goto again;
7007         }
7008         else if ((!sawopen || !RExC_sawback) &&
7009             !sawlookahead &&
7010             (OP(first) == STAR &&
7011             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7012             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7013         {
7014             /* turn .* into ^.* with an implied $*=1 */
7015             const int type =
7016                 (OP(NEXTOPER(first)) == REG_ANY)
7017                     ? PREGf_ANCH_MBOL
7018                     : PREGf_ANCH_SBOL;
7019             r->intflags |= (type | PREGf_IMPLICIT);
7020             first = NEXTOPER(first);
7021             goto again;
7022         }
7023         if (sawplus && !sawminmod && !sawlookahead
7024             && (!sawopen || !RExC_sawback)
7025             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7026             /* x+ must match at the 1st pos of run of x's */
7027             r->intflags |= PREGf_SKIP;
7028
7029         /* Scan is after the zeroth branch, first is atomic matcher. */
7030 #ifdef TRIE_STUDY_OPT
7031         DEBUG_PARSE_r(
7032             if (!restudied)
7033                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7034                               (IV)(first - scan + 1))
7035         );
7036 #else
7037         DEBUG_PARSE_r(
7038             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7039                 (IV)(first - scan + 1))
7040         );
7041 #endif
7042
7043
7044         /*
7045         * If there's something expensive in the r.e., find the
7046         * longest literal string that must appear and make it the
7047         * regmust.  Resolve ties in favor of later strings, since
7048         * the regstart check works with the beginning of the r.e.
7049         * and avoiding duplication strengthens checking.  Not a
7050         * strong reason, but sufficient in the absence of others.
7051         * [Now we resolve ties in favor of the earlier string if
7052         * it happens that c_offset_min has been invalidated, since the
7053         * earlier string may buy us something the later one won't.]
7054         */
7055
7056         data.longest_fixed = newSVpvs("");
7057         data.longest_float = newSVpvs("");
7058         data.last_found = newSVpvs("");
7059         data.longest = &(data.longest_fixed);
7060         ENTER_with_name("study_chunk");
7061         SAVEFREESV(data.longest_fixed);
7062         SAVEFREESV(data.longest_float);
7063         SAVEFREESV(data.last_found);
7064         first = scan;
7065         if (!ri->regstclass) {
7066             ssc_init(pRExC_state, &ch_class);
7067             data.start_class = &ch_class;
7068             stclass_flag = SCF_DO_STCLASS_AND;
7069         } else                          /* XXXX Check for BOUND? */
7070             stclass_flag = 0;
7071         data.last_closep = &last_close;
7072
7073         DEBUG_RExC_seen();
7074         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7075                              scan + RExC_size, /* Up to end */
7076             &data, -1, 0, NULL,
7077             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7078                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7079             0);
7080
7081
7082         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7083
7084
7085         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7086              && data.last_start_min == 0 && data.last_end > 0
7087              && !RExC_seen_zerolen
7088              && !(RExC_seen & REG_VERBARG_SEEN)
7089              && !(RExC_seen & REG_GPOS_SEEN)
7090         ){
7091             r->extflags |= RXf_CHECK_ALL;
7092         }
7093         scan_commit(pRExC_state, &data,&minlen,0);
7094
7095         longest_float_length = CHR_SVLEN(data.longest_float);
7096
7097         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7098                    && data.offset_fixed == data.offset_float_min
7099                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7100             && S_setup_longest (aTHX_ pRExC_state,
7101                                     data.longest_float,
7102                                     &(r->float_utf8),
7103                                     &(r->float_substr),
7104                                     &(r->float_end_shift),
7105                                     data.lookbehind_float,
7106                                     data.offset_float_min,
7107                                     data.minlen_float,
7108                                     longest_float_length,
7109                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
7110                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7111         {
7112             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7113             r->float_max_offset = data.offset_float_max;
7114             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7115                 r->float_max_offset -= data.lookbehind_float;
7116             SvREFCNT_inc_simple_void_NN(data.longest_float);
7117         }
7118         else {
7119             r->float_substr = r->float_utf8 = NULL;
7120             longest_float_length = 0;
7121         }
7122
7123         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7124
7125         if (S_setup_longest (aTHX_ pRExC_state,
7126                                 data.longest_fixed,
7127                                 &(r->anchored_utf8),
7128                                 &(r->anchored_substr),
7129                                 &(r->anchored_end_shift),
7130                                 data.lookbehind_fixed,
7131                                 data.offset_fixed,
7132                                 data.minlen_fixed,
7133                                 longest_fixed_length,
7134                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7135                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7136         {
7137             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7138             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7139         }
7140         else {
7141             r->anchored_substr = r->anchored_utf8 = NULL;
7142             longest_fixed_length = 0;
7143         }
7144         LEAVE_with_name("study_chunk");
7145
7146         if (ri->regstclass
7147             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7148             ri->regstclass = NULL;
7149
7150         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7151             && stclass_flag
7152             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7153             && is_ssc_worth_it(pRExC_state, data.start_class))
7154         {
7155             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7156
7157             ssc_finalize(pRExC_state, data.start_class);
7158
7159             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7160             StructCopy(data.start_class,
7161                        (regnode_ssc*)RExC_rxi->data->data[n],
7162                        regnode_ssc);
7163             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7164             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7165             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7166                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7167                       PerlIO_printf(Perl_debug_log,
7168                                     "synthetic stclass \"%s\".\n",
7169                                     SvPVX_const(sv));});
7170             data.start_class = NULL;
7171         }
7172
7173         /* A temporary algorithm prefers floated substr to fixed one to dig
7174          * more info. */
7175         if (longest_fixed_length > longest_float_length) {
7176             r->substrs->check_ix = 0;
7177             r->check_end_shift = r->anchored_end_shift;
7178             r->check_substr = r->anchored_substr;
7179             r->check_utf8 = r->anchored_utf8;
7180             r->check_offset_min = r->check_offset_max = r->anchored_offset;
7181             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7182                 r->intflags |= PREGf_NOSCAN;
7183         }
7184         else {
7185             r->substrs->check_ix = 1;
7186             r->check_end_shift = r->float_end_shift;
7187             r->check_substr = r->float_substr;
7188             r->check_utf8 = r->float_utf8;
7189             r->check_offset_min = r->float_min_offset;
7190             r->check_offset_max = r->float_max_offset;
7191         }
7192         if ((r->check_substr || r->check_utf8) ) {
7193             r->extflags |= RXf_USE_INTUIT;
7194             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7195                 r->extflags |= RXf_INTUIT_TAIL;
7196         }
7197         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7198
7199         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7200         if ( (STRLEN)minlen < longest_float_length )
7201             minlen= longest_float_length;
7202         if ( (STRLEN)minlen < longest_fixed_length )
7203             minlen= longest_fixed_length;
7204         */
7205     }
7206     else {
7207         /* Several toplevels. Best we can is to set minlen. */
7208         SSize_t fake;
7209         regnode_ssc ch_class;
7210         SSize_t last_close = 0;
7211
7212         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7213
7214         scan = ri->program + 1;
7215         ssc_init(pRExC_state, &ch_class);
7216         data.start_class = &ch_class;
7217         data.last_closep = &last_close;
7218
7219         DEBUG_RExC_seen();
7220         minlen = study_chunk(pRExC_state,
7221             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7222             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7223                                                       ? SCF_TRIE_DOING_RESTUDY
7224                                                       : 0),
7225             0);
7226
7227         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7228
7229         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7230                 = r->float_substr = r->float_utf8 = NULL;
7231
7232         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7233             && is_ssc_worth_it(pRExC_state, data.start_class))
7234         {
7235             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7236
7237             ssc_finalize(pRExC_state, data.start_class);
7238
7239             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7240             StructCopy(data.start_class,
7241                        (regnode_ssc*)RExC_rxi->data->data[n],
7242                        regnode_ssc);
7243             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7244             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7245             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7246                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7247                       PerlIO_printf(Perl_debug_log,
7248                                     "synthetic stclass \"%s\".\n",
7249                                     SvPVX_const(sv));});
7250             data.start_class = NULL;
7251         }
7252     }
7253
7254     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7255         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7256         r->maxlen = REG_INFTY;
7257     }
7258     else {
7259         r->maxlen = RExC_maxlen;
7260     }
7261
7262     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7263        the "real" pattern. */
7264     DEBUG_OPTIMISE_r({
7265         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7266                       (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7267     });
7268     r->minlenret = minlen;
7269     if (r->minlen < minlen)
7270         r->minlen = minlen;
7271
7272     if (RExC_seen & REG_GPOS_SEEN)
7273         r->intflags |= PREGf_GPOS_SEEN;
7274     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7275         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7276                                                 lookbehind */
7277     if (pRExC_state->num_code_blocks)
7278         r->extflags |= RXf_EVAL_SEEN;
7279     if (RExC_seen & REG_CANY_SEEN)
7280         r->intflags |= PREGf_CANY_SEEN;
7281     if (RExC_seen & REG_VERBARG_SEEN)
7282     {
7283         r->intflags |= PREGf_VERBARG_SEEN;
7284         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7285     }
7286     if (RExC_seen & REG_CUTGROUP_SEEN)
7287         r->intflags |= PREGf_CUTGROUP_SEEN;
7288     if (pm_flags & PMf_USE_RE_EVAL)
7289         r->intflags |= PREGf_USE_RE_EVAL;
7290     if (RExC_paren_names)
7291         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7292     else
7293         RXp_PAREN_NAMES(r) = NULL;
7294
7295     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7296      * so it can be used in pp.c */
7297     if (r->intflags & PREGf_ANCH)
7298         r->extflags |= RXf_IS_ANCHORED;
7299
7300
7301     {
7302         /* this is used to identify "special" patterns that might result
7303          * in Perl NOT calling the regex engine and instead doing the match "itself",
7304          * particularly special cases in split//. By having the regex compiler
7305          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7306          * we avoid weird issues with equivalent patterns resulting in different behavior,
7307          * AND we allow non Perl engines to get the same optimizations by the setting the
7308          * flags appropriately - Yves */
7309         regnode *first = ri->program + 1;
7310         U8 fop = OP(first);
7311         regnode *next = NEXTOPER(first);
7312         U8 nop = OP(next);
7313
7314         if (PL_regkind[fop] == NOTHING && nop == END)
7315             r->extflags |= RXf_NULL;
7316         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7317             /* when fop is SBOL first->flags will be true only when it was
7318              * produced by parsing /\A/, and not when parsing /^/. This is
7319              * very important for the split code as there we want to
7320              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7321              * See rt #122761 for more details. -- Yves */
7322             r->extflags |= RXf_START_ONLY;
7323         else if (fop == PLUS
7324                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7325                  && OP(regnext(first)) == END)
7326             r->extflags |= RXf_WHITE;
7327         else if ( r->extflags & RXf_SPLIT
7328                   && (fop == EXACT || fop == EXACTL)
7329                   && STR_LEN(first) == 1
7330                   && *(STRING(first)) == ' '
7331                   && OP(regnext(first)) == END )
7332             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7333
7334     }
7335
7336     if (RExC_contains_locale) {
7337         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7338     }
7339
7340 #ifdef DEBUGGING
7341     if (RExC_paren_names) {
7342         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7343         ri->data->data[ri->name_list_idx]
7344                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7345     } else
7346 #endif
7347         ri->name_list_idx = 0;
7348
7349     if (RExC_recurse_count) {
7350         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7351             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7352             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7353         }
7354     }
7355     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7356     /* assume we don't need to swap parens around before we match */
7357     DEBUG_TEST_r({
7358         PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7359             (unsigned long)RExC_study_chunk_recursed_count);
7360     });
7361     DEBUG_DUMP_r({
7362         DEBUG_RExC_seen();
7363         PerlIO_printf(Perl_debug_log,"Final program:\n");
7364         regdump(r);
7365     });
7366 #ifdef RE_TRACK_PATTERN_OFFSETS
7367     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7368         const STRLEN len = ri->u.offsets[0];
7369         STRLEN i;
7370         GET_RE_DEBUG_FLAGS_DECL;
7371         PerlIO_printf(Perl_debug_log,
7372                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7373         for (i = 1; i <= len; i++) {
7374             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7375                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7376                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7377             }
7378         PerlIO_printf(Perl_debug_log, "\n");
7379     });
7380 #endif
7381
7382 #ifdef USE_ITHREADS
7383     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7384      * by setting the regexp SV to readonly-only instead. If the
7385      * pattern's been recompiled, the USEDness should remain. */
7386     if (old_re && SvREADONLY(old_re))
7387         SvREADONLY_on(rx);
7388 #endif
7389     return rx;
7390 }
7391
7392
7393 SV*
7394 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7395                     const U32 flags)
7396 {
7397     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7398
7399     PERL_UNUSED_ARG(value);
7400
7401     if (flags & RXapif_FETCH) {
7402         return reg_named_buff_fetch(rx, key, flags);
7403     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7404         Perl_croak_no_modify();
7405         return NULL;
7406     } else if (flags & RXapif_EXISTS) {
7407         return reg_named_buff_exists(rx, key, flags)
7408             ? &PL_sv_yes
7409             : &PL_sv_no;
7410     } else if (flags & RXapif_REGNAMES) {
7411         return reg_named_buff_all(rx, flags);
7412     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7413         return reg_named_buff_scalar(rx, flags);
7414     } else {
7415         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7416         return NULL;
7417     }
7418 }
7419
7420 SV*
7421 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7422                          const U32 flags)
7423 {
7424     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7425     PERL_UNUSED_ARG(lastkey);
7426
7427     if (flags & RXapif_FIRSTKEY)
7428         return reg_named_buff_firstkey(rx, flags);
7429     else if (flags & RXapif_NEXTKEY)
7430         return reg_named_buff_nextkey(rx, flags);
7431     else {
7432         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7433                                             (int)flags);
7434         return NULL;
7435     }
7436 }
7437
7438 SV*
7439 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7440                           const U32 flags)
7441 {
7442     AV *retarray = NULL;
7443     SV *ret;
7444     struct regexp *const rx = ReANY(r);
7445
7446     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7447
7448     if (flags & RXapif_ALL)
7449         retarray=newAV();
7450
7451     if (rx && RXp_PAREN_NAMES(rx)) {
7452         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7453         if (he_str) {
7454             IV i;
7455             SV* sv_dat=HeVAL(he_str);
7456             I32 *nums=(I32*)SvPVX(sv_dat);
7457             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7458                 if ((I32)(rx->nparens) >= nums[i]
7459                     && rx->offs[nums[i]].start != -1
7460                     && rx->offs[nums[i]].end != -1)
7461                 {
7462                     ret = newSVpvs("");
7463                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7464                     if (!retarray)
7465                         return ret;
7466                 } else {
7467                     if (retarray)
7468                         ret = newSVsv(&PL_sv_undef);
7469                 }
7470                 if (retarray)
7471                     av_push(retarray, ret);
7472             }
7473             if (retarray)
7474                 return newRV_noinc(MUTABLE_SV(retarray));
7475         }
7476     }
7477     return NULL;
7478 }
7479
7480 bool
7481 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7482                            const U32 flags)
7483 {
7484     struct regexp *const rx = ReANY(r);
7485
7486     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7487
7488     if (rx && RXp_PAREN_NAMES(rx)) {
7489         if (flags & RXapif_ALL) {
7490             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7491         } else {
7492             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7493             if (sv) {
7494                 SvREFCNT_dec_NN(sv);
7495                 return TRUE;
7496             } else {
7497                 return FALSE;
7498             }
7499         }
7500     } else {
7501         return FALSE;
7502     }
7503 }
7504
7505 SV*
7506 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7507 {
7508     struct regexp *const rx = ReANY(r);
7509
7510     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7511
7512     if ( rx && RXp_PAREN_NAMES(rx) ) {
7513         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7514
7515         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7516     } else {
7517         return FALSE;
7518     }
7519 }
7520
7521 SV*
7522 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7523 {
7524     struct regexp *const rx = ReANY(r);
7525     GET_RE_DEBUG_FLAGS_DECL;
7526
7527     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7528
7529     if (rx && RXp_PAREN_NAMES(rx)) {
7530         HV *hv = RXp_PAREN_NAMES(rx);
7531         HE *temphe;
7532         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7533             IV i;
7534             IV parno = 0;
7535             SV* sv_dat = HeVAL(temphe);
7536             I32 *nums = (I32*)SvPVX(sv_dat);
7537             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7538                 if ((I32)(rx->lastparen) >= nums[i] &&
7539                     rx->offs[nums[i]].start != -1 &&
7540                     rx->offs[nums[i]].end != -1)
7541                 {
7542                     parno = nums[i];
7543                     break;
7544                 }
7545             }
7546             if (parno || flags & RXapif_ALL) {
7547                 return newSVhek(HeKEY_hek(temphe));
7548             }
7549         }
7550     }
7551     return NULL;
7552 }
7553
7554 SV*
7555 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7556 {
7557     SV *ret;
7558     AV *av;
7559     SSize_t length;
7560     struct regexp *const rx = ReANY(r);
7561
7562     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7563
7564     if (rx && RXp_PAREN_NAMES(rx)) {
7565         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7566             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7567         } else if (flags & RXapif_ONE) {
7568             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7569             av = MUTABLE_AV(SvRV(ret));
7570             length = av_tindex(av);
7571             SvREFCNT_dec_NN(ret);
7572             return newSViv(length + 1);
7573         } else {
7574             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7575                                                 (int)flags);
7576             return NULL;
7577         }
7578     }
7579     return &PL_sv_undef;
7580 }
7581
7582 SV*
7583 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7584 {
7585     struct regexp *const rx = ReANY(r);
7586     AV *av = newAV();
7587
7588     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7589
7590     if (rx && RXp_PAREN_NAMES(rx)) {
7591         HV *hv= RXp_PAREN_NAMES(rx);
7592         HE *temphe;
7593         (void)hv_iterinit(hv);
7594         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7595             IV i;
7596             IV parno = 0;
7597             SV* sv_dat = HeVAL(temphe);
7598             I32 *nums = (I32*)SvPVX(sv_dat);
7599             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7600                 if ((I32)(rx->lastparen) >= nums[i] &&
7601                     rx->offs[nums[i]].start != -1 &&
7602                     rx->offs[nums[i]].end != -1)
7603                 {
7604                     parno = nums[i];
7605                     break;
7606                 }
7607             }
7608             if (parno || flags & RXapif_ALL) {
7609                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7610             }
7611         }
7612     }
7613
7614     return newRV_noinc(MUTABLE_SV(av));
7615 }
7616
7617 void
7618 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7619                              SV * const sv)
7620 {
7621     struct regexp *const rx = ReANY(r);
7622     char *s = NULL;
7623     SSize_t i = 0;
7624     SSize_t s1, t1;
7625     I32 n = paren;
7626
7627     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7628
7629     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7630            || n == RX_BUFF_IDX_CARET_FULLMATCH
7631            || n == RX_BUFF_IDX_CARET_POSTMATCH
7632        )
7633     {
7634         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7635         if (!keepcopy) {
7636             /* on something like
7637              *    $r = qr/.../;
7638              *    /$qr/p;
7639              * the KEEPCOPY is set on the PMOP rather than the regex */
7640             if (PL_curpm && r == PM_GETRE(PL_curpm))
7641                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7642         }
7643         if (!keepcopy)
7644             goto ret_undef;
7645     }
7646
7647     if (!rx->subbeg)
7648         goto ret_undef;
7649
7650     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7651         /* no need to distinguish between them any more */
7652         n = RX_BUFF_IDX_FULLMATCH;
7653
7654     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7655         && rx->offs[0].start != -1)
7656     {
7657         /* $`, ${^PREMATCH} */
7658         i = rx->offs[0].start;
7659         s = rx->subbeg;
7660     }
7661     else
7662     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7663         && rx->offs[0].end != -1)
7664     {
7665         /* $', ${^POSTMATCH} */
7666         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7667         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7668     }
7669     else
7670     if ( 0 <= n && n <= (I32)rx->nparens &&
7671         (s1 = rx->offs[n].start) != -1 &&
7672         (t1 = rx->offs[n].end) != -1)
7673     {
7674         /* $&, ${^MATCH},  $1 ... */
7675         i = t1 - s1;
7676         s = rx->subbeg + s1 - rx->suboffset;
7677     } else {
7678         goto ret_undef;
7679     }
7680
7681     assert(s >= rx->subbeg);
7682     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7683     if (i >= 0) {
7684 #ifdef NO_TAINT_SUPPORT
7685         sv_setpvn(sv, s, i);
7686 #else
7687         const int oldtainted = TAINT_get;
7688         TAINT_NOT;
7689         sv_setpvn(sv, s, i);
7690         TAINT_set(oldtainted);
7691 #endif
7692         if ( (rx->intflags & PREGf_CANY_SEEN)
7693             ? (RXp_MATCH_UTF8(rx)
7694                         && (!i || is_utf8_string((U8*)s, i)))
7695             : (RXp_MATCH_UTF8(rx)) )
7696         {
7697             SvUTF8_on(sv);
7698         }
7699         else
7700             SvUTF8_off(sv);
7701         if (TAINTING_get) {
7702             if (RXp_MATCH_TAINTED(rx)) {
7703                 if (SvTYPE(sv) >= SVt_PVMG) {
7704                     MAGIC* const mg = SvMAGIC(sv);
7705                     MAGIC* mgt;
7706                     TAINT;
7707                     SvMAGIC_set(sv, mg->mg_moremagic);
7708                     SvTAINT(sv);
7709                     if ((mgt = SvMAGIC(sv))) {
7710                         mg->mg_moremagic = mgt;
7711                         SvMAGIC_set(sv, mg);
7712                     }
7713                 } else {
7714                     TAINT;
7715                     SvTAINT(sv);
7716                 }
7717             } else
7718                 SvTAINTED_off(sv);
7719         }
7720     } else {
7721       ret_undef:
7722         sv_setsv(sv,&PL_sv_undef);
7723         return;
7724     }
7725 }
7726
7727 void
7728 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7729                                                          SV const * const value)
7730 {
7731     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7732
7733     PERL_UNUSED_ARG(rx);
7734     PERL_UNUSED_ARG(paren);
7735     PERL_UNUSED_ARG(value);
7736
7737     if (!PL_localizing)
7738         Perl_croak_no_modify();
7739 }
7740
7741 I32
7742 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7743                               const I32 paren)
7744 {
7745     struct regexp *const rx = ReANY(r);
7746     I32 i;
7747     I32 s1, t1;
7748
7749     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7750
7751     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7752         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7753         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7754     )
7755     {
7756         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7757         if (!keepcopy) {
7758             /* on something like
7759              *    $r = qr/.../;
7760              *    /$qr/p;
7761              * the KEEPCOPY is set on the PMOP rather than the regex */
7762             if (PL_curpm && r == PM_GETRE(PL_curpm))
7763                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7764         }
7765         if (!keepcopy)
7766             goto warn_undef;
7767     }
7768
7769     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7770     switch (paren) {
7771       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7772       case RX_BUFF_IDX_PREMATCH:       /* $` */
7773         if (rx->offs[0].start != -1) {
7774                         i = rx->offs[0].start;
7775                         if (i > 0) {
7776                                 s1 = 0;
7777                                 t1 = i;
7778                                 goto getlen;
7779                         }
7780             }
7781         return 0;
7782
7783       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7784       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7785             if (rx->offs[0].end != -1) {
7786                         i = rx->sublen - rx->offs[0].end;
7787                         if (i > 0) {
7788                                 s1 = rx->offs[0].end;
7789                                 t1 = rx->sublen;
7790                                 goto getlen;
7791                         }
7792             }
7793         return 0;
7794
7795       default: /* $& / ${^MATCH}, $1, $2, ... */
7796             if (paren <= (I32)rx->nparens &&
7797             (s1 = rx->offs[paren].start) != -1 &&
7798             (t1 = rx->offs[paren].end) != -1)
7799             {
7800             i = t1 - s1;
7801             goto getlen;
7802         } else {
7803           warn_undef:
7804             if (ckWARN(WARN_UNINITIALIZED))
7805                 report_uninit((const SV *)sv);
7806             return 0;
7807         }
7808     }
7809   getlen:
7810     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7811         const char * const s = rx->subbeg - rx->suboffset + s1;
7812         const U8 *ep;
7813         STRLEN el;
7814
7815         i = t1 - s1;
7816         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7817                         i = el;
7818     }
7819     return i;
7820 }
7821
7822 SV*
7823 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7824 {
7825     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7826         PERL_UNUSED_ARG(rx);
7827         if (0)
7828             return NULL;
7829         else
7830             return newSVpvs("Regexp");
7831 }
7832
7833 /* Scans the name of a named buffer from the pattern.
7834  * If flags is REG_RSN_RETURN_NULL returns null.
7835  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7836  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7837  * to the parsed name as looked up in the RExC_paren_names hash.
7838  * If there is an error throws a vFAIL().. type exception.
7839  */
7840
7841 #define REG_RSN_RETURN_NULL    0
7842 #define REG_RSN_RETURN_NAME    1
7843 #define REG_RSN_RETURN_DATA    2
7844
7845 STATIC SV*
7846 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7847 {
7848     char *name_start = RExC_parse;
7849
7850     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7851
7852     assert (RExC_parse <= RExC_end);
7853     if (RExC_parse == RExC_end) NOOP;
7854     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7855          /* skip IDFIRST by using do...while */
7856         if (UTF)
7857             do {
7858                 RExC_parse += UTF8SKIP(RExC_parse);
7859             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7860         else
7861             do {
7862                 RExC_parse++;
7863             } while (isWORDCHAR(*RExC_parse));
7864     } else {
7865         RExC_parse++; /* so the <- from the vFAIL is after the offending
7866                          character */
7867         vFAIL("Group name must start with a non-digit word character");
7868     }
7869     if ( flags ) {
7870         SV* sv_name
7871             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7872                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7873         if ( flags == REG_RSN_RETURN_NAME)
7874             return sv_name;
7875         else if (flags==REG_RSN_RETURN_DATA) {
7876             HE *he_str = NULL;
7877             SV *sv_dat = NULL;
7878             if ( ! sv_name )      /* should not happen*/
7879                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7880             if (RExC_paren_names)
7881                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7882             if ( he_str )
7883                 sv_dat = HeVAL(he_str);
7884             if ( ! sv_dat )
7885                 vFAIL("Reference to nonexistent named group");
7886             return sv_dat;
7887         }
7888         else {
7889             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7890                        (unsigned long) flags);
7891         }
7892         NOT_REACHED; /* NOT REACHED */
7893     }
7894     return NULL;
7895 }
7896
7897 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7898     int num;                                                    \
7899     if (RExC_lastparse!=RExC_parse) {                           \
7900         PerlIO_printf(Perl_debug_log, "%s",                     \
7901             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
7902                 RExC_end - RExC_parse, 16,                      \
7903                 "", "",                                         \
7904                 PERL_PV_ESCAPE_UNI_DETECT |                     \
7905                 PERL_PV_PRETTY_ELLIPSES   |                     \
7906                 PERL_PV_PRETTY_LTGT       |                     \
7907                 PERL_PV_ESCAPE_RE         |                     \
7908                 PERL_PV_PRETTY_EXACTSIZE                        \
7909             )                                                   \
7910         );                                                      \
7911     } else                                                      \
7912         PerlIO_printf(Perl_debug_log,"%16s","");                \
7913                                                                 \
7914     if (SIZE_ONLY)                                              \
7915        num = RExC_size + 1;                                     \
7916     else                                                        \
7917        num=REG_NODE_NUM(RExC_emit);                             \
7918     if (RExC_lastnum!=num)                                      \
7919        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7920     else                                                        \
7921        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7922     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7923         (int)((depth*2)), "",                                   \
7924         (funcname)                                              \
7925     );                                                          \
7926     RExC_lastnum=num;                                           \
7927     RExC_lastparse=RExC_parse;                                  \
7928 })
7929
7930
7931
7932 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7933     DEBUG_PARSE_MSG((funcname));                            \
7934     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7935 })
7936 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7937     DEBUG_PARSE_MSG((funcname));                            \
7938     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7939 })
7940
7941 /* This section of code defines the inversion list object and its methods.  The
7942  * interfaces are highly subject to change, so as much as possible is static to
7943  * this file.  An inversion list is here implemented as a malloc'd C UV array
7944  * as an SVt_INVLIST scalar.
7945  *
7946  * An inversion list for Unicode is an array of code points, sorted by ordinal
7947  * number.  The zeroth element is the first code point in the list.  The 1th
7948  * element is the first element beyond that not in the list.  In other words,
7949  * the first range is
7950  *  invlist[0]..(invlist[1]-1)
7951  * The other ranges follow.  Thus every element whose index is divisible by two
7952  * marks the beginning of a range that is in the list, and every element not
7953  * divisible by two marks the beginning of a range not in the list.  A single
7954  * element inversion list that contains the single code point N generally
7955  * consists of two elements
7956  *  invlist[0] == N
7957  *  invlist[1] == N+1
7958  * (The exception is when N is the highest representable value on the
7959  * machine, in which case the list containing just it would be a single
7960  * element, itself.  By extension, if the last range in the list extends to
7961  * infinity, then the first element of that range will be in the inversion list
7962  * at a position that is divisible by two, and is the final element in the
7963  * list.)
7964  * Taking the complement (inverting) an inversion list is quite simple, if the
7965  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7966  * This implementation reserves an element at the beginning of each inversion
7967  * list to always contain 0; there is an additional flag in the header which
7968  * indicates if the list begins at the 0, or is offset to begin at the next
7969  * element.
7970  *
7971  * More about inversion lists can be found in "Unicode Demystified"
7972  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7973  * More will be coming when functionality is added later.
7974  *
7975  * The inversion list data structure is currently implemented as an SV pointing
7976  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7977  * array of UV whose memory management is automatically handled by the existing
7978  * facilities for SV's.
7979  *
7980  * Some of the methods should always be private to the implementation, and some
7981  * should eventually be made public */
7982
7983 /* The header definitions are in F<inline_invlist.c> */
7984
7985 PERL_STATIC_INLINE UV*
7986 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7987 {
7988     /* Returns a pointer to the first element in the inversion list's array.
7989      * This is called upon initialization of an inversion list.  Where the
7990      * array begins depends on whether the list has the code point U+0000 in it
7991      * or not.  The other parameter tells it whether the code that follows this
7992      * call is about to put a 0 in the inversion list or not.  The first
7993      * element is either the element reserved for 0, if TRUE, or the element
7994      * after it, if FALSE */
7995
7996     bool* offset = get_invlist_offset_addr(invlist);
7997     UV* zero_addr = (UV *) SvPVX(invlist);
7998
7999     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8000
8001     /* Must be empty */
8002     assert(! _invlist_len(invlist));
8003
8004     *zero_addr = 0;
8005
8006     /* 1^1 = 0; 1^0 = 1 */
8007     *offset = 1 ^ will_have_0;
8008     return zero_addr + *offset;
8009 }
8010
8011 PERL_STATIC_INLINE UV*
8012 S_invlist_array(SV* const invlist)
8013 {
8014     /* Returns the pointer to the inversion list's array.  Every time the
8015      * length changes, this needs to be called in case malloc or realloc moved
8016      * it */
8017
8018     PERL_ARGS_ASSERT_INVLIST_ARRAY;
8019
8020     /* Must not be empty.  If these fail, you probably didn't check for <len>
8021      * being non-zero before trying to get the array */
8022     assert(_invlist_len(invlist));
8023
8024     /* The very first element always contains zero, The array begins either
8025      * there, or if the inversion list is offset, at the element after it.
8026      * The offset header field determines which; it contains 0 or 1 to indicate
8027      * how much additionally to add */
8028     assert(0 == *(SvPVX(invlist)));
8029     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
8030 }
8031
8032 PERL_STATIC_INLINE void
8033 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8034 {
8035     /* Sets the current number of elements stored in the inversion list.
8036      * Updates SvCUR correspondingly */
8037     PERL_UNUSED_CONTEXT;
8038     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8039
8040     assert(SvTYPE(invlist) == SVt_INVLIST);
8041
8042     SvCUR_set(invlist,
8043               (len == 0)
8044                ? 0
8045                : TO_INTERNAL_SIZE(len + offset));
8046     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8047 }
8048
8049 #ifndef PERL_IN_XSUB_RE
8050
8051 PERL_STATIC_INLINE IV*
8052 S_get_invlist_previous_index_addr(SV* invlist)
8053 {
8054     /* Return the address of the IV that is reserved to hold the cached index
8055      * */
8056     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8057
8058     assert(SvTYPE(invlist) == SVt_INVLIST);
8059
8060     return &(((XINVLIST*) SvANY(invlist))->prev_index);
8061 }
8062
8063 PERL_STATIC_INLINE IV
8064 S_invlist_previous_index(SV* const invlist)
8065 {
8066     /* Returns cached index of previous search */
8067
8068     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8069
8070     return *get_invlist_previous_index_addr(invlist);
8071 }
8072
8073 PERL_STATIC_INLINE void
8074 S_invlist_set_previous_index(SV* const invlist, const IV index)
8075 {
8076     /* Caches <index> for later retrieval */
8077
8078     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8079
8080     assert(index == 0 || index < (int) _invlist_len(invlist));
8081
8082     *get_invlist_previous_index_addr(invlist) = index;
8083 }
8084
8085 PERL_STATIC_INLINE void
8086 S_invlist_trim(SV* const invlist)
8087 {
8088     PERL_ARGS_ASSERT_INVLIST_TRIM;
8089
8090     assert(SvTYPE(invlist) == SVt_INVLIST);
8091
8092     /* Change the length of the inversion list to how many entries it currently
8093      * has */
8094     SvPV_shrink_to_cur((SV *) invlist);
8095 }
8096
8097 PERL_STATIC_INLINE bool
8098 S_invlist_is_iterating(SV* const invlist)
8099 {
8100     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8101
8102     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8103 }
8104
8105 #endif /* ifndef PERL_IN_XSUB_RE */
8106
8107 PERL_STATIC_INLINE UV
8108 S_invlist_max(SV* const invlist)
8109 {
8110     /* Returns the maximum number of elements storable in the inversion list's
8111      * array, without having to realloc() */
8112
8113     PERL_ARGS_ASSERT_INVLIST_MAX;
8114
8115     assert(SvTYPE(invlist) == SVt_INVLIST);
8116
8117     /* Assumes worst case, in which the 0 element is not counted in the
8118      * inversion list, so subtracts 1 for that */
8119     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8120            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8121            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8122 }
8123
8124 #ifndef PERL_IN_XSUB_RE
8125 SV*
8126 Perl__new_invlist(pTHX_ IV initial_size)
8127 {
8128
8129     /* Return a pointer to a newly constructed inversion list, with enough
8130      * space to store 'initial_size' elements.  If that number is negative, a
8131      * system default is used instead */
8132
8133     SV* new_list;
8134
8135     if (initial_size < 0) {
8136         initial_size = 10;
8137     }
8138
8139     /* Allocate the initial space */
8140     new_list = newSV_type(SVt_INVLIST);
8141
8142     /* First 1 is in case the zero element isn't in the list; second 1 is for
8143      * trailing NUL */
8144     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8145     invlist_set_len(new_list, 0, 0);
8146
8147     /* Force iterinit() to be used to get iteration to work */
8148     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8149
8150     *get_invlist_previous_index_addr(new_list) = 0;
8151
8152     return new_list;
8153 }
8154
8155 SV*
8156 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8157 {
8158     /* Return a pointer to a newly constructed inversion list, initialized to
8159      * point to <list>, which has to be in the exact correct inversion list
8160      * form, including internal fields.  Thus this is a dangerous routine that
8161      * should not be used in the wrong hands.  The passed in 'list' contains
8162      * several header fields at the beginning that are not part of the
8163      * inversion list body proper */
8164
8165     const STRLEN length = (STRLEN) list[0];
8166     const UV version_id =          list[1];
8167     const bool offset   =    cBOOL(list[2]);
8168 #define HEADER_LENGTH 3
8169     /* If any of the above changes in any way, you must change HEADER_LENGTH
8170      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8171      *      perl -E 'say int(rand 2**31-1)'
8172      */
8173 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8174                                         data structure type, so that one being
8175                                         passed in can be validated to be an
8176                                         inversion list of the correct vintage.
8177                                        */
8178
8179     SV* invlist = newSV_type(SVt_INVLIST);
8180
8181     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8182
8183     if (version_id != INVLIST_VERSION_ID) {
8184         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8185     }
8186
8187     /* The generated array passed in includes header elements that aren't part
8188      * of the list proper, so start it just after them */
8189     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8190
8191     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8192                                shouldn't touch it */
8193
8194     *(get_invlist_offset_addr(invlist)) = offset;
8195
8196     /* The 'length' passed to us is the physical number of elements in the
8197      * inversion list.  But if there is an offset the logical number is one
8198      * less than that */
8199     invlist_set_len(invlist, length  - offset, offset);
8200
8201     invlist_set_previous_index(invlist, 0);
8202
8203     /* Initialize the iteration pointer. */
8204     invlist_iterfinish(invlist);
8205
8206     SvREADONLY_on(invlist);
8207
8208     return invlist;
8209 }
8210 #endif /* ifndef PERL_IN_XSUB_RE */
8211
8212 STATIC void
8213 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8214 {
8215     /* Grow the maximum size of an inversion list */
8216
8217     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8218
8219     assert(SvTYPE(invlist) == SVt_INVLIST);
8220
8221     /* Add one to account for the zero element at the beginning which may not
8222      * be counted by the calling parameters */
8223     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8224 }
8225
8226 STATIC void
8227 S__append_range_to_invlist(pTHX_ SV* const invlist,
8228                                  const UV start, const UV end)
8229 {
8230    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8231     * the end of the inversion list.  The range must be above any existing
8232     * ones. */
8233
8234     UV* array;
8235     UV max = invlist_max(invlist);
8236     UV len = _invlist_len(invlist);
8237     bool offset;
8238
8239     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8240
8241     if (len == 0) { /* Empty lists must be initialized */
8242         offset = start != 0;
8243         array = _invlist_array_init(invlist, ! offset);
8244     }
8245     else {
8246         /* Here, the existing list is non-empty. The current max entry in the
8247          * list is generally the first value not in the set, except when the
8248          * set extends to the end of permissible values, in which case it is
8249          * the first entry in that final set, and so this call is an attempt to
8250          * append out-of-order */
8251
8252         UV final_element = len - 1;
8253         array = invlist_array(invlist);
8254         if (array[final_element] > start
8255             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8256         {
8257             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8258                      array[final_element], start,
8259                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8260         }
8261
8262         /* Here, it is a legal append.  If the new range begins with the first
8263          * value not in the set, it is extending the set, so the new first
8264          * value not in the set is one greater than the newly extended range.
8265          * */
8266         offset = *get_invlist_offset_addr(invlist);
8267         if (array[final_element] == start) {
8268             if (end != UV_MAX) {
8269                 array[final_element] = end + 1;
8270             }
8271             else {
8272                 /* But if the end is the maximum representable on the machine,
8273                  * just let the range that this would extend to have no end */
8274                 invlist_set_len(invlist, len - 1, offset);
8275             }
8276             return;
8277         }
8278     }
8279
8280     /* Here the new range doesn't extend any existing set.  Add it */
8281
8282     len += 2;   /* Includes an element each for the start and end of range */
8283
8284     /* If wll overflow the existing space, extend, which may cause the array to
8285      * be moved */
8286     if (max < len) {
8287         invlist_extend(invlist, len);
8288
8289         /* Have to set len here to avoid assert failure in invlist_array() */
8290         invlist_set_len(invlist, len, offset);
8291
8292         array = invlist_array(invlist);
8293     }
8294     else {
8295         invlist_set_len(invlist, len, offset);
8296     }
8297
8298     /* The next item on the list starts the range, the one after that is
8299      * one past the new range.  */
8300     array[len - 2] = start;
8301     if (end != UV_MAX) {
8302         array[len - 1] = end + 1;
8303     }
8304     else {
8305         /* But if the end is the maximum representable on the machine, just let
8306          * the range have no end */
8307         invlist_set_len(invlist, len - 1, offset);
8308     }
8309 }
8310
8311 #ifndef PERL_IN_XSUB_RE
8312
8313 IV
8314 Perl__invlist_search(SV* const invlist, const UV cp)
8315 {
8316     /* Searches the inversion list for the entry that contains the input code
8317      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8318      * return value is the index into the list's array of the range that
8319      * contains <cp> */
8320
8321     IV low = 0;
8322     IV mid;
8323     IV high = _invlist_len(invlist);
8324     const IV highest_element = high - 1;
8325     const UV* array;
8326
8327     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8328
8329     /* If list is empty, return failure. */
8330     if (high == 0) {
8331         return -1;
8332     }
8333
8334     /* (We can't get the array unless we know the list is non-empty) */
8335     array = invlist_array(invlist);
8336
8337     mid = invlist_previous_index(invlist);
8338     assert(mid >=0 && mid <= highest_element);
8339
8340     /* <mid> contains the cache of the result of the previous call to this
8341      * function (0 the first time).  See if this call is for the same result,
8342      * or if it is for mid-1.  This is under the theory that calls to this
8343      * function will often be for related code points that are near each other.
8344      * And benchmarks show that caching gives better results.  We also test
8345      * here if the code point is within the bounds of the list.  These tests
8346      * replace others that would have had to be made anyway to make sure that
8347      * the array bounds were not exceeded, and these give us extra information
8348      * at the same time */
8349     if (cp >= array[mid]) {
8350         if (cp >= array[highest_element]) {
8351             return highest_element;
8352         }
8353
8354         /* Here, array[mid] <= cp < array[highest_element].  This means that
8355          * the final element is not the answer, so can exclude it; it also
8356          * means that <mid> is not the final element, so can refer to 'mid + 1'
8357          * safely */
8358         if (cp < array[mid + 1]) {
8359             return mid;
8360         }
8361         high--;
8362         low = mid + 1;
8363     }
8364     else { /* cp < aray[mid] */
8365         if (cp < array[0]) { /* Fail if outside the array */
8366             return -1;
8367         }
8368         high = mid;
8369         if (cp >= array[mid - 1]) {
8370             goto found_entry;
8371         }
8372     }
8373
8374     /* Binary search.  What we are looking for is <i> such that
8375      *  array[i] <= cp < array[i+1]
8376      * The loop below converges on the i+1.  Note that there may not be an
8377      * (i+1)th element in the array, and things work nonetheless */
8378     while (low < high) {
8379         mid = (low + high) / 2;
8380         assert(mid <= highest_element);
8381         if (array[mid] <= cp) { /* cp >= array[mid] */
8382             low = mid + 1;
8383
8384             /* We could do this extra test to exit the loop early.
8385             if (cp < array[low]) {
8386                 return mid;
8387             }
8388             */
8389         }
8390         else { /* cp < array[mid] */
8391             high = mid;
8392         }
8393     }
8394
8395   found_entry:
8396     high--;
8397     invlist_set_previous_index(invlist, high);
8398     return high;
8399 }
8400
8401 void
8402 Perl__invlist_populate_swatch(SV* const invlist,
8403                               const UV start, const UV end, U8* swatch)
8404 {
8405     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8406      * but is used when the swash has an inversion list.  This makes this much
8407      * faster, as it uses a binary search instead of a linear one.  This is
8408      * intimately tied to that function, and perhaps should be in utf8.c,
8409      * except it is intimately tied to inversion lists as well.  It assumes
8410      * that <swatch> is all 0's on input */
8411
8412     UV current = start;
8413     const IV len = _invlist_len(invlist);
8414     IV i;
8415     const UV * array;
8416
8417     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8418
8419     if (len == 0) { /* Empty inversion list */
8420         return;
8421     }
8422
8423     array = invlist_array(invlist);
8424
8425     /* Find which element it is */
8426     i = _invlist_search(invlist, start);
8427
8428     /* We populate from <start> to <end> */
8429     while (current < end) {
8430         UV upper;
8431
8432         /* The inversion list gives the results for every possible code point
8433          * after the first one in the list.  Only those ranges whose index is
8434          * even are ones that the inversion list matches.  For the odd ones,
8435          * and if the initial code point is not in the list, we have to skip
8436          * forward to the next element */
8437         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8438             i++;
8439             if (i >= len) { /* Finished if beyond the end of the array */
8440                 return;
8441             }
8442             current = array[i];
8443             if (current >= end) {   /* Finished if beyond the end of what we
8444                                        are populating */
8445                 if (LIKELY(end < UV_MAX)) {
8446                     return;
8447                 }
8448
8449                 /* We get here when the upper bound is the maximum
8450                  * representable on the machine, and we are looking for just
8451                  * that code point.  Have to special case it */
8452                 i = len;
8453                 goto join_end_of_list;
8454             }
8455         }
8456         assert(current >= start);
8457
8458         /* The current range ends one below the next one, except don't go past
8459          * <end> */
8460         i++;
8461         upper = (i < len && array[i] < end) ? array[i] : end;
8462
8463         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8464          * for each code point in it */
8465         for (; current < upper; current++) {
8466             const STRLEN offset = (STRLEN)(current - start);
8467             swatch[offset >> 3] |= 1 << (offset & 7);
8468         }
8469
8470     join_end_of_list:
8471
8472         /* Quit if at the end of the list */
8473         if (i >= len) {
8474
8475             /* But first, have to deal with the highest possible code point on
8476              * the platform.  The previous code assumes that <end> is one
8477              * beyond where we want to populate, but that is impossible at the
8478              * platform's infinity, so have to handle it specially */
8479             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8480             {
8481                 const STRLEN offset = (STRLEN)(end - start);
8482                 swatch[offset >> 3] |= 1 << (offset & 7);
8483             }
8484             return;
8485         }
8486
8487         /* Advance to the next range, which will be for code points not in the
8488          * inversion list */
8489         current = array[i];
8490     }
8491
8492     return;
8493 }
8494
8495 void
8496 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8497                                          const bool complement_b, SV** output)
8498 {
8499     /* Take the union of two inversion lists and point <output> to it.  *output
8500      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8501      * the reference count to that list will be decremented if not already a
8502      * temporary (mortal); otherwise *output will be made correspondingly
8503      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8504      * second list is returned.  If <complement_b> is TRUE, the union is taken
8505      * of the complement (inversion) of <b> instead of b itself.
8506      *
8507      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8508      * Richard Gillam, published by Addison-Wesley, and explained at some
8509      * length there.  The preface says to incorporate its examples into your
8510      * code at your own risk.
8511      *
8512      * The algorithm is like a merge sort.
8513      *
8514      * XXX A potential performance improvement is to keep track as we go along
8515      * if only one of the inputs contributes to the result, meaning the other
8516      * is a subset of that one.  In that case, we can skip the final copy and
8517      * return the larger of the input lists, but then outside code might need
8518      * to keep track of whether to free the input list or not */
8519
8520     const UV* array_a;    /* a's array */
8521     const UV* array_b;
8522     UV len_a;       /* length of a's array */
8523     UV len_b;
8524
8525     SV* u;                      /* the resulting union */
8526     UV* array_u;
8527     UV len_u;
8528
8529     UV i_a = 0;             /* current index into a's array */
8530     UV i_b = 0;
8531     UV i_u = 0;
8532
8533     /* running count, as explained in the algorithm source book; items are
8534      * stopped accumulating and are output when the count changes to/from 0.
8535      * The count is incremented when we start a range that's in the set, and
8536      * decremented when we start a range that's not in the set.  So its range
8537      * is 0 to 2.  Only when the count is zero is something not in the set.
8538      */
8539     UV count = 0;
8540
8541     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8542     assert(a != b);
8543
8544     /* If either one is empty, the union is the other one */
8545     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8546         bool make_temp = FALSE; /* Should we mortalize the result? */
8547
8548         if (*output == a) {
8549             if (a != NULL) {
8550                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8551                     SvREFCNT_dec_NN(a);
8552                 }
8553             }
8554         }
8555         if (*output != b) {
8556             *output = invlist_clone(b);
8557             if (complement_b) {
8558                 _invlist_invert(*output);
8559             }
8560         } /* else *output already = b; */
8561
8562         if (make_temp) {
8563             sv_2mortal(*output);
8564         }
8565         return;
8566     }
8567     else if ((len_b = _invlist_len(b)) == 0) {
8568         bool make_temp = FALSE;
8569         if (*output == b) {
8570             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8571                 SvREFCNT_dec_NN(b);
8572             }
8573         }
8574
8575         /* The complement of an empty list is a list that has everything in it,
8576          * so the union with <a> includes everything too */
8577         if (complement_b) {
8578             if (a == *output) {
8579                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8580                     SvREFCNT_dec_NN(a);
8581                 }
8582             }
8583             *output = _new_invlist(1);
8584             _append_range_to_invlist(*output, 0, UV_MAX);
8585         }
8586         else if (*output != a) {
8587             *output = invlist_clone(a);
8588         }
8589         /* else *output already = a; */
8590
8591         if (make_temp) {
8592             sv_2mortal(*output);
8593         }
8594         return;
8595     }
8596
8597     /* Here both lists exist and are non-empty */
8598     array_a = invlist_array(a);
8599     array_b = invlist_array(b);
8600
8601     /* If are to take the union of 'a' with the complement of b, set it
8602      * up so are looking at b's complement. */
8603     if (complement_b) {
8604
8605         /* To complement, we invert: if the first element is 0, remove it.  To
8606          * do this, we just pretend the array starts one later */
8607         if (array_b[0] == 0) {
8608             array_b++;
8609             len_b--;
8610         }
8611         else {
8612
8613             /* But if the first element is not zero, we pretend the list starts
8614              * at the 0 that is always stored immediately before the array. */
8615             array_b--;
8616             len_b++;
8617         }
8618     }
8619
8620     /* Size the union for the worst case: that the sets are completely
8621      * disjoint */
8622     u = _new_invlist(len_a + len_b);
8623
8624     /* Will contain U+0000 if either component does */
8625     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8626                                       || (len_b > 0 && array_b[0] == 0));
8627
8628     /* Go through each list item by item, stopping when exhausted one of
8629      * them */
8630     while (i_a < len_a && i_b < len_b) {
8631         UV cp;      /* The element to potentially add to the union's array */
8632         bool cp_in_set;   /* is it in the the input list's set or not */
8633
8634         /* We need to take one or the other of the two inputs for the union.
8635          * Since we are merging two sorted lists, we take the smaller of the
8636          * next items.  In case of a tie, we take the one that is in its set
8637          * first.  If we took one not in the set first, it would decrement the
8638          * count, possibly to 0 which would cause it to be output as ending the
8639          * range, and the next time through we would take the same number, and
8640          * output it again as beginning the next range.  By doing it the
8641          * opposite way, there is no possibility that the count will be
8642          * momentarily decremented to 0, and thus the two adjoining ranges will
8643          * be seamlessly merged.  (In a tie and both are in the set or both not
8644          * in the set, it doesn't matter which we take first.) */
8645         if (array_a[i_a] < array_b[i_b]
8646             || (array_a[i_a] == array_b[i_b]
8647                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8648         {
8649             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8650             cp= array_a[i_a++];
8651         }
8652         else {
8653             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8654             cp = array_b[i_b++];
8655         }
8656
8657         /* Here, have chosen which of the two inputs to look at.  Only output
8658          * if the running count changes to/from 0, which marks the
8659          * beginning/end of a range in that's in the set */
8660         if (cp_in_set) {
8661             if (count == 0) {
8662                 array_u[i_u++] = cp;
8663             }
8664             count++;
8665         }
8666         else {
8667             count--;
8668             if (count == 0) {
8669                 array_u[i_u++] = cp;
8670             }
8671         }
8672     }
8673
8674     /* Here, we are finished going through at least one of the lists, which
8675      * means there is something remaining in at most one.  We check if the list
8676      * that hasn't been exhausted is positioned such that we are in the middle
8677      * of a range in its set or not.  (i_a and i_b point to the element beyond
8678      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8679      * is potentially more to output.
8680      * There are four cases:
8681      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8682      *     in the union is entirely from the non-exhausted set.
8683      *  2) Both were in their sets, count is 2.  Nothing further should
8684      *     be output, as everything that remains will be in the exhausted
8685      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8686      *     that
8687      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8688      *     Nothing further should be output because the union includes
8689      *     everything from the exhausted set.  Not decrementing ensures that.
8690      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8691      *     decrementing to 0 insures that we look at the remainder of the
8692      *     non-exhausted set */
8693     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8694         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8695     {
8696         count--;
8697     }
8698
8699     /* The final length is what we've output so far, plus what else is about to
8700      * be output.  (If 'count' is non-zero, then the input list we exhausted
8701      * has everything remaining up to the machine's limit in its set, and hence
8702      * in the union, so there will be no further output. */
8703     len_u = i_u;
8704     if (count == 0) {
8705         /* At most one of the subexpressions will be non-zero */
8706         len_u += (len_a - i_a) + (len_b - i_b);
8707     }
8708
8709     /* Set result to final length, which can change the pointer to array_u, so
8710      * re-find it */
8711     if (len_u != _invlist_len(u)) {
8712         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8713         invlist_trim(u);
8714         array_u = invlist_array(u);
8715     }
8716
8717     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8718      * the other) ended with everything above it not in its set.  That means
8719      * that the remaining part of the union is precisely the same as the
8720      * non-exhausted list, so can just copy it unchanged.  (If both list were
8721      * exhausted at the same time, then the operations below will be both 0.)
8722      */
8723     if (count == 0) {
8724         IV copy_count; /* At most one will have a non-zero copy count */
8725         if ((copy_count = len_a - i_a) > 0) {
8726             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8727         }
8728         else if ((copy_count = len_b - i_b) > 0) {
8729             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8730         }
8731     }
8732
8733     /*  We may be removing a reference to one of the inputs.  If so, the output
8734      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8735      *  count decremented) */
8736     if (a == *output || b == *output) {
8737         assert(! invlist_is_iterating(*output));
8738         if ((SvTEMP(*output))) {
8739             sv_2mortal(u);
8740         }
8741         else {
8742             SvREFCNT_dec_NN(*output);
8743         }
8744     }
8745
8746     *output = u;
8747
8748     return;
8749 }
8750
8751 void
8752 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8753                                                const bool complement_b, SV** i)
8754 {
8755     /* Take the intersection of two inversion lists and point <i> to it.  *i
8756      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8757      * the reference count to that list will be decremented if not already a
8758      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8759      * The first list, <a>, may be NULL, in which case an empty list is
8760      * returned.  If <complement_b> is TRUE, the result will be the
8761      * intersection of <a> and the complement (or inversion) of <b> instead of
8762      * <b> directly.
8763      *
8764      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8765      * Richard Gillam, published by Addison-Wesley, and explained at some
8766      * length there.  The preface says to incorporate its examples into your
8767      * code at your own risk.  In fact, it had bugs
8768      *
8769      * The algorithm is like a merge sort, and is essentially the same as the
8770      * union above
8771      */
8772
8773     const UV* array_a;          /* a's array */
8774     const UV* array_b;
8775     UV len_a;   /* length of a's array */
8776     UV len_b;
8777
8778     SV* r;                   /* the resulting intersection */
8779     UV* array_r;
8780     UV len_r;
8781
8782     UV i_a = 0;             /* current index into a's array */
8783     UV i_b = 0;
8784     UV i_r = 0;
8785
8786     /* running count, as explained in the algorithm source book; items are
8787      * stopped accumulating and are output when the count changes to/from 2.
8788      * The count is incremented when we start a range that's in the set, and
8789      * decremented when we start a range that's not in the set.  So its range
8790      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8791      */
8792     UV count = 0;
8793
8794     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8795     assert(a != b);
8796
8797     /* Special case if either one is empty */
8798     len_a = (a == NULL) ? 0 : _invlist_len(a);
8799     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8800         bool make_temp = FALSE;
8801
8802         if (len_a != 0 && complement_b) {
8803
8804             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8805              * be empty.  Here, also we are using 'b's complement, which hence
8806              * must be every possible code point.  Thus the intersection is
8807              * simply 'a'. */
8808             if (*i != a) {
8809                 if (*i == b) {
8810                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8811                         SvREFCNT_dec_NN(b);
8812                     }
8813                 }
8814
8815                 *i = invlist_clone(a);
8816             }
8817             /* else *i is already 'a' */
8818
8819             if (make_temp) {
8820                 sv_2mortal(*i);
8821             }
8822             return;
8823         }
8824
8825         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8826          * intersection must be empty */
8827         if (*i == a) {
8828             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8829                 SvREFCNT_dec_NN(a);
8830             }
8831         }
8832         else if (*i == b) {
8833             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8834                 SvREFCNT_dec_NN(b);
8835             }
8836         }
8837         *i = _new_invlist(0);
8838         if (make_temp) {
8839             sv_2mortal(*i);
8840         }
8841
8842         return;
8843     }
8844
8845     /* Here both lists exist and are non-empty */
8846     array_a = invlist_array(a);
8847     array_b = invlist_array(b);
8848
8849     /* If are to take the intersection of 'a' with the complement of b, set it
8850      * up so are looking at b's complement. */
8851     if (complement_b) {
8852
8853         /* To complement, we invert: if the first element is 0, remove it.  To
8854          * do this, we just pretend the array starts one later */
8855         if (array_b[0] == 0) {
8856             array_b++;
8857             len_b--;
8858         }
8859         else {
8860
8861             /* But if the first element is not zero, we pretend the list starts
8862              * at the 0 that is always stored immediately before the array. */
8863             array_b--;
8864             len_b++;
8865         }
8866     }
8867
8868     /* Size the intersection for the worst case: that the intersection ends up
8869      * fragmenting everything to be completely disjoint */
8870     r= _new_invlist(len_a + len_b);
8871
8872     /* Will contain U+0000 iff both components do */
8873     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8874                                      && len_b > 0 && array_b[0] == 0);
8875
8876     /* Go through each list item by item, stopping when exhausted one of
8877      * them */
8878     while (i_a < len_a && i_b < len_b) {
8879         UV cp;      /* The element to potentially add to the intersection's
8880                        array */
8881         bool cp_in_set; /* Is it in the input list's set or not */
8882
8883         /* We need to take one or the other of the two inputs for the
8884          * intersection.  Since we are merging two sorted lists, we take the
8885          * smaller of the next items.  In case of a tie, we take the one that
8886          * is not in its set first (a difference from the union algorithm).  If
8887          * we took one in the set first, it would increment the count, possibly
8888          * to 2 which would cause it to be output as starting a range in the
8889          * intersection, and the next time through we would take that same
8890          * number, and output it again as ending the set.  By doing it the
8891          * opposite of this, there is no possibility that the count will be
8892          * momentarily incremented to 2.  (In a tie and both are in the set or
8893          * both not in the set, it doesn't matter which we take first.) */
8894         if (array_a[i_a] < array_b[i_b]
8895             || (array_a[i_a] == array_b[i_b]
8896                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8897         {
8898             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8899             cp= array_a[i_a++];
8900         }
8901         else {
8902             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8903             cp= array_b[i_b++];
8904         }
8905
8906         /* Here, have chosen which of the two inputs to look at.  Only output
8907          * if the running count changes to/from 2, which marks the
8908          * beginning/end of a range that's in the intersection */
8909         if (cp_in_set) {
8910             count++;
8911             if (count == 2) {
8912                 array_r[i_r++] = cp;
8913             }
8914         }
8915         else {
8916             if (count == 2) {
8917                 array_r[i_r++] = cp;
8918             }
8919             count--;
8920         }
8921     }
8922
8923     /* Here, we are finished going through at least one of the lists, which
8924      * means there is something remaining in at most one.  We check if the list
8925      * that has been exhausted is positioned such that we are in the middle
8926      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8927      * the ones we care about.)  There are four cases:
8928      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8929      *     nothing left in the intersection.
8930      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8931      *     above 2.  What should be output is exactly that which is in the
8932      *     non-exhausted set, as everything it has is also in the intersection
8933      *     set, and everything it doesn't have can't be in the intersection
8934      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8935      *     gets incremented to 2.  Like the previous case, the intersection is
8936      *     everything that remains in the non-exhausted set.
8937      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8938      *     remains 1.  And the intersection has nothing more. */
8939     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8940         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8941     {
8942         count++;
8943     }
8944
8945     /* The final length is what we've output so far plus what else is in the
8946      * intersection.  At most one of the subexpressions below will be non-zero
8947      * */
8948     len_r = i_r;
8949     if (count >= 2) {
8950         len_r += (len_a - i_a) + (len_b - i_b);
8951     }
8952
8953     /* Set result to final length, which can change the pointer to array_r, so
8954      * re-find it */
8955     if (len_r != _invlist_len(r)) {
8956         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8957         invlist_trim(r);
8958         array_r = invlist_array(r);
8959     }
8960
8961     /* Finish outputting any remaining */
8962     if (count >= 2) { /* At most one will have a non-zero copy count */
8963         IV copy_count;
8964         if ((copy_count = len_a - i_a) > 0) {
8965             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8966         }
8967         else if ((copy_count = len_b - i_b) > 0) {
8968             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8969         }
8970     }
8971
8972     /*  We may be removing a reference to one of the inputs.  If so, the output
8973      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8974      *  count decremented) */
8975     if (a == *i || b == *i) {
8976         assert(! invlist_is_iterating(*i));
8977         if (SvTEMP(*i)) {
8978             sv_2mortal(r);
8979         }
8980         else {
8981             SvREFCNT_dec_NN(*i);
8982         }
8983     }
8984
8985     *i = r;
8986
8987     return;
8988 }
8989
8990 SV*
8991 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8992 {
8993     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8994      * set.  A pointer to the inversion list is returned.  This may actually be
8995      * a new list, in which case the passed in one has been destroyed.  The
8996      * passed-in inversion list can be NULL, in which case a new one is created
8997      * with just the one range in it */
8998
8999     SV* range_invlist;
9000     UV len;
9001
9002     if (invlist == NULL) {
9003         invlist = _new_invlist(2);
9004         len = 0;
9005     }
9006     else {
9007         len = _invlist_len(invlist);
9008     }
9009
9010     /* If comes after the final entry actually in the list, can just append it
9011      * to the end, */
9012     if (len == 0
9013         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9014             && start >= invlist_array(invlist)[len - 1]))
9015     {
9016         _append_range_to_invlist(invlist, start, end);
9017         return invlist;
9018     }
9019
9020     /* Here, can't just append things, create and return a new inversion list
9021      * which is the union of this range and the existing inversion list */
9022     range_invlist = _new_invlist(2);
9023     _append_range_to_invlist(range_invlist, start, end);
9024
9025     _invlist_union(invlist, range_invlist, &invlist);
9026
9027     /* The temporary can be freed */
9028     SvREFCNT_dec_NN(range_invlist);
9029
9030     return invlist;
9031 }
9032
9033 SV*
9034 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9035                                  UV** other_elements_ptr)
9036 {
9037     /* Create and return an inversion list whose contents are to be populated
9038      * by the caller.  The caller gives the number of elements (in 'size') and
9039      * the very first element ('element0').  This function will set
9040      * '*other_elements_ptr' to an array of UVs, where the remaining elements
9041      * are to be placed.
9042      *
9043      * Obviously there is some trust involved that the caller will properly
9044      * fill in the other elements of the array.
9045      *
9046      * (The first element needs to be passed in, as the underlying code does
9047      * things differently depending on whether it is zero or non-zero) */
9048
9049     SV* invlist = _new_invlist(size);
9050     bool offset;
9051
9052     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9053
9054     _append_range_to_invlist(invlist, element0, element0);
9055     offset = *get_invlist_offset_addr(invlist);
9056
9057     invlist_set_len(invlist, size, offset);
9058     *other_elements_ptr = invlist_array(invlist) + 1;
9059     return invlist;
9060 }
9061
9062 #endif
9063
9064 PERL_STATIC_INLINE SV*
9065 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9066     return _add_range_to_invlist(invlist, cp, cp);
9067 }
9068
9069 #ifndef PERL_IN_XSUB_RE
9070 void
9071 Perl__invlist_invert(pTHX_ SV* const invlist)
9072 {
9073     /* Complement the input inversion list.  This adds a 0 if the list didn't
9074      * have a zero; removes it otherwise.  As described above, the data
9075      * structure is set up so that this is very efficient */
9076
9077     PERL_ARGS_ASSERT__INVLIST_INVERT;
9078
9079     assert(! invlist_is_iterating(invlist));
9080
9081     /* The inverse of matching nothing is matching everything */
9082     if (_invlist_len(invlist) == 0) {
9083         _append_range_to_invlist(invlist, 0, UV_MAX);
9084         return;
9085     }
9086
9087     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9088 }
9089
9090 #endif
9091
9092 PERL_STATIC_INLINE SV*
9093 S_invlist_clone(pTHX_ SV* const invlist)
9094 {
9095
9096     /* Return a new inversion list that is a copy of the input one, which is
9097      * unchanged.  The new list will not be mortal even if the old one was. */
9098
9099     /* Need to allocate extra space to accommodate Perl's addition of a
9100      * trailing NUL to SvPV's, since it thinks they are always strings */
9101     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9102     STRLEN physical_length = SvCUR(invlist);
9103     bool offset = *(get_invlist_offset_addr(invlist));
9104
9105     PERL_ARGS_ASSERT_INVLIST_CLONE;
9106
9107     *(get_invlist_offset_addr(new_invlist)) = offset;
9108     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9109     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9110
9111     return new_invlist;
9112 }
9113
9114 PERL_STATIC_INLINE STRLEN*
9115 S_get_invlist_iter_addr(SV* invlist)
9116 {
9117     /* Return the address of the UV that contains the current iteration
9118      * position */
9119
9120     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9121
9122     assert(SvTYPE(invlist) == SVt_INVLIST);
9123
9124     return &(((XINVLIST*) SvANY(invlist))->iterator);
9125 }
9126
9127 PERL_STATIC_INLINE void
9128 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9129 {
9130     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9131
9132     *get_invlist_iter_addr(invlist) = 0;
9133 }
9134
9135 PERL_STATIC_INLINE void
9136 S_invlist_iterfinish(SV* invlist)
9137 {
9138     /* Terminate iterator for invlist.  This is to catch development errors.
9139      * Any iteration that is interrupted before completed should call this
9140      * function.  Functions that add code points anywhere else but to the end
9141      * of an inversion list assert that they are not in the middle of an
9142      * iteration.  If they were, the addition would make the iteration
9143      * problematical: if the iteration hadn't reached the place where things
9144      * were being added, it would be ok */
9145
9146     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9147
9148     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9149 }
9150
9151 STATIC bool
9152 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9153 {
9154     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9155      * This call sets in <*start> and <*end>, the next range in <invlist>.
9156      * Returns <TRUE> if successful and the next call will return the next
9157      * range; <FALSE> if was already at the end of the list.  If the latter,
9158      * <*start> and <*end> are unchanged, and the next call to this function
9159      * will start over at the beginning of the list */
9160
9161     STRLEN* pos = get_invlist_iter_addr(invlist);
9162     UV len = _invlist_len(invlist);
9163     UV *array;
9164
9165     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9166
9167     if (*pos >= len) {
9168         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9169         return FALSE;
9170     }
9171
9172     array = invlist_array(invlist);
9173
9174     *start = array[(*pos)++];
9175
9176     if (*pos >= len) {
9177         *end = UV_MAX;
9178     }
9179     else {
9180         *end = array[(*pos)++] - 1;
9181     }
9182
9183     return TRUE;
9184 }
9185
9186 PERL_STATIC_INLINE UV
9187 S_invlist_highest(SV* const invlist)
9188 {
9189     /* Returns the highest code point that matches an inversion list.  This API
9190      * has an ambiguity, as it returns 0 under either the highest is actually
9191      * 0, or if the list is empty.  If this distinction matters to you, check
9192      * for emptiness before calling this function */
9193
9194     UV len = _invlist_len(invlist);
9195     UV *array;
9196
9197     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9198
9199     if (len == 0) {
9200         return 0;
9201     }
9202
9203     array = invlist_array(invlist);
9204
9205     /* The last element in the array in the inversion list always starts a
9206      * range that goes to infinity.  That range may be for code points that are
9207      * matched in the inversion list, or it may be for ones that aren't
9208      * matched.  In the latter case, the highest code point in the set is one
9209      * less than the beginning of this range; otherwise it is the final element
9210      * of this range: infinity */
9211     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9212            ? UV_MAX
9213            : array[len - 1] - 1;
9214 }
9215
9216 #ifndef PERL_IN_XSUB_RE
9217 SV *
9218 Perl__invlist_contents(pTHX_ SV* const invlist)
9219 {
9220     /* Get the contents of an inversion list into a string SV so that they can
9221      * be printed out.  It uses the format traditionally done for debug tracing
9222      */
9223
9224     UV start, end;
9225     SV* output = newSVpvs("\n");
9226
9227     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9228
9229     assert(! invlist_is_iterating(invlist));
9230
9231     invlist_iterinit(invlist);
9232     while (invlist_iternext(invlist, &start, &end)) {
9233         if (end == UV_MAX) {
9234             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9235         }
9236         else if (end != start) {
9237             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9238                     start,       end);
9239         }
9240         else {
9241             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9242         }
9243     }
9244
9245     return output;
9246 }
9247 #endif
9248
9249 #ifndef PERL_IN_XSUB_RE
9250 void
9251 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9252                          const char * const indent, SV* const invlist)
9253 {
9254     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9255      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9256      * the string 'indent'.  The output looks like this:
9257          [0] 0x000A .. 0x000D
9258          [2] 0x0085
9259          [4] 0x2028 .. 0x2029
9260          [6] 0x3104 .. INFINITY
9261      * This means that the first range of code points matched by the list are
9262      * 0xA through 0xD; the second range contains only the single code point
9263      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9264      * are used to define each range (except if the final range extends to
9265      * infinity, only a single element is needed).  The array index of the
9266      * first element for the corresponding range is given in brackets. */
9267
9268     UV start, end;
9269     STRLEN count = 0;
9270
9271     PERL_ARGS_ASSERT__INVLIST_DUMP;
9272
9273     if (invlist_is_iterating(invlist)) {
9274         Perl_dump_indent(aTHX_ level, file,
9275              "%sCan't dump inversion list because is in middle of iterating\n",
9276              indent);
9277         return;
9278     }
9279
9280     invlist_iterinit(invlist);
9281     while (invlist_iternext(invlist, &start, &end)) {
9282         if (end == UV_MAX) {
9283             Perl_dump_indent(aTHX_ level, file,
9284                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9285                                    indent, (UV)count, start);
9286         }
9287         else if (end != start) {
9288             Perl_dump_indent(aTHX_ level, file,
9289                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9290                                 indent, (UV)count, start,         end);
9291         }
9292         else {
9293             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9294                                             indent, (UV)count, start);
9295         }
9296         count += 2;
9297     }
9298 }
9299
9300 void
9301 Perl__load_PL_utf8_foldclosures (pTHX)
9302 {
9303     assert(! PL_utf8_foldclosures);
9304
9305     /* If the folds haven't been read in, call a fold function
9306      * to force that */
9307     if (! PL_utf8_tofold) {
9308         U8 dummy[UTF8_MAXBYTES_CASE+1];
9309
9310         /* This string is just a short named one above \xff */
9311         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9312         assert(PL_utf8_tofold); /* Verify that worked */
9313     }
9314     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9315 }
9316 #endif
9317
9318 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9319 bool
9320 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9321 {
9322     /* Return a boolean as to if the two passed in inversion lists are
9323      * identical.  The final argument, if TRUE, says to take the complement of
9324      * the second inversion list before doing the comparison */
9325
9326     const UV* array_a = invlist_array(a);
9327     const UV* array_b = invlist_array(b);
9328     UV len_a = _invlist_len(a);
9329     UV len_b = _invlist_len(b);
9330
9331     UV i = 0;               /* current index into the arrays */
9332     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9333
9334     PERL_ARGS_ASSERT__INVLISTEQ;
9335
9336     /* If are to compare 'a' with the complement of b, set it
9337      * up so are looking at b's complement. */
9338     if (complement_b) {
9339
9340         /* The complement of nothing is everything, so <a> would have to have
9341          * just one element, starting at zero (ending at infinity) */
9342         if (len_b == 0) {
9343             return (len_a == 1 && array_a[0] == 0);
9344         }
9345         else if (array_b[0] == 0) {
9346
9347             /* Otherwise, to complement, we invert.  Here, the first element is
9348              * 0, just remove it.  To do this, we just pretend the array starts
9349              * one later */
9350
9351             array_b++;
9352             len_b--;
9353         }
9354         else {
9355
9356             /* But if the first element is not zero, we pretend the list starts
9357              * at the 0 that is always stored immediately before the array. */
9358             array_b--;
9359             len_b++;
9360         }
9361     }
9362
9363     /* Make sure that the lengths are the same, as well as the final element
9364      * before looping through the remainder.  (Thus we test the length, final,
9365      * and first elements right off the bat) */
9366     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9367         retval = FALSE;
9368     }
9369     else for (i = 0; i < len_a - 1; i++) {
9370         if (array_a[i] != array_b[i]) {
9371             retval = FALSE;
9372             break;
9373         }
9374     }
9375
9376     return retval;
9377 }
9378 #endif
9379
9380 /*
9381  * As best we can, determine the characters that can match the start of
9382  * the given EXACTF-ish node.
9383  *
9384  * Returns the invlist as a new SV*; it is the caller's responsibility to
9385  * call SvREFCNT_dec() when done with it.
9386  */
9387 STATIC SV*
9388 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9389 {
9390     const U8 * s = (U8*)STRING(node);
9391     SSize_t bytelen = STR_LEN(node);
9392     UV uc;
9393     /* Start out big enough for 2 separate code points */
9394     SV* invlist = _new_invlist(4);
9395
9396     PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9397
9398     if (! UTF) {
9399         uc = *s;
9400
9401         /* We punt and assume can match anything if the node begins
9402          * with a multi-character fold.  Things are complicated.  For
9403          * example, /ffi/i could match any of:
9404          *  "\N{LATIN SMALL LIGATURE FFI}"
9405          *  "\N{LATIN SMALL LIGATURE FF}I"
9406          *  "F\N{LATIN SMALL LIGATURE FI}"
9407          *  plus several other things; and making sure we have all the
9408          *  possibilities is hard. */
9409         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9410             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9411         }
9412         else {
9413             /* Any Latin1 range character can potentially match any
9414              * other depending on the locale */
9415             if (OP(node) == EXACTFL) {
9416                 _invlist_union(invlist, PL_Latin1, &invlist);
9417             }
9418             else {
9419                 /* But otherwise, it matches at least itself.  We can
9420                  * quickly tell if it has a distinct fold, and if so,
9421                  * it matches that as well */
9422                 invlist = add_cp_to_invlist(invlist, uc);
9423                 if (IS_IN_SOME_FOLD_L1(uc))
9424                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9425             }
9426
9427             /* Some characters match above-Latin1 ones under /i.  This
9428              * is true of EXACTFL ones when the locale is UTF-8 */
9429             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9430                 && (! isASCII(uc) || (OP(node) != EXACTFA
9431                                     && OP(node) != EXACTFA_NO_TRIE)))
9432             {
9433                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9434             }
9435         }
9436     }
9437     else {  /* Pattern is UTF-8 */
9438         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9439         STRLEN foldlen = UTF8SKIP(s);
9440         const U8* e = s + bytelen;
9441         SV** listp;
9442
9443         uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9444
9445         /* The only code points that aren't folded in a UTF EXACTFish
9446          * node are are the problematic ones in EXACTFL nodes */
9447         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9448             /* We need to check for the possibility that this EXACTFL
9449              * node begins with a multi-char fold.  Therefore we fold
9450              * the first few characters of it so that we can make that
9451              * check */
9452             U8 *d = folded;
9453             int i;
9454
9455             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9456                 if (isASCII(*s)) {
9457                     *(d++) = (U8) toFOLD(*s);
9458                     s++;
9459                 }
9460                 else {
9461                     STRLEN len;
9462                     to_utf8_fold(s, d, &len);
9463                     d += len;
9464                     s += UTF8SKIP(s);
9465                 }
9466             }
9467
9468             /* And set up so the code below that looks in this folded
9469              * buffer instead of the node's string */
9470             e = d;
9471             foldlen = UTF8SKIP(folded);
9472             s = folded;
9473         }
9474
9475         /* When we reach here 's' points to the fold of the first
9476          * character(s) of the node; and 'e' points to far enough along
9477          * the folded string to be just past any possible multi-char
9478          * fold. 'foldlen' is the length in bytes of the first
9479          * character in 's'
9480          *
9481          * Unlike the non-UTF-8 case, the macro for determining if a
9482          * string is a multi-char fold requires all the characters to
9483          * already be folded.  This is because of all the complications
9484          * if not.  Note that they are folded anyway, except in EXACTFL
9485          * nodes.  Like the non-UTF case above, we punt if the node
9486          * begins with a multi-char fold  */
9487
9488         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9489             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9490         }
9491         else {  /* Single char fold */
9492
9493             /* It matches all the things that fold to it, which are
9494              * found in PL_utf8_foldclosures (including itself) */
9495             invlist = add_cp_to_invlist(invlist, uc);
9496             if (! PL_utf8_foldclosures)
9497                 _load_PL_utf8_foldclosures();
9498             if ((listp = hv_fetch(PL_utf8_foldclosures,
9499                                 (char *) s, foldlen, FALSE)))
9500             {
9501                 AV* list = (AV*) *listp;
9502                 IV k;
9503                 for (k = 0; k <= av_tindex(list); k++) {
9504                     SV** c_p = av_fetch(list, k, FALSE);
9505                     UV c;
9506                     assert(c_p);
9507
9508                     c = SvUV(*c_p);
9509
9510                     /* /aa doesn't allow folds between ASCII and non- */
9511                     if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9512                         && isASCII(c) != isASCII(uc))
9513                     {
9514                         continue;
9515                     }
9516
9517                     invlist = add_cp_to_invlist(invlist, c);
9518                 }
9519             }
9520         }
9521     }
9522
9523     return invlist;
9524 }
9525
9526 #undef HEADER_LENGTH
9527 #undef TO_INTERNAL_SIZE
9528 #undef FROM_INTERNAL_SIZE
9529 #undef INVLIST_VERSION_ID
9530
9531 /* End of inversion list object */
9532
9533 STATIC void
9534 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9535 {
9536     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9537      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9538      * should point to the first flag; it is updated on output to point to the
9539      * final ')' or ':'.  There needs to be at least one flag, or this will
9540      * abort */
9541
9542     /* for (?g), (?gc), and (?o) warnings; warning
9543        about (?c) will warn about (?g) -- japhy    */
9544
9545 #define WASTED_O  0x01
9546 #define WASTED_G  0x02
9547 #define WASTED_C  0x04
9548 #define WASTED_GC (WASTED_G|WASTED_C)
9549     I32 wastedflags = 0x00;
9550     U32 posflags = 0, negflags = 0;
9551     U32 *flagsp = &posflags;
9552     char has_charset_modifier = '\0';
9553     regex_charset cs;
9554     bool has_use_defaults = FALSE;
9555     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9556     int x_mod_count = 0;
9557
9558     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9559
9560     /* '^' as an initial flag sets certain defaults */
9561     if (UCHARAT(RExC_parse) == '^') {
9562         RExC_parse++;
9563         has_use_defaults = TRUE;
9564         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9565         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9566                                         ? REGEX_UNICODE_CHARSET
9567                                         : REGEX_DEPENDS_CHARSET);
9568     }
9569
9570     cs = get_regex_charset(RExC_flags);
9571     if (cs == REGEX_DEPENDS_CHARSET
9572         && (RExC_utf8 || RExC_uni_semantics))
9573     {
9574         cs = REGEX_UNICODE_CHARSET;
9575     }
9576
9577     while (*RExC_parse) {
9578         /* && strchr("iogcmsx", *RExC_parse) */
9579         /* (?g), (?gc) and (?o) are useless here
9580            and must be globally applied -- japhy */
9581         switch (*RExC_parse) {
9582
9583             /* Code for the imsxn flags */
9584             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9585
9586             case LOCALE_PAT_MOD:
9587                 if (has_charset_modifier) {
9588                     goto excess_modifier;
9589                 }
9590                 else if (flagsp == &negflags) {
9591                     goto neg_modifier;
9592                 }
9593                 cs = REGEX_LOCALE_CHARSET;
9594                 has_charset_modifier = LOCALE_PAT_MOD;
9595                 break;
9596             case UNICODE_PAT_MOD:
9597                 if (has_charset_modifier) {
9598                     goto excess_modifier;
9599                 }
9600                 else if (flagsp == &negflags) {
9601                     goto neg_modifier;
9602                 }
9603                 cs = REGEX_UNICODE_CHARSET;
9604                 has_charset_modifier = UNICODE_PAT_MOD;
9605                 break;
9606             case ASCII_RESTRICT_PAT_MOD:
9607                 if (flagsp == &negflags) {
9608                     goto neg_modifier;
9609                 }
9610                 if (has_charset_modifier) {
9611                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9612                         goto excess_modifier;
9613                     }
9614                     /* Doubled modifier implies more restricted */
9615                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9616                 }
9617                 else {
9618                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9619                 }
9620                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9621                 break;
9622             case DEPENDS_PAT_MOD:
9623                 if (has_use_defaults) {
9624                     goto fail_modifiers;
9625                 }
9626                 else if (flagsp == &negflags) {
9627                     goto neg_modifier;
9628                 }
9629                 else if (has_charset_modifier) {
9630                     goto excess_modifier;
9631                 }
9632
9633                 /* The dual charset means unicode semantics if the
9634                  * pattern (or target, not known until runtime) are
9635                  * utf8, or something in the pattern indicates unicode
9636                  * semantics */
9637                 cs = (RExC_utf8 || RExC_uni_semantics)
9638                      ? REGEX_UNICODE_CHARSET
9639                      : REGEX_DEPENDS_CHARSET;
9640                 has_charset_modifier = DEPENDS_PAT_MOD;
9641                 break;
9642             excess_modifier:
9643                 RExC_parse++;
9644                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9645                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9646                 }
9647                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9648                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9649                                         *(RExC_parse - 1));
9650                 }
9651                 else {
9652                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9653                 }
9654                 NOT_REACHED; /*NOTREACHED*/
9655             neg_modifier:
9656                 RExC_parse++;
9657                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9658                                     *(RExC_parse - 1));
9659                 NOT_REACHED; /*NOTREACHED*/
9660             case ONCE_PAT_MOD: /* 'o' */
9661             case GLOBAL_PAT_MOD: /* 'g' */
9662                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9663                     const I32 wflagbit = *RExC_parse == 'o'
9664                                          ? WASTED_O
9665                                          : WASTED_G;
9666                     if (! (wastedflags & wflagbit) ) {
9667                         wastedflags |= wflagbit;
9668                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9669                         vWARN5(
9670                             RExC_parse + 1,
9671                             "Useless (%s%c) - %suse /%c modifier",
9672                             flagsp == &negflags ? "?-" : "?",
9673                             *RExC_parse,
9674                             flagsp == &negflags ? "don't " : "",
9675                             *RExC_parse
9676                         );
9677                     }
9678                 }
9679                 break;
9680
9681             case CONTINUE_PAT_MOD: /* 'c' */
9682                 if (PASS2 && ckWARN(WARN_REGEXP)) {
9683                     if (! (wastedflags & WASTED_C) ) {
9684                         wastedflags |= WASTED_GC;
9685                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9686                         vWARN3(
9687                             RExC_parse + 1,
9688                             "Useless (%sc) - %suse /gc modifier",
9689                             flagsp == &negflags ? "?-" : "?",
9690                             flagsp == &negflags ? "don't " : ""
9691                         );
9692                     }
9693                 }
9694                 break;
9695             case KEEPCOPY_PAT_MOD: /* 'p' */
9696                 if (flagsp == &negflags) {
9697                     if (PASS2)
9698                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9699                 } else {
9700                     *flagsp |= RXf_PMf_KEEPCOPY;
9701                 }
9702                 break;
9703             case '-':
9704                 /* A flag is a default iff it is following a minus, so
9705                  * if there is a minus, it means will be trying to
9706                  * re-specify a default which is an error */
9707                 if (has_use_defaults || flagsp == &negflags) {
9708                     goto fail_modifiers;
9709                 }
9710                 flagsp = &negflags;
9711                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9712                 break;
9713             case ':':
9714             case ')':
9715                 RExC_flags |= posflags;
9716                 RExC_flags &= ~negflags;
9717                 set_regex_charset(&RExC_flags, cs);
9718                 if (RExC_flags & RXf_PMf_FOLD) {
9719                     RExC_contains_i = 1;
9720                 }
9721                 if (PASS2) {
9722                     STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9723                 }
9724                 return;
9725                 /*NOTREACHED*/
9726             default:
9727             fail_modifiers:
9728                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9729                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9730                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9731                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9732                 NOT_REACHED; /*NOTREACHED*/
9733         }
9734
9735         ++RExC_parse;
9736     }
9737
9738     if (PASS2) {
9739         STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9740     }
9741 }
9742
9743 /*
9744  - reg - regular expression, i.e. main body or parenthesized thing
9745  *
9746  * Caller must absorb opening parenthesis.
9747  *
9748  * Combining parenthesis handling with the base level of regular expression
9749  * is a trifle forced, but the need to tie the tails of the branches to what
9750  * follows makes it hard to avoid.
9751  */
9752 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9753 #ifdef DEBUGGING
9754 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9755 #else
9756 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9757 #endif
9758
9759 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9760    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9761    needs to be restarted.
9762    Otherwise would only return NULL if regbranch() returns NULL, which
9763    cannot happen.  */
9764 STATIC regnode *
9765 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9766     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9767      * 2 is like 1, but indicates that nextchar() has been called to advance
9768      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9769      * this flag alerts us to the need to check for that */
9770 {
9771     regnode *ret;               /* Will be the head of the group. */
9772     regnode *br;
9773     regnode *lastbr;
9774     regnode *ender = NULL;
9775     I32 parno = 0;
9776     I32 flags;
9777     U32 oregflags = RExC_flags;
9778     bool have_branch = 0;
9779     bool is_open = 0;
9780     I32 freeze_paren = 0;
9781     I32 after_freeze = 0;
9782     I32 num; /* numeric backreferences */
9783
9784     char * parse_start = RExC_parse; /* MJD */
9785     char * const oregcomp_parse = RExC_parse;
9786
9787     GET_RE_DEBUG_FLAGS_DECL;
9788
9789     PERL_ARGS_ASSERT_REG;
9790     DEBUG_PARSE("reg ");
9791
9792     *flagp = 0;                         /* Tentatively. */
9793
9794
9795     /* Make an OPEN node, if parenthesized. */
9796     if (paren) {
9797
9798         /* Under /x, space and comments can be gobbled up between the '(' and
9799          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9800          * intervening space, as the sequence is a token, and a token should be
9801          * indivisible */
9802         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9803
9804         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9805             char *start_verb = RExC_parse;
9806             STRLEN verb_len = 0;
9807             char *start_arg = NULL;
9808             unsigned char op = 0;
9809             int argok = 1;
9810             int internal_argval = 0; /* internal_argval is only useful if
9811                                         !argok */
9812
9813             if (has_intervening_patws) {
9814                 RExC_parse++;
9815                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9816             }
9817             while ( *RExC_parse && *RExC_parse != ')' ) {
9818                 if ( *RExC_parse == ':' ) {
9819                     start_arg = RExC_parse + 1;
9820                     break;
9821                 }
9822                 RExC_parse++;
9823             }
9824             ++start_verb;
9825             verb_len = RExC_parse - start_verb;
9826             if ( start_arg ) {
9827                 RExC_parse++;
9828                 while ( *RExC_parse && *RExC_parse != ')' )
9829                     RExC_parse++;
9830                 if ( *RExC_parse != ')' )
9831                     vFAIL("Unterminated verb pattern argument");
9832                 if ( RExC_parse == start_arg )
9833                     start_arg = NULL;
9834             } else {
9835                 if ( *RExC_parse != ')' )
9836                     vFAIL("Unterminated verb pattern");
9837             }
9838
9839             switch ( *start_verb ) {
9840             case 'A':  /* (*ACCEPT) */
9841                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9842                     op = ACCEPT;
9843                     internal_argval = RExC_nestroot;
9844                 }
9845                 break;
9846             case 'C':  /* (*COMMIT) */
9847                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9848                     op = COMMIT;
9849                 break;
9850             case 'F':  /* (*FAIL) */
9851                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9852                     op = OPFAIL;
9853                     argok = 0;
9854                 }
9855                 break;
9856             case ':':  /* (*:NAME) */
9857             case 'M':  /* (*MARK:NAME) */
9858                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9859                     op = MARKPOINT;
9860                     argok = -1;
9861                 }
9862                 break;
9863             case 'P':  /* (*PRUNE) */
9864                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9865                     op = PRUNE;
9866                 break;
9867             case 'S':   /* (*SKIP) */
9868                 if ( memEQs(start_verb,verb_len,"SKIP") )
9869                     op = SKIP;
9870                 break;
9871             case 'T':  /* (*THEN) */
9872                 /* [19:06] <TimToady> :: is then */
9873                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9874                     op = CUTGROUP;
9875                     RExC_seen |= REG_CUTGROUP_SEEN;
9876                 }
9877                 break;
9878             }
9879             if ( ! op ) {
9880                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9881                 vFAIL2utf8f(
9882                     "Unknown verb pattern '%"UTF8f"'",
9883                     UTF8fARG(UTF, verb_len, start_verb));
9884             }
9885             if ( argok ) {
9886                 if ( start_arg && internal_argval ) {
9887                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9888                         verb_len, start_verb);
9889                 } else if ( argok < 0 && !start_arg ) {
9890                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9891                         verb_len, start_verb);
9892                 } else {
9893                     ret = reganode(pRExC_state, op, internal_argval);
9894                     if ( ! internal_argval && ! SIZE_ONLY ) {
9895                         if (start_arg) {
9896                             SV *sv = newSVpvn( start_arg,
9897                                                RExC_parse - start_arg);
9898                             ARG(ret) = add_data( pRExC_state,
9899                                                  STR_WITH_LEN("S"));
9900                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9901                             ret->flags = 0;
9902                         } else {
9903                             ret->flags = 1;
9904                         }
9905                     }
9906                 }
9907                 if (!internal_argval)
9908                     RExC_seen |= REG_VERBARG_SEEN;
9909             } else if ( start_arg ) {
9910                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9911                         verb_len, start_verb);
9912             } else {
9913                 ret = reg_node(pRExC_state, op);
9914             }
9915             nextchar(pRExC_state);
9916             return ret;
9917         }
9918         else if (*RExC_parse == '?') { /* (?...) */
9919             bool is_logical = 0;
9920             const char * const seqstart = RExC_parse;
9921             const char * endptr;
9922             if (has_intervening_patws) {
9923                 RExC_parse++;
9924                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9925             }
9926
9927             RExC_parse++;
9928             paren = *RExC_parse++;
9929             ret = NULL;                 /* For look-ahead/behind. */
9930             switch (paren) {
9931
9932             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9933                 paren = *RExC_parse++;
9934                 if ( paren == '<')         /* (?P<...>) named capture */
9935                     goto named_capture;
9936                 else if (paren == '>') {   /* (?P>name) named recursion */
9937                     goto named_recursion;
9938                 }
9939                 else if (paren == '=') {   /* (?P=...)  named backref */
9940                     /* this pretty much dupes the code for \k<NAME> in
9941                      * regatom(), if you change this make sure you change that
9942                      * */
9943                     char* name_start = RExC_parse;
9944                     U32 num = 0;
9945                     SV *sv_dat = reg_scan_name(pRExC_state,
9946                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9947                     if (RExC_parse == name_start || *RExC_parse != ')')
9948                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9949                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9950
9951                     if (!SIZE_ONLY) {
9952                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9953                         RExC_rxi->data->data[num]=(void*)sv_dat;
9954                         SvREFCNT_inc_simple_void(sv_dat);
9955                     }
9956                     RExC_sawback = 1;
9957                     ret = reganode(pRExC_state,
9958                                    ((! FOLD)
9959                                      ? NREF
9960                                      : (ASCII_FOLD_RESTRICTED)
9961                                        ? NREFFA
9962                                        : (AT_LEAST_UNI_SEMANTICS)
9963                                          ? NREFFU
9964                                          : (LOC)
9965                                            ? NREFFL
9966                                            : NREFF),
9967                                     num);
9968                     *flagp |= HASWIDTH;
9969
9970                     Set_Node_Offset(ret, parse_start+1);
9971                     Set_Node_Cur_Length(ret, parse_start);
9972
9973                     nextchar(pRExC_state);
9974                     return ret;
9975                 }
9976                 RExC_parse++;
9977                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9978                 vFAIL3("Sequence (%.*s...) not recognized",
9979                                 RExC_parse-seqstart, seqstart);
9980                 NOT_REACHED; /*NOTREACHED*/
9981             case '<':           /* (?<...) */
9982                 if (*RExC_parse == '!')
9983                     paren = ',';
9984                 else if (*RExC_parse != '=')
9985               named_capture:
9986                 {               /* (?<...>) */
9987                     char *name_start;
9988                     SV *svname;
9989                     paren= '>';
9990             case '\'':          /* (?'...') */
9991                     name_start= RExC_parse;
9992                     svname = reg_scan_name(pRExC_state,
9993                         SIZE_ONLY    /* reverse test from the others */
9994                         ? REG_RSN_RETURN_NAME
9995                         : REG_RSN_RETURN_NULL);
9996                     if (RExC_parse == name_start || *RExC_parse != paren)
9997                         vFAIL2("Sequence (?%c... not terminated",
9998                             paren=='>' ? '<' : paren);
9999                     if (SIZE_ONLY) {
10000                         HE *he_str;
10001                         SV *sv_dat = NULL;
10002                         if (!svname) /* shouldn't happen */
10003                             Perl_croak(aTHX_
10004                                 "panic: reg_scan_name returned NULL");
10005                         if (!RExC_paren_names) {
10006                             RExC_paren_names= newHV();
10007                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
10008 #ifdef DEBUGGING
10009                             RExC_paren_name_list= newAV();
10010                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10011 #endif
10012                         }
10013                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10014                         if ( he_str )
10015                             sv_dat = HeVAL(he_str);
10016                         if ( ! sv_dat ) {
10017                             /* croak baby croak */
10018                             Perl_croak(aTHX_
10019                                 "panic: paren_name hash element allocation failed");
10020                         } else if ( SvPOK(sv_dat) ) {
10021                             /* (?|...) can mean we have dupes so scan to check
10022                                its already been stored. Maybe a flag indicating
10023                                we are inside such a construct would be useful,
10024                                but the arrays are likely to be quite small, so
10025                                for now we punt -- dmq */
10026                             IV count = SvIV(sv_dat);
10027                             I32 *pv = (I32*)SvPVX(sv_dat);
10028                             IV i;
10029                             for ( i = 0 ; i < count ; i++ ) {
10030                                 if ( pv[i] == RExC_npar ) {
10031                                     count = 0;
10032                                     break;
10033                                 }
10034                             }
10035                             if ( count ) {
10036                                 pv = (I32*)SvGROW(sv_dat,
10037                                                 SvCUR(sv_dat) + sizeof(I32)+1);
10038                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10039                                 pv[count] = RExC_npar;
10040                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10041                             }
10042                         } else {
10043                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
10044                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
10045                                                                 sizeof(I32));
10046                             SvIOK_on(sv_dat);
10047                             SvIV_set(sv_dat, 1);
10048                         }
10049 #ifdef DEBUGGING
10050                         /* Yes this does cause a memory leak in debugging Perls
10051                          * */
10052                         if (!av_store(RExC_paren_name_list,
10053                                       RExC_npar, SvREFCNT_inc(svname)))
10054                             SvREFCNT_dec_NN(svname);
10055 #endif
10056
10057                         /*sv_dump(sv_dat);*/
10058                     }
10059                     nextchar(pRExC_state);
10060                     paren = 1;
10061                     goto capturing_parens;
10062                 }
10063                 RExC_seen |= REG_LOOKBEHIND_SEEN;
10064                 RExC_in_lookbehind++;
10065                 RExC_parse++;
10066                 /* FALLTHROUGH */
10067             case '=':           /* (?=...) */
10068                 RExC_seen_zerolen++;
10069                 break;
10070             case '!':           /* (?!...) */
10071                 RExC_seen_zerolen++;
10072                 /* check if we're really just a "FAIL" assertion */
10073                 --RExC_parse;
10074                 nextchar(pRExC_state);
10075                 if (*RExC_parse == ')') {
10076                     ret=reg_node(pRExC_state, OPFAIL);
10077                     nextchar(pRExC_state);
10078                     return ret;
10079                 }
10080                 break;
10081             case '|':           /* (?|...) */
10082                 /* branch reset, behave like a (?:...) except that
10083                    buffers in alternations share the same numbers */
10084                 paren = ':';
10085                 after_freeze = freeze_paren = RExC_npar;
10086                 break;
10087             case ':':           /* (?:...) */
10088             case '>':           /* (?>...) */
10089                 break;
10090             case '$':           /* (?$...) */
10091             case '@':           /* (?@...) */
10092                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10093                 break;
10094             case '0' :           /* (?0) */
10095             case 'R' :           /* (?R) */
10096                 if (*RExC_parse != ')')
10097                     FAIL("Sequence (?R) not terminated");
10098                 ret = reg_node(pRExC_state, GOSTART);
10099                     RExC_seen |= REG_GOSTART_SEEN;
10100                 *flagp |= POSTPONED;
10101                 nextchar(pRExC_state);
10102                 return ret;
10103                 /*notreached*/
10104             /* named and numeric backreferences */
10105             case '&':            /* (?&NAME) */
10106                 parse_start = RExC_parse - 1;
10107               named_recursion:
10108                 {
10109                     SV *sv_dat = reg_scan_name(pRExC_state,
10110                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10111                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10112                 }
10113                 if (RExC_parse == RExC_end || *RExC_parse != ')')
10114                     vFAIL("Sequence (?&... not terminated");
10115                 goto gen_recurse_regop;
10116                 /* NOT REACHED */
10117             case '+':
10118                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10119                     RExC_parse++;
10120                     vFAIL("Illegal pattern");
10121                 }
10122                 goto parse_recursion;
10123                 /* NOT REACHED*/
10124             case '-': /* (?-1) */
10125                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10126                     RExC_parse--; /* rewind to let it be handled later */
10127                     goto parse_flags;
10128                 }
10129                 /* FALLTHROUGH */
10130             case '1': case '2': case '3': case '4': /* (?1) */
10131             case '5': case '6': case '7': case '8': case '9':
10132                 RExC_parse--;
10133               parse_recursion:
10134                 {
10135                     bool is_neg = FALSE;
10136                     parse_start = RExC_parse - 1; /* MJD */
10137                     if (*RExC_parse == '-') {
10138                         RExC_parse++;
10139                         is_neg = TRUE;
10140                     }
10141                     num = grok_atou(RExC_parse, &endptr);
10142                     if (endptr)
10143                         RExC_parse = (char*)endptr;
10144                     if (is_neg) {
10145                         /* Some limit for num? */
10146                         num = -num;
10147                     }
10148                 }
10149                 if (*RExC_parse!=')')
10150                     vFAIL("Expecting close bracket");
10151
10152               gen_recurse_regop:
10153                 if ( paren == '-' ) {
10154                     /*
10155                     Diagram of capture buffer numbering.
10156                     Top line is the normal capture buffer numbers
10157                     Bottom line is the negative indexing as from
10158                     the X (the (?-2))
10159
10160                     +   1 2    3 4 5 X          6 7
10161                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10162                     -   5 4    3 2 1 X          x x
10163
10164                     */
10165                     num = RExC_npar + num;
10166                     if (num < 1)  {
10167                         RExC_parse++;
10168                         vFAIL("Reference to nonexistent group");
10169                     }
10170                 } else if ( paren == '+' ) {
10171                     num = RExC_npar + num - 1;
10172                 }
10173
10174                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10175                 if (!SIZE_ONLY) {
10176                     if (num > (I32)RExC_rx->nparens) {
10177                         RExC_parse++;
10178                         vFAIL("Reference to nonexistent group");
10179                     }
10180                     RExC_recurse_count++;
10181                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10182                         "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10183                               22, "|    |", (int)(depth * 2 + 1), "",
10184                               (UV)ARG(ret), (IV)ARG2L(ret)));
10185                 }
10186                 RExC_seen |= REG_RECURSE_SEEN;
10187                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10188                 Set_Node_Offset(ret, parse_start); /* MJD */
10189
10190                 *flagp |= POSTPONED;
10191                 nextchar(pRExC_state);
10192                 return ret;
10193
10194             /* NOT REACHED */
10195
10196             case '?':           /* (??...) */
10197                 is_logical = 1;
10198                 if (*RExC_parse != '{') {
10199                     RExC_parse++;
10200                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10201                     vFAIL2utf8f(
10202                         "Sequence (%"UTF8f"...) not recognized",
10203                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10204                     NOT_REACHED; /*NOTREACHED*/
10205                 }
10206                 *flagp |= POSTPONED;
10207                 paren = *RExC_parse++;
10208                 /* FALLTHROUGH */
10209             case '{':           /* (?{...}) */
10210             {
10211                 U32 n = 0;
10212                 struct reg_code_block *cb;
10213
10214                 RExC_seen_zerolen++;
10215
10216                 if (   !pRExC_state->num_code_blocks
10217                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
10218                     || pRExC_state->code_blocks[pRExC_state->code_index].start
10219                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10220                             - RExC_start)
10221                 ) {
10222                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
10223                         FAIL("panic: Sequence (?{...}): no code block found\n");
10224                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
10225                 }
10226                 /* this is a pre-compiled code block (?{...}) */
10227                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10228                 RExC_parse = RExC_start + cb->end;
10229                 if (!SIZE_ONLY) {
10230                     OP *o = cb->block;
10231                     if (cb->src_regex) {
10232                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10233                         RExC_rxi->data->data[n] =
10234                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
10235                         RExC_rxi->data->data[n+1] = (void*)o;
10236                     }
10237                     else {
10238                         n = add_data(pRExC_state,
10239                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10240                         RExC_rxi->data->data[n] = (void*)o;
10241                     }
10242                 }
10243                 pRExC_state->code_index++;
10244                 nextchar(pRExC_state);
10245
10246                 if (is_logical) {
10247                     regnode *eval;
10248                     ret = reg_node(pRExC_state, LOGICAL);
10249
10250                     eval = reg2Lanode(pRExC_state, EVAL,
10251                                        n,
10252
10253                                        /* for later propagation into (??{})
10254                                         * return value */
10255                                        RExC_flags & RXf_PMf_COMPILETIME
10256                                       );
10257                     if (!SIZE_ONLY) {
10258                         ret->flags = 2;
10259                     }
10260                     REGTAIL(pRExC_state, ret, eval);
10261                     /* deal with the length of this later - MJD */
10262                     return ret;
10263                 }
10264                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10265                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10266                 Set_Node_Offset(ret, parse_start);
10267                 return ret;
10268             }
10269             case '(':           /* (?(?{...})...) and (?(?=...)...) */
10270             {
10271                 int is_define= 0;
10272                 const int DEFINE_len = sizeof("DEFINE") - 1;
10273                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
10274                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10275                         || RExC_parse[1] == '<'
10276                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
10277                         I32 flag;
10278                         regnode *tail;
10279
10280                         ret = reg_node(pRExC_state, LOGICAL);
10281                         if (!SIZE_ONLY)
10282                             ret->flags = 1;
10283
10284                         tail = reg(pRExC_state, 1, &flag, depth+1);
10285                         if (flag & RESTART_UTF8) {
10286                             *flagp = RESTART_UTF8;
10287                             return NULL;
10288                         }
10289                         REGTAIL(pRExC_state, ret, tail);
10290                         goto insert_if;
10291                     }
10292                     /* Fall through to ‘Unknown switch condition’ at the
10293                        end of the if/else chain. */
10294                 }
10295                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
10296                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10297                 {
10298                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
10299                     char *name_start= RExC_parse++;
10300                     U32 num = 0;
10301                     SV *sv_dat=reg_scan_name(pRExC_state,
10302                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10303                     if (RExC_parse == name_start || *RExC_parse != ch)
10304                         vFAIL2("Sequence (?(%c... not terminated",
10305                             (ch == '>' ? '<' : ch));
10306                     RExC_parse++;
10307                     if (!SIZE_ONLY) {
10308                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10309                         RExC_rxi->data->data[num]=(void*)sv_dat;
10310                         SvREFCNT_inc_simple_void(sv_dat);
10311                     }
10312                     ret = reganode(pRExC_state,NGROUPP,num);
10313                     goto insert_if_check_paren;
10314                 }
10315                 else if (strnEQ(RExC_parse, "DEFINE",
10316                                        MIN(DEFINE_len, RExC_end - RExC_parse)))
10317                 {
10318                     ret = reganode(pRExC_state,DEFINEP,0);
10319                     RExC_parse += DEFINE_len;
10320                     is_define = 1;
10321                     goto insert_if_check_paren;
10322                 }
10323                 else if (RExC_parse[0] == 'R') {
10324                     RExC_parse++;
10325                     parno = 0;
10326                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10327                         parno = grok_atou(RExC_parse, &endptr);
10328                         if (endptr)
10329                             RExC_parse = (char*)endptr;
10330                     } else if (RExC_parse[0] == '&') {
10331                         SV *sv_dat;
10332                         RExC_parse++;
10333                         sv_dat = reg_scan_name(pRExC_state,
10334                             SIZE_ONLY
10335                             ? REG_RSN_RETURN_NULL
10336                             : REG_RSN_RETURN_DATA);
10337                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10338                     }
10339                     ret = reganode(pRExC_state,INSUBP,parno);
10340                     goto insert_if_check_paren;
10341                 }
10342                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10343                     /* (?(1)...) */
10344                     char c;
10345                     char *tmp;
10346                     parno = grok_atou(RExC_parse, &endptr);
10347                     if (endptr)
10348                         RExC_parse = (char*)endptr;
10349                     ret = reganode(pRExC_state, GROUPP, parno);
10350
10351                  insert_if_check_paren:
10352                     if (*(tmp = nextchar(pRExC_state)) != ')') {
10353                         /* nextchar also skips comments, so undo its work
10354                          * and skip over the the next character.
10355                          */
10356                         RExC_parse = tmp;
10357                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10358                         vFAIL("Switch condition not recognized");
10359                     }
10360                   insert_if:
10361                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10362                     br = regbranch(pRExC_state, &flags, 1,depth+1);
10363                     if (br == NULL) {
10364                         if (flags & RESTART_UTF8) {
10365                             *flagp = RESTART_UTF8;
10366                             return NULL;
10367                         }
10368                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10369                               (UV) flags);
10370                     } else
10371                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
10372                                                           LONGJMP, 0));
10373                     c = *nextchar(pRExC_state);
10374                     if (flags&HASWIDTH)
10375                         *flagp |= HASWIDTH;
10376                     if (c == '|') {
10377                         if (is_define)
10378                             vFAIL("(?(DEFINE)....) does not allow branches");
10379
10380                         /* Fake one for optimizer.  */
10381                         lastbr = reganode(pRExC_state, IFTHEN, 0);
10382
10383                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10384                             if (flags & RESTART_UTF8) {
10385                                 *flagp = RESTART_UTF8;
10386                                 return NULL;
10387                             }
10388                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10389                                   (UV) flags);
10390                         }
10391                         REGTAIL(pRExC_state, ret, lastbr);
10392                         if (flags&HASWIDTH)
10393                             *flagp |= HASWIDTH;
10394                         c = *nextchar(pRExC_state);
10395                     }
10396                     else
10397                         lastbr = NULL;
10398                     if (c != ')') {
10399                         if (RExC_parse>RExC_end)
10400                             vFAIL("Switch (?(condition)... not terminated");
10401                         else
10402                             vFAIL("Switch (?(condition)... contains too many branches");
10403                     }
10404                     ender = reg_node(pRExC_state, TAIL);
10405                     REGTAIL(pRExC_state, br, ender);
10406                     if (lastbr) {
10407                         REGTAIL(pRExC_state, lastbr, ender);
10408                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10409                     }
10410                     else
10411                         REGTAIL(pRExC_state, ret, ender);
10412                     RExC_size++; /* XXX WHY do we need this?!!
10413                                     For large programs it seems to be required
10414                                     but I can't figure out why. -- dmq*/
10415                     return ret;
10416                 }
10417                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10418                 vFAIL("Unknown switch condition (?(...))");
10419             }
10420             case '[':           /* (?[ ... ]) */
10421                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10422                                          oregcomp_parse);
10423             case 0:
10424                 RExC_parse--; /* for vFAIL to print correctly */
10425                 vFAIL("Sequence (? incomplete");
10426                 break;
10427             default: /* e.g., (?i) */
10428                 --RExC_parse;
10429               parse_flags:
10430                 parse_lparen_question_flags(pRExC_state);
10431                 if (UCHARAT(RExC_parse) != ':') {
10432                     nextchar(pRExC_state);
10433                     *flagp = TRYAGAIN;
10434                     return NULL;
10435                 }
10436                 paren = ':';
10437                 nextchar(pRExC_state);
10438                 ret = NULL;
10439                 goto parse_rest;
10440             } /* end switch */
10441         }
10442         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
10443           capturing_parens:
10444             parno = RExC_npar;
10445             RExC_npar++;
10446
10447             ret = reganode(pRExC_state, OPEN, parno);
10448             if (!SIZE_ONLY ){
10449                 if (!RExC_nestroot)
10450                     RExC_nestroot = parno;
10451                 if (RExC_seen & REG_RECURSE_SEEN
10452                     && !RExC_open_parens[parno-1])
10453                 {
10454                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10455                         "%*s%*s Setting open paren #%"IVdf" to %d\n",
10456                         22, "|    |", (int)(depth * 2 + 1), "",
10457                         (IV)parno, REG_NODE_NUM(ret)));
10458                     RExC_open_parens[parno-1]= ret;
10459                 }
10460             }
10461             Set_Node_Length(ret, 1); /* MJD */
10462             Set_Node_Offset(ret, RExC_parse); /* MJD */
10463             is_open = 1;
10464         } else {
10465             ret = NULL;
10466         }
10467     }
10468     else                        /* ! paren */
10469         ret = NULL;
10470
10471    parse_rest:
10472     /* Pick up the branches, linking them together. */
10473     parse_start = RExC_parse;   /* MJD */
10474     br = regbranch(pRExC_state, &flags, 1,depth+1);
10475
10476     /*     branch_len = (paren != 0); */
10477
10478     if (br == NULL) {
10479         if (flags & RESTART_UTF8) {
10480             *flagp = RESTART_UTF8;
10481             return NULL;
10482         }
10483         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10484     }
10485     if (*RExC_parse == '|') {
10486         if (!SIZE_ONLY && RExC_extralen) {
10487             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10488         }
10489         else {                  /* MJD */
10490             reginsert(pRExC_state, BRANCH, br, depth+1);
10491             Set_Node_Length(br, paren != 0);
10492             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10493         }
10494         have_branch = 1;
10495         if (SIZE_ONLY)
10496             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10497     }
10498     else if (paren == ':') {
10499         *flagp |= flags&SIMPLE;
10500     }
10501     if (is_open) {                              /* Starts with OPEN. */
10502         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10503     }
10504     else if (paren != '?')              /* Not Conditional */
10505         ret = br;
10506     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10507     lastbr = br;
10508     while (*RExC_parse == '|') {
10509         if (!SIZE_ONLY && RExC_extralen) {
10510             ender = reganode(pRExC_state, LONGJMP,0);
10511
10512             /* Append to the previous. */
10513             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10514         }
10515         if (SIZE_ONLY)
10516             RExC_extralen += 2;         /* Account for LONGJMP. */
10517         nextchar(pRExC_state);
10518         if (freeze_paren) {
10519             if (RExC_npar > after_freeze)
10520                 after_freeze = RExC_npar;
10521             RExC_npar = freeze_paren;
10522         }
10523         br = regbranch(pRExC_state, &flags, 0, depth+1);
10524
10525         if (br == NULL) {
10526             if (flags & RESTART_UTF8) {
10527                 *flagp = RESTART_UTF8;
10528                 return NULL;
10529             }
10530             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10531         }
10532         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10533         lastbr = br;
10534         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10535     }
10536
10537     if (have_branch || paren != ':') {
10538         /* Make a closing node, and hook it on the end. */
10539         switch (paren) {
10540         case ':':
10541             ender = reg_node(pRExC_state, TAIL);
10542             break;
10543         case 1: case 2:
10544             ender = reganode(pRExC_state, CLOSE, parno);
10545             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10546                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10547                         "%*s%*s Setting close paren #%"IVdf" to %d\n",
10548                         22, "|    |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10549                 RExC_close_parens[parno-1]= ender;
10550                 if (RExC_nestroot == parno)
10551                     RExC_nestroot = 0;
10552             }
10553             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10554             Set_Node_Length(ender,1); /* MJD */
10555             break;
10556         case '<':
10557         case ',':
10558         case '=':
10559         case '!':
10560             *flagp &= ~HASWIDTH;
10561             /* FALLTHROUGH */
10562         case '>':
10563             ender = reg_node(pRExC_state, SUCCEED);
10564             break;
10565         case 0:
10566             ender = reg_node(pRExC_state, END);
10567             if (!SIZE_ONLY) {
10568                 assert(!RExC_opend); /* there can only be one! */
10569                 RExC_opend = ender;
10570             }
10571             break;
10572         }
10573         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10574             DEBUG_PARSE_MSG("lsbr");
10575             regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10576             regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10577             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10578                           SvPV_nolen_const(RExC_mysv1),
10579                           (IV)REG_NODE_NUM(lastbr),
10580                           SvPV_nolen_const(RExC_mysv2),
10581                           (IV)REG_NODE_NUM(ender),
10582                           (IV)(ender - lastbr)
10583             );
10584         });
10585         REGTAIL(pRExC_state, lastbr, ender);
10586
10587         if (have_branch && !SIZE_ONLY) {
10588             char is_nothing= 1;
10589             if (depth==1)
10590                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10591
10592             /* Hook the tails of the branches to the closing node. */
10593             for (br = ret; br; br = regnext(br)) {
10594                 const U8 op = PL_regkind[OP(br)];
10595                 if (op == BRANCH) {
10596                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10597                     if ( OP(NEXTOPER(br)) != NOTHING
10598                          || regnext(NEXTOPER(br)) != ender)
10599                         is_nothing= 0;
10600                 }
10601                 else if (op == BRANCHJ) {
10602                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10603                     /* for now we always disable this optimisation * /
10604                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10605                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10606                     */
10607                         is_nothing= 0;
10608                 }
10609             }
10610             if (is_nothing) {
10611                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10612                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10613                     DEBUG_PARSE_MSG("NADA");
10614                     regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10615                     regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10616                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10617                                   SvPV_nolen_const(RExC_mysv1),
10618                                   (IV)REG_NODE_NUM(ret),
10619                                   SvPV_nolen_const(RExC_mysv2),
10620                                   (IV)REG_NODE_NUM(ender),
10621                                   (IV)(ender - ret)
10622                     );
10623                 });
10624                 OP(br)= NOTHING;
10625                 if (OP(ender) == TAIL) {
10626                     NEXT_OFF(br)= 0;
10627                     RExC_emit= br + 1;
10628                 } else {
10629                     regnode *opt;
10630                     for ( opt= br + 1; opt < ender ; opt++ )
10631                         OP(opt)= OPTIMIZED;
10632                     NEXT_OFF(br)= ender - br;
10633                 }
10634             }
10635         }
10636     }
10637
10638     {
10639         const char *p;
10640         static const char parens[] = "=!<,>";
10641
10642         if (paren && (p = strchr(parens, paren))) {
10643             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10644             int flag = (p - parens) > 1;
10645
10646             if (paren == '>')
10647                 node = SUSPEND, flag = 0;
10648             reginsert(pRExC_state, node,ret, depth+1);
10649             Set_Node_Cur_Length(ret, parse_start);
10650             Set_Node_Offset(ret, parse_start + 1);
10651             ret->flags = flag;
10652             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10653         }
10654     }
10655
10656     /* Check for proper termination. */
10657     if (paren) {
10658         /* restore original flags, but keep (?p) */
10659         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10660         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10661             RExC_parse = oregcomp_parse;
10662             vFAIL("Unmatched (");
10663         }
10664     }
10665     else if (!paren && RExC_parse < RExC_end) {
10666         if (*RExC_parse == ')') {
10667             RExC_parse++;
10668             vFAIL("Unmatched )");
10669         }
10670         else
10671             FAIL("Junk on end of regexp");      /* "Can't happen". */
10672         NOT_REACHED; /* NOTREACHED */
10673     }
10674
10675     if (RExC_in_lookbehind) {
10676         RExC_in_lookbehind--;
10677     }
10678     if (after_freeze > RExC_npar)
10679         RExC_npar = after_freeze;
10680     return(ret);
10681 }
10682
10683 /*
10684  - regbranch - one alternative of an | operator
10685  *
10686  * Implements the concatenation operator.
10687  *
10688  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10689  * restarted.
10690  */
10691 STATIC regnode *
10692 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10693 {
10694     regnode *ret;
10695     regnode *chain = NULL;
10696     regnode *latest;
10697     I32 flags = 0, c = 0;
10698     GET_RE_DEBUG_FLAGS_DECL;
10699
10700     PERL_ARGS_ASSERT_REGBRANCH;
10701
10702     DEBUG_PARSE("brnc");
10703
10704     if (first)
10705         ret = NULL;
10706     else {
10707         if (!SIZE_ONLY && RExC_extralen)
10708             ret = reganode(pRExC_state, BRANCHJ,0);
10709         else {
10710             ret = reg_node(pRExC_state, BRANCH);
10711             Set_Node_Length(ret, 1);
10712         }
10713     }
10714
10715     if (!first && SIZE_ONLY)
10716         RExC_extralen += 1;                     /* BRANCHJ */
10717
10718     *flagp = WORST;                     /* Tentatively. */
10719
10720     RExC_parse--;
10721     nextchar(pRExC_state);
10722     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10723         flags &= ~TRYAGAIN;
10724         latest = regpiece(pRExC_state, &flags,depth+1);
10725         if (latest == NULL) {
10726             if (flags & TRYAGAIN)
10727                 continue;
10728             if (flags & RESTART_UTF8) {
10729                 *flagp = RESTART_UTF8;
10730                 return NULL;
10731             }
10732             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10733         }
10734         else if (ret == NULL)
10735             ret = latest;
10736         *flagp |= flags&(HASWIDTH|POSTPONED);
10737         if (chain == NULL)      /* First piece. */
10738             *flagp |= flags&SPSTART;
10739         else {
10740             /* FIXME adding one for every branch after the first is probably
10741              * excessive now we have TRIE support. (hv) */
10742             MARK_NAUGHTY(1);
10743             REGTAIL(pRExC_state, chain, latest);
10744         }
10745         chain = latest;
10746         c++;
10747     }
10748     if (chain == NULL) {        /* Loop ran zero times. */
10749         chain = reg_node(pRExC_state, NOTHING);
10750         if (ret == NULL)
10751             ret = chain;
10752     }
10753     if (c == 1) {
10754         *flagp |= flags&SIMPLE;
10755     }
10756
10757     return ret;
10758 }
10759
10760 /*
10761  - regpiece - something followed by possible [*+?]
10762  *
10763  * Note that the branching code sequences used for ? and the general cases
10764  * of * and + are somewhat optimized:  they use the same NOTHING node as
10765  * both the endmarker for their branch list and the body of the last branch.
10766  * It might seem that this node could be dispensed with entirely, but the
10767  * endmarker role is not redundant.
10768  *
10769  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10770  * TRYAGAIN.
10771  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10772  * restarted.
10773  */
10774 STATIC regnode *
10775 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10776 {
10777     regnode *ret;
10778     char op;
10779     char *next;
10780     I32 flags;
10781     const char * const origparse = RExC_parse;
10782     I32 min;
10783     I32 max = REG_INFTY;
10784 #ifdef RE_TRACK_PATTERN_OFFSETS
10785     char *parse_start;
10786 #endif
10787     const char *maxpos = NULL;
10788
10789     /* Save the original in case we change the emitted regop to a FAIL. */
10790     regnode * const orig_emit = RExC_emit;
10791
10792     GET_RE_DEBUG_FLAGS_DECL;
10793
10794     PERL_ARGS_ASSERT_REGPIECE;
10795
10796     DEBUG_PARSE("piec");
10797
10798     ret = regatom(pRExC_state, &flags,depth+1);
10799     if (ret == NULL) {
10800         if (flags & (TRYAGAIN|RESTART_UTF8))
10801             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10802         else
10803             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10804         return(NULL);
10805     }
10806
10807     op = *RExC_parse;
10808
10809     if (op == '{' && regcurly(RExC_parse)) {
10810         maxpos = NULL;
10811 #ifdef RE_TRACK_PATTERN_OFFSETS
10812         parse_start = RExC_parse; /* MJD */
10813 #endif
10814         next = RExC_parse + 1;
10815         while (isDIGIT(*next) || *next == ',') {
10816             if (*next == ',') {
10817                 if (maxpos)
10818                     break;
10819                 else
10820                     maxpos = next;
10821             }
10822             next++;
10823         }
10824         if (*next == '}') {             /* got one */
10825             const char* endptr;
10826             if (!maxpos)
10827                 maxpos = next;
10828             RExC_parse++;
10829             min = grok_atou(RExC_parse, &endptr);
10830             if (*maxpos == ',')
10831                 maxpos++;
10832             else
10833                 maxpos = RExC_parse;
10834             max = grok_atou(maxpos, &endptr);
10835             if (!max && *maxpos != '0')
10836                 max = REG_INFTY;                /* meaning "infinity" */
10837             else if (max >= REG_INFTY)
10838                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10839             RExC_parse = next;
10840             nextchar(pRExC_state);
10841             if (max < min) {    /* If can't match, warn and optimize to fail
10842                                    unconditionally */
10843                 if (SIZE_ONLY) {
10844
10845                     /* We can't back off the size because we have to reserve
10846                      * enough space for all the things we are about to throw
10847                      * away, but we can shrink it by the ammount we are about
10848                      * to re-use here */
10849                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10850                 }
10851                 else {
10852                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10853                     RExC_emit = orig_emit;
10854                 }
10855                 ret = reg_node(pRExC_state, OPFAIL);
10856                 return ret;
10857             }
10858             else if (min == max
10859                      && RExC_parse < RExC_end
10860                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10861             {
10862                 if (PASS2) {
10863                     ckWARN2reg(RExC_parse + 1,
10864                                "Useless use of greediness modifier '%c'",
10865                                *RExC_parse);
10866                 }
10867                 /* Absorb the modifier, so later code doesn't see nor use
10868                     * it */
10869                 nextchar(pRExC_state);
10870             }
10871
10872         do_curly:
10873             if ((flags&SIMPLE)) {
10874                 MARK_NAUGHTY_EXP(2, 2);
10875                 reginsert(pRExC_state, CURLY, ret, depth+1);
10876                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10877                 Set_Node_Cur_Length(ret, parse_start);
10878             }
10879             else {
10880                 regnode * const w = reg_node(pRExC_state, WHILEM);
10881
10882                 w->flags = 0;
10883                 REGTAIL(pRExC_state, ret, w);
10884                 if (!SIZE_ONLY && RExC_extralen) {
10885                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10886                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10887                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10888                 }
10889                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10890                                 /* MJD hk */
10891                 Set_Node_Offset(ret, parse_start+1);
10892                 Set_Node_Length(ret,
10893                                 op == '{' ? (RExC_parse - parse_start) : 1);
10894
10895                 if (!SIZE_ONLY && RExC_extralen)
10896                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10897                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10898                 if (SIZE_ONLY)
10899                     RExC_whilem_seen++, RExC_extralen += 3;
10900                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
10901             }
10902             ret->flags = 0;
10903
10904             if (min > 0)
10905                 *flagp = WORST;
10906             if (max > 0)
10907                 *flagp |= HASWIDTH;
10908             if (!SIZE_ONLY) {
10909                 ARG1_SET(ret, (U16)min);
10910                 ARG2_SET(ret, (U16)max);
10911             }
10912             if (max == REG_INFTY)
10913                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10914
10915             goto nest_check;
10916         }
10917     }
10918
10919     if (!ISMULT1(op)) {
10920         *flagp = flags;
10921         return(ret);
10922     }
10923
10924 #if 0                           /* Now runtime fix should be reliable. */
10925
10926     /* if this is reinstated, don't forget to put this back into perldiag:
10927
10928             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10929
10930            (F) The part of the regexp subject to either the * or + quantifier
10931            could match an empty string. The {#} shows in the regular
10932            expression about where the problem was discovered.
10933
10934     */
10935
10936     if (!(flags&HASWIDTH) && op != '?')
10937       vFAIL("Regexp *+ operand could be empty");
10938 #endif
10939
10940 #ifdef RE_TRACK_PATTERN_OFFSETS
10941     parse_start = RExC_parse;
10942 #endif
10943     nextchar(pRExC_state);
10944
10945     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10946
10947     if (op == '*' && (flags&SIMPLE)) {
10948         reginsert(pRExC_state, STAR, ret, depth+1);
10949         ret->flags = 0;
10950         MARK_NAUGHTY(4);
10951         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10952     }
10953     else if (op == '*') {
10954         min = 0;
10955         goto do_curly;
10956     }
10957     else if (op == '+' && (flags&SIMPLE)) {
10958         reginsert(pRExC_state, PLUS, ret, depth+1);
10959         ret->flags = 0;
10960         MARK_NAUGHTY(3);
10961         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10962     }
10963     else if (op == '+') {
10964         min = 1;
10965         goto do_curly;
10966     }
10967     else if (op == '?') {
10968         min = 0; max = 1;
10969         goto do_curly;
10970     }
10971   nest_check:
10972     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10973         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10974         ckWARN2reg(RExC_parse,
10975                    "%"UTF8f" matches null string many times",
10976                    UTF8fARG(UTF, (RExC_parse >= origparse
10977                                  ? RExC_parse - origparse
10978                                  : 0),
10979                    origparse));
10980         (void)ReREFCNT_inc(RExC_rx_sv);
10981     }
10982
10983     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10984         nextchar(pRExC_state);
10985         reginsert(pRExC_state, MINMOD, ret, depth+1);
10986         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10987     }
10988     else
10989     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10990         regnode *ender;
10991         nextchar(pRExC_state);
10992         ender = reg_node(pRExC_state, SUCCEED);
10993         REGTAIL(pRExC_state, ret, ender);
10994         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10995         ret->flags = 0;
10996         ender = reg_node(pRExC_state, TAIL);
10997         REGTAIL(pRExC_state, ret, ender);
10998     }
10999
11000     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
11001         RExC_parse++;
11002         vFAIL("Nested quantifiers");
11003     }
11004
11005     return(ret);
11006 }
11007
11008 STATIC STRLEN
11009 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
11010                       UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
11011     )
11012 {
11013
11014  /* This is expected to be called by a parser routine that has recognized '\N'
11015    and needs to handle the rest. RExC_parse is expected to point at the first
11016    char following the N at the time of the call.  On successful return,
11017    RExC_parse has been updated to point to just after the sequence identified
11018    by this routine, <*flagp> has been updated, and the non-NULL input pointers
11019    have been set appropriately.
11020
11021    The typical case for this is \N{some character name}.  This is usually
11022    called while parsing the input, filling in or ready to fill in an EXACTish
11023    node, and the code point for the character should be returned, so that it
11024    can be added to the node, and parsing continued with the next input
11025    character.  But it may be that instead of a single character the \N{}
11026    expands to more than one, a named sequence.  In this case any following
11027    quantifier applies to the whole sequence, and it is easier, given the code
11028    structure that calls this, to handle it from a different area of the code.
11029    For this reason, the input parameters can be set so that it returns valid
11030    only on one or the other of these cases.
11031
11032    Another possibility is for the input to be an empty \N{}, which for
11033    backwards compatibility we accept, but generate a NOTHING node which should
11034    later get optimized out.  This is handled from the area of code which can
11035    handle a named sequence, so if called with the parameters for the other, it
11036    fails.
11037
11038    Still another possibility is for the \N to mean [^\n], and not a single
11039    character or explicit sequence at all.  This is determined by context.
11040    Again, this is handled from the area of code which can handle a named
11041    sequence, so if called with the parameters for the other, it also fails.
11042
11043    And the final possibility is for the \N to be called from within a bracketed
11044    character class.  In this case the [^\n] meaning makes no sense, and so is
11045    an error.  Other anomalous situations are left to the calling code to handle.
11046
11047    For non-single-quoted regexes, the tokenizer has attempted to decide which
11048    of the above applies, and in the case of a named sequence, has converted it
11049    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
11050    where c1... are the characters in the sequence.  For single-quoted regexes,
11051    the tokenizer passes the \N sequence through unchanged; this code will not
11052    attempt to determine this nor expand those, instead raising a syntax error.
11053    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
11054    or there is no '}', it signals that this \N occurrence means to match a
11055    non-newline. (This mostly was done because of [perl #56444].)
11056
11057    The API is somewhat convoluted due to historical and the above reasons.
11058
11059    The function raises an error (via vFAIL), and doesn't return for various
11060    syntax errors.  For other failures, it returns (STRLEN) -1.  For successes,
11061    it returns a count of how many characters were accounted for by it.  (This
11062    can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
11063    points in the sequence.  It sets <node_p>, <valuep>, and/or
11064    <substitute_parse> on success.
11065
11066    If <valuep> is non-null, it means the caller can accept an input sequence
11067    consisting of just a single code point; <*valuep> is set to the value of the
11068    only or first code point in the input.
11069
11070    If <substitute_parse> is non-null, it means the caller can accept an input
11071    sequence consisting of one or more code points; <*substitute_parse> is a
11072    newly created mortal SV* in this case, containing \x{} escapes representing
11073    those code points.
11074
11075    Both <valuep> and <substitute_parse> can be non-NULL.
11076
11077    If <node_p> is non-null, <substitute_parse> must be NULL.  This signifies
11078    that the caller can accept any legal sequence other than a single code
11079    point.  To wit, <*node_p> is set as follows:
11080     1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
11081     2) \N{}:              points to a new NOTHING node; return is 0
11082     3) otherwise:         points to a new EXACT node containing the resolved
11083                           string; return is the number of code points in the
11084                           string.  This will never be 1.
11085    Note that failure is returned for single code point sequences if <valuep> is
11086    null and <node_p> is not.
11087  */
11088
11089     char * endbrace;    /* '}' following the name */
11090     char* p;
11091     char *endchar;      /* Points to '.' or '}' ending cur char in the input
11092                            stream */
11093     bool has_multiple_chars; /* true if the input stream contains a sequence of
11094                                 more than one character */
11095     bool in_char_class = substitute_parse != NULL;
11096     STRLEN count = 0;   /* Number of characters in this sequence */
11097
11098     GET_RE_DEBUG_FLAGS_DECL;
11099
11100     PERL_ARGS_ASSERT_GROK_BSLASH_N;
11101
11102     GET_RE_DEBUG_FLAGS;
11103
11104     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
11105     assert(! (node_p && substitute_parse)); /* At most 1 should be set */
11106
11107     /* The [^\n] meaning of \N ignores spaces and comments under the /x
11108      * modifier.  The other meaning does not, so use a temporary until we find
11109      * out which we are being called with */
11110     p = (RExC_flags & RXf_PMf_EXTENDED)
11111         ? regpatws(pRExC_state, RExC_parse,
11112                                 TRUE) /* means recognize comments */
11113         : RExC_parse;
11114
11115     /* Disambiguate between \N meaning a named character versus \N meaning
11116      * [^\n].  The former is assumed when it can't be the latter. */
11117     if (*p != '{' || regcurly(p)) {
11118         RExC_parse = p;
11119         if (! node_p) {
11120             /* no bare \N allowed in a charclass */
11121             if (in_char_class) {
11122                 vFAIL("\\N in a character class must be a named character: \\N{...}");
11123             }
11124             return (STRLEN) -1;
11125         }
11126         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
11127                            current char */
11128         nextchar(pRExC_state);
11129         *node_p = reg_node(pRExC_state, REG_ANY);
11130         *flagp |= HASWIDTH|SIMPLE;
11131         MARK_NAUGHTY(1);
11132         Set_Node_Length(*node_p, 1); /* MJD */
11133         return 1;
11134     }
11135
11136     /* Here, we have decided it should be a named character or sequence */
11137
11138     /* The test above made sure that the next real character is a '{', but
11139      * under the /x modifier, it could be separated by space (or a comment and
11140      * \n) and this is not allowed (for consistency with \x{...} and the
11141      * tokenizer handling of \N{NAME}). */
11142     if (*RExC_parse != '{') {
11143         vFAIL("Missing braces on \\N{}");
11144     }
11145
11146     RExC_parse++;       /* Skip past the '{' */
11147
11148     if (! (endbrace = strchr(RExC_parse, '}'))  /* no trailing brace */
11149         || ! (endbrace == RExC_parse            /* nothing between the {} */
11150               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked... */
11151                   && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11152                                                        error msg) */
11153     {
11154         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
11155         vFAIL("\\N{NAME} must be resolved by the lexer");
11156     }
11157
11158     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11159
11160     if (endbrace == RExC_parse) {   /* empty: \N{} */
11161         if (node_p) {
11162             *node_p = reg_node(pRExC_state,NOTHING);
11163         }
11164         else if (! in_char_class) {
11165             return (STRLEN) -1;
11166         }
11167         nextchar(pRExC_state);
11168         return 0;
11169     }
11170
11171     RExC_parse += 2;    /* Skip past the 'U+' */
11172
11173     endchar = RExC_parse + strcspn(RExC_parse, ".}");
11174
11175     /* Code points are separated by dots.  If none, there is only one code
11176      * point, and is terminated by the brace */
11177     has_multiple_chars = (endchar < endbrace);
11178
11179     /* We get the first code point if we want it, and either there is only one,
11180      * or we can accept both cases of one and there is more than one */
11181     if (valuep && (substitute_parse || ! has_multiple_chars)) {
11182         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
11183         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11184                            | PERL_SCAN_DISALLOW_PREFIX
11185
11186                              /* No errors in the first pass (See [perl
11187                               * #122671].)  We let the code below find the
11188                               * errors when there are multiple chars. */
11189                            | ((SIZE_ONLY || has_multiple_chars)
11190                               ? PERL_SCAN_SILENT_ILLDIGIT
11191                               : 0);
11192
11193         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
11194
11195         /* The tokenizer should have guaranteed validity, but it's possible to
11196          * bypass it by using single quoting, so check.  Don't do the check
11197          * here when there are multiple chars; we do it below anyway. */
11198         if (! has_multiple_chars) {
11199             if (length_of_hex == 0
11200                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11201             {
11202                 RExC_parse += length_of_hex;    /* Includes all the valid */
11203                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
11204                                 ? UTF8SKIP(RExC_parse)
11205                                 : 1;
11206                 /* Guard against malformed utf8 */
11207                 if (RExC_parse >= endchar) {
11208                     RExC_parse = endchar;
11209                 }
11210                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11211             }
11212
11213             RExC_parse = endbrace + 1;
11214             return 1;
11215         }
11216     }
11217
11218     /* Here, we should have already handled the case where a single character
11219      * is expected and found.  So it is a failure if we aren't expecting
11220      * multiple chars and got them; or didn't get them but wanted them.  We
11221      * fail without advancing the parse, so that the caller can try again with
11222      * different acceptance criteria */
11223     if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
11224         RExC_parse = p;
11225         return (STRLEN) -1;
11226     }
11227
11228     {
11229         /* What is done here is to convert this to a sub-pattern of the form
11230          * \x{char1}\x{char2}...
11231          * and then either return it in <*substitute_parse> if non-null; or
11232          * call reg recursively to parse it (enclosing in "(?: ... )" ).  That
11233          * way, it retains its atomicness, while not having to worry about
11234          * special handling that some code points may have.  toke.c has
11235          * converted the original Unicode values to native, so that we can just
11236          * pass on the hex values unchanged.  We do have to set a flag to keep
11237          * recoding from happening in the recursion */
11238
11239         SV * dummy = NULL;
11240         STRLEN len;
11241         char *orig_end = RExC_end;
11242         I32 flags;
11243
11244         if (substitute_parse) {
11245             *substitute_parse = newSVpvs("");
11246         }
11247         else {
11248             substitute_parse = &dummy;
11249             *substitute_parse = newSVpvs("?:");
11250         }
11251         *substitute_parse = sv_2mortal(*substitute_parse);
11252
11253         while (RExC_parse < endbrace) {
11254
11255             /* Convert to notation the rest of the code understands */
11256             sv_catpv(*substitute_parse, "\\x{");
11257             sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
11258             sv_catpv(*substitute_parse, "}");
11259
11260             /* Point to the beginning of the next character in the sequence. */
11261             RExC_parse = endchar + 1;
11262             endchar = RExC_parse + strcspn(RExC_parse, ".}");
11263
11264             count++;
11265         }
11266         if (! in_char_class) {
11267             sv_catpv(*substitute_parse, ")");
11268         }
11269
11270         RExC_parse = SvPV(*substitute_parse, len);
11271
11272         /* Don't allow empty number */
11273         if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
11274             RExC_parse = endbrace;
11275             vFAIL("Invalid hexadecimal number in \\N{U+...}");
11276         }
11277         RExC_end = RExC_parse + len;
11278
11279         /* The values are Unicode, and therefore not subject to recoding */
11280         RExC_override_recoding = 1;
11281
11282         if (node_p) {
11283             if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11284                 if (flags & RESTART_UTF8) {
11285                     *flagp = RESTART_UTF8;
11286                     return (STRLEN) -1;
11287                 }
11288                 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11289                     (UV) flags);
11290             }
11291             *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11292         }
11293
11294         RExC_parse = endbrace;
11295         RExC_end = orig_end;
11296         RExC_override_recoding = 0;
11297
11298         nextchar(pRExC_state);
11299     }
11300
11301     return count;
11302 }
11303
11304
11305 /*
11306  * reg_recode
11307  *
11308  * It returns the code point in utf8 for the value in *encp.
11309  *    value: a code value in the source encoding
11310  *    encp:  a pointer to an Encode object
11311  *
11312  * If the result from Encode is not a single character,
11313  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11314  */
11315 STATIC UV
11316 S_reg_recode(pTHX_ const char value, SV **encp)
11317 {
11318     STRLEN numlen = 1;
11319     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11320     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11321     const STRLEN newlen = SvCUR(sv);
11322     UV uv = UNICODE_REPLACEMENT;
11323
11324     PERL_ARGS_ASSERT_REG_RECODE;
11325
11326     if (newlen)
11327         uv = SvUTF8(sv)
11328              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11329              : *(U8*)s;
11330
11331     if (!newlen || numlen != newlen) {
11332         uv = UNICODE_REPLACEMENT;
11333         *encp = NULL;
11334     }
11335     return uv;
11336 }
11337
11338 PERL_STATIC_INLINE U8
11339 S_compute_EXACTish(RExC_state_t *pRExC_state)
11340 {
11341     U8 op;
11342
11343     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11344
11345     if (! FOLD) {
11346         return (LOC)
11347                 ? EXACTL
11348                 : EXACT;
11349     }
11350
11351     op = get_regex_charset(RExC_flags);
11352     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11353         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11354                  been, so there is no hole */
11355     }
11356
11357     return op + EXACTF;
11358 }
11359
11360 PERL_STATIC_INLINE void
11361 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11362                          regnode *node, I32* flagp, STRLEN len, UV code_point,
11363                          bool downgradable)
11364 {
11365     /* This knows the details about sizing an EXACTish node, setting flags for
11366      * it (by setting <*flagp>, and potentially populating it with a single
11367      * character.
11368      *
11369      * If <len> (the length in bytes) is non-zero, this function assumes that
11370      * the node has already been populated, and just does the sizing.  In this
11371      * case <code_point> should be the final code point that has already been
11372      * placed into the node.  This value will be ignored except that under some
11373      * circumstances <*flagp> is set based on it.
11374      *
11375      * If <len> is zero, the function assumes that the node is to contain only
11376      * the single character given by <code_point> and calculates what <len>
11377      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11378      * additionally will populate the node's STRING with <code_point> or its
11379      * fold if folding.
11380      *
11381      * In both cases <*flagp> is appropriately set
11382      *
11383      * It knows that under FOLD, the Latin Sharp S and UTF characters above
11384      * 255, must be folded (the former only when the rules indicate it can
11385      * match 'ss')
11386      *
11387      * When it does the populating, it looks at the flag 'downgradable'.  If
11388      * true with a node that folds, it checks if the single code point
11389      * participates in a fold, and if not downgrades the node to an EXACT.
11390      * This helps the optimizer */
11391
11392     bool len_passed_in = cBOOL(len != 0);
11393     U8 character[UTF8_MAXBYTES_CASE+1];
11394
11395     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11396
11397     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11398      * sizing difference, and is extra work that is thrown away */
11399     if (downgradable && ! PASS2) {
11400         downgradable = FALSE;
11401     }
11402
11403     if (! len_passed_in) {
11404         if (UTF) {
11405             if (UVCHR_IS_INVARIANT(code_point)) {
11406                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11407                     *character = (U8) code_point;
11408                 }
11409                 else { /* Here is /i and not /l. (toFOLD() is defined on just
11410                           ASCII, which isn't the same thing as INVARIANT on
11411                           EBCDIC, but it works there, as the extra invariants
11412                           fold to themselves) */
11413                     *character = toFOLD((U8) code_point);
11414
11415                     /* We can downgrade to an EXACT node if this character
11416                      * isn't a folding one.  Note that this assumes that
11417                      * nothing above Latin1 folds to some other invariant than
11418                      * one of these alphabetics; otherwise we would also have
11419                      * to check:
11420                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11421                      *      || ASCII_FOLD_RESTRICTED))
11422                      */
11423                     if (downgradable && PL_fold[code_point] == code_point) {
11424                         OP(node) = EXACT;
11425                     }
11426                 }
11427                 len = 1;
11428             }
11429             else if (FOLD && (! LOC
11430                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11431             {   /* Folding, and ok to do so now */
11432                 UV folded = _to_uni_fold_flags(
11433                                    code_point,
11434                                    character,
11435                                    &len,
11436                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11437                                                       ? FOLD_FLAGS_NOMIX_ASCII
11438                                                       : 0));
11439                 if (downgradable
11440                     && folded == code_point /* This quickly rules out many
11441                                                cases, avoiding the
11442                                                _invlist_contains_cp() overhead
11443                                                for those.  */
11444                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11445                 {
11446                     OP(node) = (LOC)
11447                                ? EXACTL
11448                                : EXACT;
11449                 }
11450             }
11451             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11452
11453                 /* Not folding this cp, and can output it directly */
11454                 *character = UTF8_TWO_BYTE_HI(code_point);
11455                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11456                 len = 2;
11457             }
11458             else {
11459                 uvchr_to_utf8( character, code_point);
11460                 len = UTF8SKIP(character);
11461             }
11462         } /* Else pattern isn't UTF8.  */
11463         else if (! FOLD) {
11464             *character = (U8) code_point;
11465             len = 1;
11466         } /* Else is folded non-UTF8 */
11467         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11468
11469             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11470              * comments at join_exact()); */
11471             *character = (U8) code_point;
11472             len = 1;
11473
11474             /* Can turn into an EXACT node if we know the fold at compile time,
11475              * and it folds to itself and doesn't particpate in other folds */
11476             if (downgradable
11477                 && ! LOC
11478                 && PL_fold_latin1[code_point] == code_point
11479                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11480                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11481             {
11482                 OP(node) = EXACT;
11483             }
11484         } /* else is Sharp s.  May need to fold it */
11485         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11486             *character = 's';
11487             *(character + 1) = 's';
11488             len = 2;
11489         }
11490         else {
11491             *character = LATIN_SMALL_LETTER_SHARP_S;
11492             len = 1;
11493         }
11494     }
11495
11496     if (SIZE_ONLY) {
11497         RExC_size += STR_SZ(len);
11498     }
11499     else {
11500         RExC_emit += STR_SZ(len);
11501         STR_LEN(node) = len;
11502         if (! len_passed_in) {
11503             Copy((char *) character, STRING(node), len, char);
11504         }
11505     }
11506
11507     *flagp |= HASWIDTH;
11508
11509     /* A single character node is SIMPLE, except for the special-cased SHARP S
11510      * under /di. */
11511     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11512         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11513             || ! FOLD || ! DEPENDS_SEMANTICS))
11514     {
11515         *flagp |= SIMPLE;
11516     }
11517
11518     /* The OP may not be well defined in PASS1 */
11519     if (PASS2 && OP(node) == EXACTFL) {
11520         RExC_contains_locale = 1;
11521     }
11522 }
11523
11524
11525 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11526  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11527
11528 static I32
11529 S_backref_value(char *p)
11530 {
11531     const char* endptr;
11532     UV val = grok_atou(p, &endptr);
11533     if (endptr == p || endptr == NULL || val > I32_MAX)
11534         return I32_MAX;
11535     return (I32)val;
11536 }
11537
11538
11539 /*
11540  - regatom - the lowest level
11541
11542    Try to identify anything special at the start of the pattern. If there
11543    is, then handle it as required. This may involve generating a single regop,
11544    such as for an assertion; or it may involve recursing, such as to
11545    handle a () structure.
11546
11547    If the string doesn't start with something special then we gobble up
11548    as much literal text as we can.
11549
11550    Once we have been able to handle whatever type of thing started the
11551    sequence, we return.
11552
11553    Note: we have to be careful with escapes, as they can be both literal
11554    and special, and in the case of \10 and friends, context determines which.
11555
11556    A summary of the code structure is:
11557
11558    switch (first_byte) {
11559         cases for each special:
11560             handle this special;
11561             break;
11562         case '\\':
11563             switch (2nd byte) {
11564                 cases for each unambiguous special:
11565                     handle this special;
11566                     break;
11567                 cases for each ambigous special/literal:
11568                     disambiguate;
11569                     if (special)  handle here
11570                     else goto defchar;
11571                 default: // unambiguously literal:
11572                     goto defchar;
11573             }
11574         default:  // is a literal char
11575             // FALL THROUGH
11576         defchar:
11577             create EXACTish node for literal;
11578             while (more input and node isn't full) {
11579                 switch (input_byte) {
11580                    cases for each special;
11581                        make sure parse pointer is set so that the next call to
11582                            regatom will see this special first
11583                        goto loopdone; // EXACTish node terminated by prev. char
11584                    default:
11585                        append char to EXACTISH node;
11586                 }
11587                 get next input byte;
11588             }
11589         loopdone:
11590    }
11591    return the generated node;
11592
11593    Specifically there are two separate switches for handling
11594    escape sequences, with the one for handling literal escapes requiring
11595    a dummy entry for all of the special escapes that are actually handled
11596    by the other.
11597
11598    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11599    TRYAGAIN.
11600    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11601    restarted.
11602    Otherwise does not return NULL.
11603 */
11604
11605 STATIC regnode *
11606 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11607 {
11608     regnode *ret = NULL;
11609     I32 flags = 0;
11610     char *parse_start = RExC_parse;
11611     U8 op;
11612     int invert = 0;
11613     U8 arg;
11614
11615     GET_RE_DEBUG_FLAGS_DECL;
11616
11617     *flagp = WORST;             /* Tentatively. */
11618
11619     DEBUG_PARSE("atom");
11620
11621     PERL_ARGS_ASSERT_REGATOM;
11622
11623 tryagain:
11624     switch ((U8)*RExC_parse) {
11625     case '^':
11626         RExC_seen_zerolen++;
11627         nextchar(pRExC_state);
11628         if (RExC_flags & RXf_PMf_MULTILINE)
11629             ret = reg_node(pRExC_state, MBOL);
11630         else
11631             ret = reg_node(pRExC_state, SBOL);
11632         Set_Node_Length(ret, 1); /* MJD */
11633         break;
11634     case '$':
11635         nextchar(pRExC_state);
11636         if (*RExC_parse)
11637             RExC_seen_zerolen++;
11638         if (RExC_flags & RXf_PMf_MULTILINE)
11639             ret = reg_node(pRExC_state, MEOL);
11640         else
11641             ret = reg_node(pRExC_state, SEOL);
11642         Set_Node_Length(ret, 1); /* MJD */
11643         break;
11644     case '.':
11645         nextchar(pRExC_state);
11646         if (RExC_flags & RXf_PMf_SINGLELINE)
11647             ret = reg_node(pRExC_state, SANY);
11648         else
11649             ret = reg_node(pRExC_state, REG_ANY);
11650         *flagp |= HASWIDTH|SIMPLE;
11651         MARK_NAUGHTY(1);
11652         Set_Node_Length(ret, 1); /* MJD */
11653         break;
11654     case '[':
11655     {
11656         char * const oregcomp_parse = ++RExC_parse;
11657         ret = regclass(pRExC_state, flagp,depth+1,
11658                        FALSE, /* means parse the whole char class */
11659                        TRUE, /* allow multi-char folds */
11660                        FALSE, /* don't silence non-portable warnings. */
11661                        NULL);
11662         if (*RExC_parse != ']') {
11663             RExC_parse = oregcomp_parse;
11664             vFAIL("Unmatched [");
11665         }
11666         if (ret == NULL) {
11667             if (*flagp & RESTART_UTF8)
11668                 return NULL;
11669             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11670                   (UV) *flagp);
11671         }
11672         nextchar(pRExC_state);
11673         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11674         break;
11675     }
11676     case '(':
11677         nextchar(pRExC_state);
11678         ret = reg(pRExC_state, 2, &flags,depth+1);
11679         if (ret == NULL) {
11680                 if (flags & TRYAGAIN) {
11681                     if (RExC_parse == RExC_end) {
11682                          /* Make parent create an empty node if needed. */
11683                         *flagp |= TRYAGAIN;
11684                         return(NULL);
11685                     }
11686                     goto tryagain;
11687                 }
11688                 if (flags & RESTART_UTF8) {
11689                     *flagp = RESTART_UTF8;
11690                     return NULL;
11691                 }
11692                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11693                                                                  (UV) flags);
11694         }
11695         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11696         break;
11697     case '|':
11698     case ')':
11699         if (flags & TRYAGAIN) {
11700             *flagp |= TRYAGAIN;
11701             return NULL;
11702         }
11703         vFAIL("Internal urp");
11704                                 /* Supposed to be caught earlier. */
11705         break;
11706     case '?':
11707     case '+':
11708     case '*':
11709         RExC_parse++;
11710         vFAIL("Quantifier follows nothing");
11711         break;
11712     case '\\':
11713         /* Special Escapes
11714
11715            This switch handles escape sequences that resolve to some kind
11716            of special regop and not to literal text. Escape sequnces that
11717            resolve to literal text are handled below in the switch marked
11718            "Literal Escapes".
11719
11720            Every entry in this switch *must* have a corresponding entry
11721            in the literal escape switch. However, the opposite is not
11722            required, as the default for this switch is to jump to the
11723            literal text handling code.
11724         */
11725         switch ((U8)*++RExC_parse) {
11726         /* Special Escapes */
11727         case 'A':
11728             RExC_seen_zerolen++;
11729             ret = reg_node(pRExC_state, SBOL);
11730             /* SBOL is shared with /^/ so we set the flags so we can tell
11731              * /\A/ from /^/ in split. We check ret because first pass we
11732              * have no regop struct to set the flags on. */
11733             if (PASS2)
11734                 ret->flags = 1;
11735             *flagp |= SIMPLE;
11736             goto finish_meta_pat;
11737         case 'G':
11738             ret = reg_node(pRExC_state, GPOS);
11739             RExC_seen |= REG_GPOS_SEEN;
11740             *flagp |= SIMPLE;
11741             goto finish_meta_pat;
11742         case 'K':
11743             RExC_seen_zerolen++;
11744             ret = reg_node(pRExC_state, KEEPS);
11745             *flagp |= SIMPLE;
11746             /* XXX:dmq : disabling in-place substitution seems to
11747              * be necessary here to avoid cases of memory corruption, as
11748              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11749              */
11750             RExC_seen |= REG_LOOKBEHIND_SEEN;
11751             goto finish_meta_pat;
11752         case 'Z':
11753             ret = reg_node(pRExC_state, SEOL);
11754             *flagp |= SIMPLE;
11755             RExC_seen_zerolen++;                /* Do not optimize RE away */
11756             goto finish_meta_pat;
11757         case 'z':
11758             ret = reg_node(pRExC_state, EOS);
11759             *flagp |= SIMPLE;
11760             RExC_seen_zerolen++;                /* Do not optimize RE away */
11761             goto finish_meta_pat;
11762         case 'C':
11763             ret = reg_node(pRExC_state, CANY);
11764             RExC_seen |= REG_CANY_SEEN;
11765             *flagp |= HASWIDTH|SIMPLE;
11766             if (PASS2) {
11767                 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11768             }
11769             goto finish_meta_pat;
11770         case 'X':
11771             ret = reg_node(pRExC_state, CLUMP);
11772             *flagp |= HASWIDTH;
11773             goto finish_meta_pat;
11774
11775         case 'W':
11776             invert = 1;
11777             /* FALLTHROUGH */
11778         case 'w':
11779             arg = ANYOF_WORDCHAR;
11780             goto join_posix;
11781
11782         case 'b':
11783             RExC_seen_zerolen++;
11784             RExC_seen |= REG_LOOKBEHIND_SEEN;
11785             op = BOUND + get_regex_charset(RExC_flags);
11786             if (op > BOUNDA) {  /* /aa is same as /a */
11787                 op = BOUNDA;
11788             }
11789             else if (op == BOUNDL) {
11790                 RExC_contains_locale = 1;
11791             }
11792             ret = reg_node(pRExC_state, op);
11793             FLAGS(ret) = get_regex_charset(RExC_flags);
11794             *flagp |= SIMPLE;
11795             if ((U8) *(RExC_parse + 1) == '{') {
11796                 /* diag_listed_as: Use "%s" instead of "%s" */
11797                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11798             }
11799             goto finish_meta_pat;
11800         case 'B':
11801             RExC_seen_zerolen++;
11802             RExC_seen |= REG_LOOKBEHIND_SEEN;
11803             op = NBOUND + get_regex_charset(RExC_flags);
11804             if (op > NBOUNDA) { /* /aa is same as /a */
11805                 op = NBOUNDA;
11806             }
11807             else if (op == NBOUNDL) {
11808                 RExC_contains_locale = 1;
11809             }
11810             ret = reg_node(pRExC_state, op);
11811             FLAGS(ret) = get_regex_charset(RExC_flags);
11812             *flagp |= SIMPLE;
11813             if ((U8) *(RExC_parse + 1) == '{') {
11814                 /* diag_listed_as: Use "%s" instead of "%s" */
11815                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11816             }
11817             goto finish_meta_pat;
11818
11819         case 'D':
11820             invert = 1;
11821             /* FALLTHROUGH */
11822         case 'd':
11823             arg = ANYOF_DIGIT;
11824             goto join_posix;
11825
11826         case 'R':
11827             ret = reg_node(pRExC_state, LNBREAK);
11828             *flagp |= HASWIDTH|SIMPLE;
11829             goto finish_meta_pat;
11830
11831         case 'H':
11832             invert = 1;
11833             /* FALLTHROUGH */
11834         case 'h':
11835             arg = ANYOF_BLANK;
11836             op = POSIXU;
11837             goto join_posix_op_known;
11838
11839         case 'V':
11840             invert = 1;
11841             /* FALLTHROUGH */
11842         case 'v':
11843             arg = ANYOF_VERTWS;
11844             op = POSIXU;
11845             goto join_posix_op_known;
11846
11847         case 'S':
11848             invert = 1;
11849             /* FALLTHROUGH */
11850         case 's':
11851             arg = ANYOF_SPACE;
11852
11853         join_posix:
11854
11855             op = POSIXD + get_regex_charset(RExC_flags);
11856             if (op > POSIXA) {  /* /aa is same as /a */
11857                 op = POSIXA;
11858             }
11859             else if (op == POSIXL) {
11860                 RExC_contains_locale = 1;
11861             }
11862
11863         join_posix_op_known:
11864
11865             if (invert) {
11866                 op += NPOSIXD - POSIXD;
11867             }
11868
11869             ret = reg_node(pRExC_state, op);
11870             if (! SIZE_ONLY) {
11871                 FLAGS(ret) = namedclass_to_classnum(arg);
11872             }
11873
11874             *flagp |= HASWIDTH|SIMPLE;
11875             /* FALLTHROUGH */
11876
11877          finish_meta_pat:
11878             nextchar(pRExC_state);
11879             Set_Node_Length(ret, 2); /* MJD */
11880             break;
11881         case 'p':
11882         case 'P':
11883             {
11884 #ifdef DEBUGGING
11885                 char* parse_start = RExC_parse - 2;
11886 #endif
11887
11888                 RExC_parse--;
11889
11890                 ret = regclass(pRExC_state, flagp,depth+1,
11891                                TRUE, /* means just parse this element */
11892                                FALSE, /* don't allow multi-char folds */
11893                                FALSE, /* don't silence non-portable warnings.
11894                                          It would be a bug if these returned
11895                                          non-portables */
11896                                NULL);
11897                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11898                    are allowed.  */
11899                 if (!ret)
11900                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11901                           (UV) *flagp);
11902
11903                 RExC_parse--;
11904
11905                 Set_Node_Offset(ret, parse_start + 2);
11906                 Set_Node_Cur_Length(ret, parse_start);
11907                 nextchar(pRExC_state);
11908             }
11909             break;
11910         case 'N':
11911             /* Handle \N and \N{NAME} with multiple code points here and not
11912              * below because it can be multicharacter. join_exact() will join
11913              * them up later on.  Also this makes sure that things like
11914              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11915              * The options to the grok function call causes it to fail if the
11916              * sequence is just a single code point.  We then go treat it as
11917              * just another character in the current EXACT node, and hence it
11918              * gets uniform treatment with all the other characters.  The
11919              * special treatment for quantifiers is not needed for such single
11920              * character sequences */
11921             ++RExC_parse;
11922             if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11923                                              depth, FALSE))
11924             {
11925                 if (*flagp & RESTART_UTF8)
11926                     return NULL;
11927                 RExC_parse--;
11928                 goto defchar;
11929             }
11930             break;
11931         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11932         parse_named_seq:
11933         {
11934             char ch= RExC_parse[1];
11935             if (ch != '<' && ch != '\'' && ch != '{') {
11936                 RExC_parse++;
11937                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11938                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11939             } else {
11940                 /* this pretty much dupes the code for (?P=...) in reg(), if
11941                    you change this make sure you change that */
11942                 char* name_start = (RExC_parse += 2);
11943                 U32 num = 0;
11944                 SV *sv_dat = reg_scan_name(pRExC_state,
11945                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11946                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11947                 if (RExC_parse == name_start || *RExC_parse != ch)
11948                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11949                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11950
11951                 if (!SIZE_ONLY) {
11952                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11953                     RExC_rxi->data->data[num]=(void*)sv_dat;
11954                     SvREFCNT_inc_simple_void(sv_dat);
11955                 }
11956
11957                 RExC_sawback = 1;
11958                 ret = reganode(pRExC_state,
11959                                ((! FOLD)
11960                                  ? NREF
11961                                  : (ASCII_FOLD_RESTRICTED)
11962                                    ? NREFFA
11963                                    : (AT_LEAST_UNI_SEMANTICS)
11964                                      ? NREFFU
11965                                      : (LOC)
11966                                        ? NREFFL
11967                                        : NREFF),
11968                                 num);
11969                 *flagp |= HASWIDTH;
11970
11971                 /* override incorrect value set in reganode MJD */
11972                 Set_Node_Offset(ret, parse_start+1);
11973                 Set_Node_Cur_Length(ret, parse_start);
11974                 nextchar(pRExC_state);
11975
11976             }
11977             break;
11978         }
11979         case 'g':
11980         case '1': case '2': case '3': case '4':
11981         case '5': case '6': case '7': case '8': case '9':
11982             {
11983                 I32 num;
11984                 bool hasbrace = 0;
11985
11986                 if (*RExC_parse == 'g') {
11987                     bool isrel = 0;
11988
11989                     RExC_parse++;
11990                     if (*RExC_parse == '{') {
11991                         RExC_parse++;
11992                         hasbrace = 1;
11993                     }
11994                     if (*RExC_parse == '-') {
11995                         RExC_parse++;
11996                         isrel = 1;
11997                     }
11998                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11999                         if (isrel) RExC_parse--;
12000                         RExC_parse -= 2;
12001                         goto parse_named_seq;
12002                     }
12003
12004                     num = S_backref_value(RExC_parse);
12005                     if (num == 0)
12006                         vFAIL("Reference to invalid group 0");
12007                     else if (num == I32_MAX) {
12008                          if (isDIGIT(*RExC_parse))
12009                             vFAIL("Reference to nonexistent group");
12010                         else
12011                             vFAIL("Unterminated \\g... pattern");
12012                     }
12013
12014                     if (isrel) {
12015                         num = RExC_npar - num;
12016                         if (num < 1)
12017                             vFAIL("Reference to nonexistent or unclosed group");
12018                     }
12019                 }
12020                 else {
12021                     num = S_backref_value(RExC_parse);
12022                     /* bare \NNN might be backref or octal - if it is larger than or equal
12023                      * RExC_npar then it is assumed to be and octal escape.
12024                      * Note RExC_npar is +1 from the actual number of parens*/
12025                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
12026                             && *RExC_parse != '8' && *RExC_parse != '9'))
12027                     {
12028                         /* Probably a character specified in octal, e.g. \35 */
12029                         goto defchar;
12030                     }
12031                 }
12032
12033                 /* at this point RExC_parse definitely points to a backref
12034                  * number */
12035                 {
12036 #ifdef RE_TRACK_PATTERN_OFFSETS
12037                     char * const parse_start = RExC_parse - 1; /* MJD */
12038 #endif
12039                     while (isDIGIT(*RExC_parse))
12040                         RExC_parse++;
12041                     if (hasbrace) {
12042                         if (*RExC_parse != '}')
12043                             vFAIL("Unterminated \\g{...} pattern");
12044                         RExC_parse++;
12045                     }
12046                     if (!SIZE_ONLY) {
12047                         if (num > (I32)RExC_rx->nparens)
12048                             vFAIL("Reference to nonexistent group");
12049                     }
12050                     RExC_sawback = 1;
12051                     ret = reganode(pRExC_state,
12052                                    ((! FOLD)
12053                                      ? REF
12054                                      : (ASCII_FOLD_RESTRICTED)
12055                                        ? REFFA
12056                                        : (AT_LEAST_UNI_SEMANTICS)
12057                                          ? REFFU
12058                                          : (LOC)
12059                                            ? REFFL
12060                                            : REFF),
12061                                     num);
12062                     *flagp |= HASWIDTH;
12063
12064                     /* override incorrect value set in reganode MJD */
12065                     Set_Node_Offset(ret, parse_start+1);
12066                     Set_Node_Cur_Length(ret, parse_start);
12067                     RExC_parse--;
12068                     nextchar(pRExC_state);
12069                 }
12070             }
12071             break;
12072         case '\0':
12073             if (RExC_parse >= RExC_end)
12074                 FAIL("Trailing \\");
12075             /* FALLTHROUGH */
12076         default:
12077             /* Do not generate "unrecognized" warnings here, we fall
12078                back into the quick-grab loop below */
12079             parse_start--;
12080             goto defchar;
12081         }
12082         break;
12083
12084     case '#':
12085         if (RExC_flags & RXf_PMf_EXTENDED) {
12086             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12087             if (RExC_parse < RExC_end)
12088                 goto tryagain;
12089         }
12090         /* FALLTHROUGH */
12091
12092     default:
12093
12094             parse_start = RExC_parse - 1;
12095
12096             RExC_parse++;
12097
12098         defchar: {
12099             STRLEN len = 0;
12100             UV ender = 0;
12101             char *p;
12102             char *s;
12103 #define MAX_NODE_STRING_SIZE 127
12104             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12105             char *s0;
12106             U8 upper_parse = MAX_NODE_STRING_SIZE;
12107             U8 node_type = compute_EXACTish(pRExC_state);
12108             bool next_is_quantifier;
12109             char * oldp = NULL;
12110
12111             /* We can convert EXACTF nodes to EXACTFU if they contain only
12112              * characters that match identically regardless of the target
12113              * string's UTF8ness.  The reason to do this is that EXACTF is not
12114              * trie-able, EXACTFU is.
12115              *
12116              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12117              * contain only above-Latin1 characters (hence must be in UTF8),
12118              * which don't participate in folds with Latin1-range characters,
12119              * as the latter's folds aren't known until runtime.  (We don't
12120              * need to figure this out until pass 2) */
12121             bool maybe_exactfu = PASS2
12122                                && (node_type == EXACTF || node_type == EXACTFL);
12123
12124             /* If a folding node contains only code points that don't
12125              * participate in folds, it can be changed into an EXACT node,
12126              * which allows the optimizer more things to look for */
12127             bool maybe_exact;
12128
12129             ret = reg_node(pRExC_state, node_type);
12130
12131             /* In pass1, folded, we use a temporary buffer instead of the
12132              * actual node, as the node doesn't exist yet */
12133             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12134
12135             s0 = s;
12136
12137         reparse:
12138
12139             /* We do the EXACTFish to EXACT node only if folding.  (And we
12140              * don't need to figure this out until pass 2) */
12141             maybe_exact = FOLD && PASS2;
12142
12143             /* XXX The node can hold up to 255 bytes, yet this only goes to
12144              * 127.  I (khw) do not know why.  Keeping it somewhat less than
12145              * 255 allows us to not have to worry about overflow due to
12146              * converting to utf8 and fold expansion, but that value is
12147              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
12148              * split up by this limit into a single one using the real max of
12149              * 255.  Even at 127, this breaks under rare circumstances.  If
12150              * folding, we do not want to split a node at a character that is a
12151              * non-final in a multi-char fold, as an input string could just
12152              * happen to want to match across the node boundary.  The join
12153              * would solve that problem if the join actually happens.  But a
12154              * series of more than two nodes in a row each of 127 would cause
12155              * the first join to succeed to get to 254, but then there wouldn't
12156              * be room for the next one, which could at be one of those split
12157              * multi-char folds.  I don't know of any fool-proof solution.  One
12158              * could back off to end with only a code point that isn't such a
12159              * non-final, but it is possible for there not to be any in the
12160              * entire node. */
12161             for (p = RExC_parse - 1;
12162                  len < upper_parse && p < RExC_end;
12163                  len++)
12164             {
12165                 oldp = p;
12166
12167                 if (RExC_flags & RXf_PMf_EXTENDED)
12168                     p = regpatws(pRExC_state, p,
12169                                           TRUE); /* means recognize comments */
12170                 switch ((U8)*p) {
12171                 case '^':
12172                 case '$':
12173                 case '.':
12174                 case '[':
12175                 case '(':
12176                 case ')':
12177                 case '|':
12178                     goto loopdone;
12179                 case '\\':
12180                     /* Literal Escapes Switch
12181
12182                        This switch is meant to handle escape sequences that
12183                        resolve to a literal character.
12184
12185                        Every escape sequence that represents something
12186                        else, like an assertion or a char class, is handled
12187                        in the switch marked 'Special Escapes' above in this
12188                        routine, but also has an entry here as anything that
12189                        isn't explicitly mentioned here will be treated as
12190                        an unescaped equivalent literal.
12191                     */
12192
12193                     switch ((U8)*++p) {
12194                     /* These are all the special escapes. */
12195                     case 'A':             /* Start assertion */
12196                     case 'b': case 'B':   /* Word-boundary assertion*/
12197                     case 'C':             /* Single char !DANGEROUS! */
12198                     case 'd': case 'D':   /* digit class */
12199                     case 'g': case 'G':   /* generic-backref, pos assertion */
12200                     case 'h': case 'H':   /* HORIZWS */
12201                     case 'k': case 'K':   /* named backref, keep marker */
12202                     case 'p': case 'P':   /* Unicode property */
12203                               case 'R':   /* LNBREAK */
12204                     case 's': case 'S':   /* space class */
12205                     case 'v': case 'V':   /* VERTWS */
12206                     case 'w': case 'W':   /* word class */
12207                     case 'X':             /* eXtended Unicode "combining
12208                                              character sequence" */
12209                     case 'z': case 'Z':   /* End of line/string assertion */
12210                         --p;
12211                         goto loopdone;
12212
12213                     /* Anything after here is an escape that resolves to a
12214                        literal. (Except digits, which may or may not)
12215                      */
12216                     case 'n':
12217                         ender = '\n';
12218                         p++;
12219                         break;
12220                     case 'N': /* Handle a single-code point named character. */
12221                         /* The options cause it to fail if a multiple code
12222                          * point sequence.  Handle those in the switch() above
12223                          * */
12224                         RExC_parse = p + 1;
12225                         if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
12226                                                          &ender,
12227                                                          flagp,
12228                                                          depth,
12229                                                          FALSE
12230                         )) {
12231                             if (*flagp & RESTART_UTF8)
12232                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12233                             RExC_parse = p = oldp;
12234                             goto loopdone;
12235                         }
12236                         p = RExC_parse;
12237                         if (ender > 0xff) {
12238                             REQUIRE_UTF8;
12239                         }
12240                         break;
12241                     case 'r':
12242                         ender = '\r';
12243                         p++;
12244                         break;
12245                     case 't':
12246                         ender = '\t';
12247                         p++;
12248                         break;
12249                     case 'f':
12250                         ender = '\f';
12251                         p++;
12252                         break;
12253                     case 'e':
12254                         ender = ESC_NATIVE;
12255                         p++;
12256                         break;
12257                     case 'a':
12258                         ender = '\a';
12259                         p++;
12260                         break;
12261                     case 'o':
12262                         {
12263                             UV result;
12264                             const char* error_msg;
12265
12266                             bool valid = grok_bslash_o(&p,
12267                                                        &result,
12268                                                        &error_msg,
12269                                                        PASS2, /* out warnings */
12270                                                        FALSE, /* not strict */
12271                                                        TRUE, /* Output warnings
12272                                                                 for non-
12273                                                                 portables */
12274                                                        UTF);
12275                             if (! valid) {
12276                                 RExC_parse = p; /* going to die anyway; point
12277                                                    to exact spot of failure */
12278                                 vFAIL(error_msg);
12279                             }
12280                             ender = result;
12281                             if (IN_ENCODING && ender < 0x100) {
12282                                 goto recode_encoding;
12283                             }
12284                             if (ender > 0xff) {
12285                                 REQUIRE_UTF8;
12286                             }
12287                             break;
12288                         }
12289                     case 'x':
12290                         {
12291                             UV result = UV_MAX; /* initialize to erroneous
12292                                                    value */
12293                             const char* error_msg;
12294
12295                             bool valid = grok_bslash_x(&p,
12296                                                        &result,
12297                                                        &error_msg,
12298                                                        PASS2, /* out warnings */
12299                                                        FALSE, /* not strict */
12300                                                        TRUE, /* Output warnings
12301                                                                 for non-
12302                                                                 portables */
12303                                                        UTF);
12304                             if (! valid) {
12305                                 RExC_parse = p; /* going to die anyway; point
12306                                                    to exact spot of failure */
12307                                 vFAIL(error_msg);
12308                             }
12309                             ender = result;
12310
12311                             if (IN_ENCODING && ender < 0x100) {
12312                                 goto recode_encoding;
12313                             }
12314                             if (ender > 0xff) {
12315                                 REQUIRE_UTF8;
12316                             }
12317                             break;
12318                         }
12319                     case 'c':
12320                         p++;
12321                         ender = grok_bslash_c(*p++, PASS2);
12322                         break;
12323                     case '8': case '9': /* must be a backreference */
12324                         --p;
12325                         goto loopdone;
12326                     case '1': case '2': case '3':case '4':
12327                     case '5': case '6': case '7':
12328                         /* When we parse backslash escapes there is ambiguity
12329                          * between backreferences and octal escapes. Any escape
12330                          * from \1 - \9 is a backreference, any multi-digit
12331                          * escape which does not start with 0 and which when
12332                          * evaluated as decimal could refer to an already
12333                          * parsed capture buffer is a backslash. Anything else
12334                          * is octal.
12335                          *
12336                          * Note this implies that \118 could be interpreted as
12337                          * 118 OR as "\11" . "8" depending on whether there
12338                          * were 118 capture buffers defined already in the
12339                          * pattern.  */
12340
12341                         /* NOTE, RExC_npar is 1 more than the actual number of
12342                          * parens we have seen so far, hence the < RExC_npar below. */
12343
12344                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12345                         {  /* Not to be treated as an octal constant, go
12346                                    find backref */
12347                             --p;
12348                             goto loopdone;
12349                         }
12350                         /* FALLTHROUGH */
12351                     case '0':
12352                         {
12353                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12354                             STRLEN numlen = 3;
12355                             ender = grok_oct(p, &numlen, &flags, NULL);
12356                             if (ender > 0xff) {
12357                                 REQUIRE_UTF8;
12358                             }
12359                             p += numlen;
12360                             if (PASS2   /* like \08, \178 */
12361                                 && numlen < 3
12362                                 && p < RExC_end
12363                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12364                             {
12365                                 reg_warn_non_literal_string(
12366                                          p + 1,
12367                                          form_short_octal_warning(p, numlen));
12368                             }
12369                         }
12370                         if (IN_ENCODING && ender < 0x100)
12371                             goto recode_encoding;
12372                         break;
12373                     recode_encoding:
12374                         if (! RExC_override_recoding) {
12375                             SV* enc = _get_encoding();
12376                             ender = reg_recode((const char)(U8)ender, &enc);
12377                             if (!enc && PASS2)
12378                                 ckWARNreg(p, "Invalid escape in the specified encoding");
12379                             REQUIRE_UTF8;
12380                         }
12381                         break;
12382                     case '\0':
12383                         if (p >= RExC_end)
12384                             FAIL("Trailing \\");
12385                         /* FALLTHROUGH */
12386                     default:
12387                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12388                             /* Include any { following the alpha to emphasize
12389                              * that it could be part of an escape at some point
12390                              * in the future */
12391                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12392                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12393                         }
12394                         goto normal_default;
12395                     } /* End of switch on '\' */
12396                     break;
12397                 case '{':
12398                     /* Currently we don't warn when the lbrace is at the start
12399                      * of a construct.  This catches it in the middle of a
12400                      * literal string, or when its the first thing after
12401                      * something like "\b" */
12402                     if (! SIZE_ONLY
12403                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12404                     {
12405                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12406                     }
12407                     /*FALLTHROUGH*/
12408                 default:    /* A literal character */
12409                   normal_default:
12410                     if (UTF8_IS_START(*p) && UTF) {
12411                         STRLEN numlen;
12412                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12413                                                &numlen, UTF8_ALLOW_DEFAULT);
12414                         p += numlen;
12415                     }
12416                     else
12417                         ender = (U8) *p++;
12418                     break;
12419                 } /* End of switch on the literal */
12420
12421                 /* Here, have looked at the literal character and <ender>
12422                  * contains its ordinal, <p> points to the character after it
12423                  */
12424
12425                 if ( RExC_flags & RXf_PMf_EXTENDED)
12426                     p = regpatws(pRExC_state, p,
12427                                           TRUE); /* means recognize comments */
12428
12429                 /* If the next thing is a quantifier, it applies to this
12430                  * character only, which means that this character has to be in
12431                  * its own node and can't just be appended to the string in an
12432                  * existing node, so if there are already other characters in
12433                  * the node, close the node with just them, and set up to do
12434                  * this character again next time through, when it will be the
12435                  * only thing in its new node */
12436                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12437                 {
12438                     p = oldp;
12439                     goto loopdone;
12440                 }
12441
12442                 if (! FOLD   /* The simple case, just append the literal */
12443                     || (LOC  /* Also don't fold for tricky chars under /l */
12444                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12445                 {
12446                     if (UTF) {
12447                         const STRLEN unilen = reguni(pRExC_state, ender, s);
12448                         if (unilen > 0) {
12449                            s   += unilen;
12450                            len += unilen;
12451                         }
12452
12453                         /* The loop increments <len> each time, as all but this
12454                          * path (and one other) through it add a single byte to
12455                          * the EXACTish node.  But this one has changed len to
12456                          * be the correct final value, so subtract one to
12457                          * cancel out the increment that follows */
12458                         len--;
12459                     }
12460                     else {
12461                         REGC((char)ender, s++);
12462                     }
12463
12464                     /* Can get here if folding only if is one of the /l
12465                      * characters whose fold depends on the locale.  The
12466                      * occurrence of any of these indicate that we can't
12467                      * simplify things */
12468                     if (FOLD) {
12469                         maybe_exact = FALSE;
12470                         maybe_exactfu = FALSE;
12471                     }
12472                 }
12473                 else             /* FOLD */
12474                      if (! ( UTF
12475                         /* See comments for join_exact() as to why we fold this
12476                          * non-UTF at compile time */
12477                         || (node_type == EXACTFU
12478                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12479                 {
12480                     /* Here, are folding and are not UTF-8 encoded; therefore
12481                      * the character must be in the range 0-255, and is not /l
12482                      * (Not /l because we already handled these under /l in
12483                      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12484                     if (IS_IN_SOME_FOLD_L1(ender)) {
12485                         maybe_exact = FALSE;
12486
12487                         /* See if the character's fold differs between /d and
12488                          * /u.  This includes the multi-char fold SHARP S to
12489                          * 'ss' */
12490                         if (maybe_exactfu
12491                             && (PL_fold[ender] != PL_fold_latin1[ender]
12492                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12493                                 || (len > 0
12494                                    && isALPHA_FOLD_EQ(ender, 's')
12495                                    && isALPHA_FOLD_EQ(*(s-1), 's'))))
12496                         {
12497                             maybe_exactfu = FALSE;
12498                         }
12499                     }
12500
12501                     /* Even when folding, we store just the input character, as
12502                      * we have an array that finds its fold quickly */
12503                     *(s++) = (char) ender;
12504                 }
12505                 else {  /* FOLD and UTF */
12506                     /* Unlike the non-fold case, we do actually have to
12507                      * calculate the results here in pass 1.  This is for two
12508                      * reasons, the folded length may be longer than the
12509                      * unfolded, and we have to calculate how many EXACTish
12510                      * nodes it will take; and we may run out of room in a node
12511                      * in the middle of a potential multi-char fold, and have
12512                      * to back off accordingly.  (Hence we can't use REGC for
12513                      * the simple case just below.) */
12514
12515                     UV folded;
12516                     if (isASCII_uni(ender)) {
12517                         folded = toFOLD(ender);
12518                         *(s)++ = (U8) folded;
12519                     }
12520                     else {
12521                         STRLEN foldlen;
12522
12523                         folded = _to_uni_fold_flags(
12524                                      ender,
12525                                      (U8 *) s,
12526                                      &foldlen,
12527                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12528                                                         ? FOLD_FLAGS_NOMIX_ASCII
12529                                                         : 0));
12530                         s += foldlen;
12531
12532                         /* The loop increments <len> each time, as all but this
12533                          * path (and one other) through it add a single byte to
12534                          * the EXACTish node.  But this one has changed len to
12535                          * be the correct final value, so subtract one to
12536                          * cancel out the increment that follows */
12537                         len += foldlen - 1;
12538                     }
12539                     /* If this node only contains non-folding code points so
12540                      * far, see if this new one is also non-folding */
12541                     if (maybe_exact) {
12542                         if (folded != ender) {
12543                             maybe_exact = FALSE;
12544                         }
12545                         else {
12546                             /* Here the fold is the original; we have to check
12547                              * further to see if anything folds to it */
12548                             if (_invlist_contains_cp(PL_utf8_foldable,
12549                                                         ender))
12550                             {
12551                                 maybe_exact = FALSE;
12552                             }
12553                         }
12554                     }
12555                     ender = folded;
12556                 }
12557
12558                 if (next_is_quantifier) {
12559
12560                     /* Here, the next input is a quantifier, and to get here,
12561                      * the current character is the only one in the node.
12562                      * Also, here <len> doesn't include the final byte for this
12563                      * character */
12564                     len++;
12565                     goto loopdone;
12566                 }
12567
12568             } /* End of loop through literal characters */
12569
12570             /* Here we have either exhausted the input or ran out of room in
12571              * the node.  (If we encountered a character that can't be in the
12572              * node, transfer is made directly to <loopdone>, and so we
12573              * wouldn't have fallen off the end of the loop.)  In the latter
12574              * case, we artificially have to split the node into two, because
12575              * we just don't have enough space to hold everything.  This
12576              * creates a problem if the final character participates in a
12577              * multi-character fold in the non-final position, as a match that
12578              * should have occurred won't, due to the way nodes are matched,
12579              * and our artificial boundary.  So back off until we find a non-
12580              * problematic character -- one that isn't at the beginning or
12581              * middle of such a fold.  (Either it doesn't participate in any
12582              * folds, or appears only in the final position of all the folds it
12583              * does participate in.)  A better solution with far fewer false
12584              * positives, and that would fill the nodes more completely, would
12585              * be to actually have available all the multi-character folds to
12586              * test against, and to back-off only far enough to be sure that
12587              * this node isn't ending with a partial one.  <upper_parse> is set
12588              * further below (if we need to reparse the node) to include just
12589              * up through that final non-problematic character that this code
12590              * identifies, so when it is set to less than the full node, we can
12591              * skip the rest of this */
12592             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12593
12594                 const STRLEN full_len = len;
12595
12596                 assert(len >= MAX_NODE_STRING_SIZE);
12597
12598                 /* Here, <s> points to the final byte of the final character.
12599                  * Look backwards through the string until find a non-
12600                  * problematic character */
12601
12602                 if (! UTF) {
12603
12604                     /* This has no multi-char folds to non-UTF characters */
12605                     if (ASCII_FOLD_RESTRICTED) {
12606                         goto loopdone;
12607                     }
12608
12609                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12610                     len = s - s0 + 1;
12611                 }
12612                 else {
12613                     if (!  PL_NonL1NonFinalFold) {
12614                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12615                                         NonL1_Perl_Non_Final_Folds_invlist);
12616                     }
12617
12618                     /* Point to the first byte of the final character */
12619                     s = (char *) utf8_hop((U8 *) s, -1);
12620
12621                     while (s >= s0) {   /* Search backwards until find
12622                                            non-problematic char */
12623                         if (UTF8_IS_INVARIANT(*s)) {
12624
12625                             /* There are no ascii characters that participate
12626                              * in multi-char folds under /aa.  In EBCDIC, the
12627                              * non-ascii invariants are all control characters,
12628                              * so don't ever participate in any folds. */
12629                             if (ASCII_FOLD_RESTRICTED
12630                                 || ! IS_NON_FINAL_FOLD(*s))
12631                             {
12632                                 break;
12633                             }
12634                         }
12635                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12636                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12637                                                                   *s, *(s+1))))
12638                             {
12639                                 break;
12640                             }
12641                         }
12642                         else if (! _invlist_contains_cp(
12643                                         PL_NonL1NonFinalFold,
12644                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12645                         {
12646                             break;
12647                         }
12648
12649                         /* Here, the current character is problematic in that
12650                          * it does occur in the non-final position of some
12651                          * fold, so try the character before it, but have to
12652                          * special case the very first byte in the string, so
12653                          * we don't read outside the string */
12654                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12655                     } /* End of loop backwards through the string */
12656
12657                     /* If there were only problematic characters in the string,
12658                      * <s> will point to before s0, in which case the length
12659                      * should be 0, otherwise include the length of the
12660                      * non-problematic character just found */
12661                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12662                 }
12663
12664                 /* Here, have found the final character, if any, that is
12665                  * non-problematic as far as ending the node without splitting
12666                  * it across a potential multi-char fold.  <len> contains the
12667                  * number of bytes in the node up-to and including that
12668                  * character, or is 0 if there is no such character, meaning
12669                  * the whole node contains only problematic characters.  In
12670                  * this case, give up and just take the node as-is.  We can't
12671                  * do any better */
12672                 if (len == 0) {
12673                     len = full_len;
12674
12675                     /* If the node ends in an 's' we make sure it stays EXACTF,
12676                      * as if it turns into an EXACTFU, it could later get
12677                      * joined with another 's' that would then wrongly match
12678                      * the sharp s */
12679                     if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12680                     {
12681                         maybe_exactfu = FALSE;
12682                     }
12683                 } else {
12684
12685                     /* Here, the node does contain some characters that aren't
12686                      * problematic.  If one such is the final character in the
12687                      * node, we are done */
12688                     if (len == full_len) {
12689                         goto loopdone;
12690                     }
12691                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12692
12693                         /* If the final character is problematic, but the
12694                          * penultimate is not, back-off that last character to
12695                          * later start a new node with it */
12696                         p = oldp;
12697                         goto loopdone;
12698                     }
12699
12700                     /* Here, the final non-problematic character is earlier
12701                      * in the input than the penultimate character.  What we do
12702                      * is reparse from the beginning, going up only as far as
12703                      * this final ok one, thus guaranteeing that the node ends
12704                      * in an acceptable character.  The reason we reparse is
12705                      * that we know how far in the character is, but we don't
12706                      * know how to correlate its position with the input parse.
12707                      * An alternate implementation would be to build that
12708                      * correlation as we go along during the original parse,
12709                      * but that would entail extra work for every node, whereas
12710                      * this code gets executed only when the string is too
12711                      * large for the node, and the final two characters are
12712                      * problematic, an infrequent occurrence.  Yet another
12713                      * possible strategy would be to save the tail of the
12714                      * string, and the next time regatom is called, initialize
12715                      * with that.  The problem with this is that unless you
12716                      * back off one more character, you won't be guaranteed
12717                      * regatom will get called again, unless regbranch,
12718                      * regpiece ... are also changed.  If you do back off that
12719                      * extra character, so that there is input guaranteed to
12720                      * force calling regatom, you can't handle the case where
12721                      * just the first character in the node is acceptable.  I
12722                      * (khw) decided to try this method which doesn't have that
12723                      * pitfall; if performance issues are found, we can do a
12724                      * combination of the current approach plus that one */
12725                     upper_parse = len;
12726                     len = 0;
12727                     s = s0;
12728                     goto reparse;
12729                 }
12730             }   /* End of verifying node ends with an appropriate char */
12731
12732         loopdone:   /* Jumped to when encounters something that shouldn't be in
12733                        the node */
12734
12735             /* I (khw) don't know if you can get here with zero length, but the
12736              * old code handled this situation by creating a zero-length EXACT
12737              * node.  Might as well be NOTHING instead */
12738             if (len == 0) {
12739                 OP(ret) = NOTHING;
12740             }
12741             else {
12742                 if (FOLD) {
12743                     /* If 'maybe_exact' is still set here, means there are no
12744                      * code points in the node that participate in folds;
12745                      * similarly for 'maybe_exactfu' and code points that match
12746                      * differently depending on UTF8ness of the target string
12747                      * (for /u), or depending on locale for /l */
12748                     if (maybe_exact) {
12749                         OP(ret) = (LOC)
12750                                   ? EXACTL
12751                                   : EXACT;
12752                     }
12753                     else if (maybe_exactfu) {
12754                         OP(ret) = (LOC)
12755                                   ? EXACTFLU8
12756                                   : EXACTFU;
12757                     }
12758                 }
12759                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12760                                            FALSE /* Don't look to see if could
12761                                                     be turned into an EXACT
12762                                                     node, as we have already
12763                                                     computed that */
12764                                           );
12765             }
12766
12767             RExC_parse = p - 1;
12768             Set_Node_Cur_Length(ret, parse_start);
12769             nextchar(pRExC_state);
12770             {
12771                 /* len is STRLEN which is unsigned, need to copy to signed */
12772                 IV iv = len;
12773                 if (iv < 0)
12774                     vFAIL("Internal disaster");
12775             }
12776
12777         } /* End of label 'defchar:' */
12778         break;
12779     } /* End of giant switch on input character */
12780
12781     return(ret);
12782 }
12783
12784 STATIC char *
12785 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12786 {
12787     /* Returns the next non-pattern-white space, non-comment character (the
12788      * latter only if 'recognize_comment is true) in the string p, which is
12789      * ended by RExC_end.  See also reg_skipcomment */
12790     const char *e = RExC_end;
12791
12792     PERL_ARGS_ASSERT_REGPATWS;
12793
12794     while (p < e) {
12795         STRLEN len;
12796         if ((len = is_PATWS_safe(p, e, UTF))) {
12797             p += len;
12798         }
12799         else if (recognize_comment && *p == '#') {
12800             p = reg_skipcomment(pRExC_state, p);
12801         }
12802         else
12803             break;
12804     }
12805     return p;
12806 }
12807
12808 STATIC void
12809 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12810 {
12811     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12812      * sets up the bitmap and any flags, removing those code points from the
12813      * inversion list, setting it to NULL should it become completely empty */
12814
12815     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12816     assert(PL_regkind[OP(node)] == ANYOF);
12817
12818     ANYOF_BITMAP_ZERO(node);
12819     if (*invlist_ptr) {
12820
12821         /* This gets set if we actually need to modify things */
12822         bool change_invlist = FALSE;
12823
12824         UV start, end;
12825
12826         /* Start looking through *invlist_ptr */
12827         invlist_iterinit(*invlist_ptr);
12828         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12829             UV high;
12830             int i;
12831
12832             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12833                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12834             }
12835             else if (end >= NUM_ANYOF_CODE_POINTS) {
12836                 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12837             }
12838
12839             /* Quit if are above what we should change */
12840             if (start >= NUM_ANYOF_CODE_POINTS) {
12841                 break;
12842             }
12843
12844             change_invlist = TRUE;
12845
12846             /* Set all the bits in the range, up to the max that we are doing */
12847             high = (end < NUM_ANYOF_CODE_POINTS - 1)
12848                    ? end
12849                    : NUM_ANYOF_CODE_POINTS - 1;
12850             for (i = start; i <= (int) high; i++) {
12851                 if (! ANYOF_BITMAP_TEST(node, i)) {
12852                     ANYOF_BITMAP_SET(node, i);
12853                 }
12854             }
12855         }
12856         invlist_iterfinish(*invlist_ptr);
12857
12858         /* Done with loop; remove any code points that are in the bitmap from
12859          * *invlist_ptr; similarly for code points above the bitmap if we have
12860          * a flag to match all of them anyways */
12861         if (change_invlist) {
12862             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12863         }
12864         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12865             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12866         }
12867
12868         /* If have completely emptied it, remove it completely */
12869         if (_invlist_len(*invlist_ptr) == 0) {
12870             SvREFCNT_dec_NN(*invlist_ptr);
12871             *invlist_ptr = NULL;
12872         }
12873     }
12874 }
12875
12876 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12877    Character classes ([:foo:]) can also be negated ([:^foo:]).
12878    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12879    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12880    but trigger failures because they are currently unimplemented. */
12881
12882 #define POSIXCC_DONE(c)   ((c) == ':')
12883 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12884 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12885
12886 PERL_STATIC_INLINE I32
12887 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12888 {
12889     I32 namedclass = OOB_NAMEDCLASS;
12890
12891     PERL_ARGS_ASSERT_REGPPOSIXCC;
12892
12893     if (value == '[' && RExC_parse + 1 < RExC_end &&
12894         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12895         POSIXCC(UCHARAT(RExC_parse)))
12896     {
12897         const char c = UCHARAT(RExC_parse);
12898         char* const s = RExC_parse++;
12899
12900         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12901             RExC_parse++;
12902         if (RExC_parse == RExC_end) {
12903             if (strict) {
12904
12905                 /* Try to give a better location for the error (than the end of
12906                  * the string) by looking for the matching ']' */
12907                 RExC_parse = s;
12908                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12909                     RExC_parse++;
12910                 }
12911                 vFAIL2("Unmatched '%c' in POSIX class", c);
12912             }
12913             /* Grandfather lone [:, [=, [. */
12914             RExC_parse = s;
12915         }
12916         else {
12917             const char* const t = RExC_parse++; /* skip over the c */
12918             assert(*t == c);
12919
12920             if (UCHARAT(RExC_parse) == ']') {
12921                 const char *posixcc = s + 1;
12922                 RExC_parse++; /* skip over the ending ] */
12923
12924                 if (*s == ':') {
12925                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12926                     const I32 skip = t - posixcc;
12927
12928                     /* Initially switch on the length of the name.  */
12929                     switch (skip) {
12930                     case 4:
12931                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12932                                                           this is the Perl \w
12933                                                         */
12934                             namedclass = ANYOF_WORDCHAR;
12935                         break;
12936                     case 5:
12937                         /* Names all of length 5.  */
12938                         /* alnum alpha ascii blank cntrl digit graph lower
12939                            print punct space upper  */
12940                         /* Offset 4 gives the best switch position.  */
12941                         switch (posixcc[4]) {
12942                         case 'a':
12943                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12944                                 namedclass = ANYOF_ALPHA;
12945                             break;
12946                         case 'e':
12947                             if (memEQ(posixcc, "spac", 4)) /* space */
12948                                 namedclass = ANYOF_PSXSPC;
12949                             break;
12950                         case 'h':
12951                             if (memEQ(posixcc, "grap", 4)) /* graph */
12952                                 namedclass = ANYOF_GRAPH;
12953                             break;
12954                         case 'i':
12955                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12956                                 namedclass = ANYOF_ASCII;
12957                             break;
12958                         case 'k':
12959                             if (memEQ(posixcc, "blan", 4)) /* blank */
12960                                 namedclass = ANYOF_BLANK;
12961                             break;
12962                         case 'l':
12963                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12964                                 namedclass = ANYOF_CNTRL;
12965                             break;
12966                         case 'm':
12967                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12968                                 namedclass = ANYOF_ALPHANUMERIC;
12969                             break;
12970                         case 'r':
12971                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12972                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12973                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12974                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12975                             break;
12976                         case 't':
12977                             if (memEQ(posixcc, "digi", 4)) /* digit */
12978                                 namedclass = ANYOF_DIGIT;
12979                             else if (memEQ(posixcc, "prin", 4)) /* print */
12980                                 namedclass = ANYOF_PRINT;
12981                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12982                                 namedclass = ANYOF_PUNCT;
12983                             break;
12984                         }
12985                         break;
12986                     case 6:
12987                         if (memEQ(posixcc, "xdigit", 6))
12988                             namedclass = ANYOF_XDIGIT;
12989                         break;
12990                     }
12991
12992                     if (namedclass == OOB_NAMEDCLASS)
12993                         vFAIL2utf8f(
12994                             "POSIX class [:%"UTF8f":] unknown",
12995                             UTF8fARG(UTF, t - s - 1, s + 1));
12996
12997                     /* The #defines are structured so each complement is +1 to
12998                      * the normal one */
12999                     if (complement) {
13000                         namedclass++;
13001                     }
13002                     assert (posixcc[skip] == ':');
13003                     assert (posixcc[skip+1] == ']');
13004                 } else if (!SIZE_ONLY) {
13005                     /* [[=foo=]] and [[.foo.]] are still future. */
13006
13007                     /* adjust RExC_parse so the warning shows after
13008                        the class closes */
13009                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13010                         RExC_parse++;
13011                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13012                 }
13013             } else {
13014                 /* Maternal grandfather:
13015                  * "[:" ending in ":" but not in ":]" */
13016                 if (strict) {
13017                     vFAIL("Unmatched '[' in POSIX class");
13018                 }
13019
13020                 /* Grandfather lone [:, [=, [. */
13021                 RExC_parse = s;
13022             }
13023         }
13024     }
13025
13026     return namedclass;
13027 }
13028
13029 STATIC bool
13030 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13031 {
13032     /* This applies some heuristics at the current parse position (which should
13033      * be at a '[') to see if what follows might be intended to be a [:posix:]
13034      * class.  It returns true if it really is a posix class, of course, but it
13035      * also can return true if it thinks that what was intended was a posix
13036      * class that didn't quite make it.
13037      *
13038      * It will return true for
13039      *      [:alphanumerics:
13040      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
13041      *                         ')' indicating the end of the (?[
13042      *      [:any garbage including %^&$ punctuation:]
13043      *
13044      * This is designed to be called only from S_handle_regex_sets; it could be
13045      * easily adapted to be called from the spot at the beginning of regclass()
13046      * that checks to see in a normal bracketed class if the surrounding []
13047      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
13048      * change long-standing behavior, so I (khw) didn't do that */
13049     char* p = RExC_parse + 1;
13050     char first_char = *p;
13051
13052     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13053
13054     assert(*(p - 1) == '[');
13055
13056     if (! POSIXCC(first_char)) {
13057         return FALSE;
13058     }
13059
13060     p++;
13061     while (p < RExC_end && isWORDCHAR(*p)) p++;
13062
13063     if (p >= RExC_end) {
13064         return FALSE;
13065     }
13066
13067     if (p - RExC_parse > 2    /* Got at least 1 word character */
13068         && (*p == first_char
13069             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13070     {
13071         return TRUE;
13072     }
13073
13074     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13075
13076     return (p
13077             && p - RExC_parse > 2 /* [:] evaluates to colon;
13078                                       [::] is a bad posix class. */
13079             && first_char == *(p - 1));
13080 }
13081
13082 STATIC regnode *
13083 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13084                     I32 *flagp, U32 depth,
13085                     char * const oregcomp_parse)
13086 {
13087     /* Handle the (?[...]) construct to do set operations */
13088
13089     U8 curchar;
13090     UV start, end;      /* End points of code point ranges */
13091     SV* result_string;
13092     char *save_end, *save_parse;
13093     SV* final;
13094     STRLEN len;
13095     regnode* node;
13096     AV* stack;
13097     const bool save_fold = FOLD;
13098
13099     GET_RE_DEBUG_FLAGS_DECL;
13100
13101     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13102
13103     if (LOC) {
13104         vFAIL("(?[...]) not valid in locale");
13105     }
13106     RExC_uni_semantics = 1;
13107
13108     /* This will return only an ANYOF regnode, or (unlikely) something smaller
13109      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
13110      * call regclass to handle '[]' so as to not have to reinvent its parsing
13111      * rules here (throwing away the size it computes each time).  And, we exit
13112      * upon an unescaped ']' that isn't one ending a regclass.  To do both
13113      * these things, we need to realize that something preceded by a backslash
13114      * is escaped, so we have to keep track of backslashes */
13115     if (PASS2) {
13116         Perl_ck_warner_d(aTHX_
13117             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13118             "The regex_sets feature is experimental" REPORT_LOCATION,
13119                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13120                 UTF8fARG(UTF,
13121                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13122                          RExC_precomp + (RExC_parse - RExC_precomp)));
13123     }
13124     else {
13125         UV depth = 0; /* how many nested (?[...]) constructs */
13126
13127         while (RExC_parse < RExC_end) {
13128             SV* current = NULL;
13129             RExC_parse = regpatws(pRExC_state, RExC_parse,
13130                                           TRUE); /* means recognize comments */
13131             switch (*RExC_parse) {
13132                 case '?':
13133                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
13134                     /* FALLTHROUGH */
13135                 default:
13136                     break;
13137                 case '\\':
13138                     /* Skip the next byte (which could cause us to end up in
13139                      * the middle of a UTF-8 character, but since none of those
13140                      * are confusable with anything we currently handle in this
13141                      * switch (invariants all), it's safe.  We'll just hit the
13142                      * default: case next time and keep on incrementing until
13143                      * we find one of the invariants we do handle. */
13144                     RExC_parse++;
13145                     break;
13146                 case '[':
13147                 {
13148                     /* If this looks like it is a [:posix:] class, leave the
13149                      * parse pointer at the '[' to fool regclass() into
13150                      * thinking it is part of a '[[:posix:]]'.  That function
13151                      * will use strict checking to force a syntax error if it
13152                      * doesn't work out to a legitimate class */
13153                     bool is_posix_class
13154                                     = could_it_be_a_POSIX_class(pRExC_state);
13155                     if (! is_posix_class) {
13156                         RExC_parse++;
13157                     }
13158
13159                     /* regclass() can only return RESTART_UTF8 if multi-char
13160                        folds are allowed.  */
13161                     if (!regclass(pRExC_state, flagp,depth+1,
13162                                   is_posix_class, /* parse the whole char
13163                                                      class only if not a
13164                                                      posix class */
13165                                   FALSE, /* don't allow multi-char folds */
13166                                   TRUE, /* silence non-portable warnings. */
13167                                   &current))
13168                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13169                               (UV) *flagp);
13170
13171                     /* function call leaves parse pointing to the ']', except
13172                      * if we faked it */
13173                     if (is_posix_class) {
13174                         RExC_parse--;
13175                     }
13176
13177                     SvREFCNT_dec(current);   /* In case it returned something */
13178                     break;
13179                 }
13180
13181                 case ']':
13182                     if (depth--) break;
13183                     RExC_parse++;
13184                     if (RExC_parse < RExC_end
13185                         && *RExC_parse == ')')
13186                     {
13187                         node = reganode(pRExC_state, ANYOF, 0);
13188                         RExC_size += ANYOF_SKIP;
13189                         nextchar(pRExC_state);
13190                         Set_Node_Length(node,
13191                                 RExC_parse - oregcomp_parse + 1); /* MJD */
13192                         return node;
13193                     }
13194                     goto no_close;
13195             }
13196             RExC_parse++;
13197         }
13198
13199         no_close:
13200         FAIL("Syntax error in (?[...])");
13201     }
13202
13203     /* Pass 2 only after this.  Everything in this construct is a
13204      * metacharacter.  Operands begin with either a '\' (for an escape
13205      * sequence), or a '[' for a bracketed character class.  Any other
13206      * character should be an operator, or parenthesis for grouping.  Both
13207      * types of operands are handled by calling regclass() to parse them.  It
13208      * is called with a parameter to indicate to return the computed inversion
13209      * list.  The parsing here is implemented via a stack.  Each entry on the
13210      * stack is a single character representing one of the operators, or the
13211      * '('; or else a pointer to an operand inversion list. */
13212
13213 #define IS_OPERAND(a)  (! SvIOK(a))
13214
13215     /* The stack starts empty.  It is a syntax error if the first thing parsed
13216      * is a binary operator; everything else is pushed on the stack.  When an
13217      * operand is parsed, the top of the stack is examined.  If it is a binary
13218      * operator, the item before it should be an operand, and both are replaced
13219      * by the result of doing that operation on the new operand and the one on
13220      * the stack.   Thus a sequence of binary operands is reduced to a single
13221      * one before the next one is parsed.
13222      *
13223      * A unary operator may immediately follow a binary in the input, for
13224      * example
13225      *      [a] + ! [b]
13226      * When an operand is parsed and the top of the stack is a unary operator,
13227      * the operation is performed, and then the stack is rechecked to see if
13228      * this new operand is part of a binary operation; if so, it is handled as
13229      * above.
13230      *
13231      * A '(' is simply pushed on the stack; it is valid only if the stack is
13232      * empty, or the top element of the stack is an operator or another '('
13233      * (for which the parenthesized expression will become an operand).  By the
13234      * time the corresponding ')' is parsed everything in between should have
13235      * been parsed and evaluated to a single operand (or else is a syntax
13236      * error), and is handled as a regular operand */
13237
13238     sv_2mortal((SV *)(stack = newAV()));
13239
13240     while (RExC_parse < RExC_end) {
13241         I32 top_index = av_tindex(stack);
13242         SV** top_ptr;
13243         SV* current = NULL;
13244
13245         /* Skip white space */
13246         RExC_parse = regpatws(pRExC_state, RExC_parse,
13247                                          TRUE /* means recognize comments */ );
13248         if (RExC_parse >= RExC_end) {
13249             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13250         }
13251         if ((curchar = UCHARAT(RExC_parse)) == ']') {
13252             break;
13253         }
13254
13255         switch (curchar) {
13256
13257             case '?':
13258                 if (av_tindex(stack) >= 0   /* This makes sure that we can
13259                                                safely subtract 1 from
13260                                                RExC_parse in the next clause.
13261                                                If we have something on the
13262                                                stack, we have parsed something
13263                                              */
13264                     && UCHARAT(RExC_parse - 1) == '('
13265                     && RExC_parse < RExC_end)
13266                 {
13267                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13268                      * This happens when we have some thing like
13269                      *
13270                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13271                      *   ...
13272                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
13273                      *
13274                      * Here we would be handling the interpolated
13275                      * '$thai_or_lao'.  We handle this by a recursive call to
13276                      * ourselves which returns the inversion list the
13277                      * interpolated expression evaluates to.  We use the flags
13278                      * from the interpolated pattern. */
13279                     U32 save_flags = RExC_flags;
13280                     const char * const save_parse = ++RExC_parse;
13281
13282                     parse_lparen_question_flags(pRExC_state);
13283
13284                     if (RExC_parse == save_parse  /* Makes sure there was at
13285                                                      least one flag (or this
13286                                                      embedding wasn't compiled)
13287                                                    */
13288                         || RExC_parse >= RExC_end - 4
13289                         || UCHARAT(RExC_parse) != ':'
13290                         || UCHARAT(++RExC_parse) != '('
13291                         || UCHARAT(++RExC_parse) != '?'
13292                         || UCHARAT(++RExC_parse) != '[')
13293                     {
13294
13295                         /* In combination with the above, this moves the
13296                          * pointer to the point just after the first erroneous
13297                          * character (or if there are no flags, to where they
13298                          * should have been) */
13299                         if (RExC_parse >= RExC_end - 4) {
13300                             RExC_parse = RExC_end;
13301                         }
13302                         else if (RExC_parse != save_parse) {
13303                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13304                         }
13305                         vFAIL("Expecting '(?flags:(?[...'");
13306                     }
13307                     RExC_parse++;
13308                     (void) handle_regex_sets(pRExC_state, &current, flagp,
13309                                                     depth+1, oregcomp_parse);
13310
13311                     /* Here, 'current' contains the embedded expression's
13312                      * inversion list, and RExC_parse points to the trailing
13313                      * ']'; the next character should be the ')' which will be
13314                      * paired with the '(' that has been put on the stack, so
13315                      * the whole embedded expression reduces to '(operand)' */
13316                     RExC_parse++;
13317
13318                     RExC_flags = save_flags;
13319                     goto handle_operand;
13320                 }
13321                 /* FALLTHROUGH */
13322
13323             default:
13324                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13325                 vFAIL("Unexpected character");
13326
13327             case '\\':
13328                 /* regclass() can only return RESTART_UTF8 if multi-char
13329                    folds are allowed.  */
13330                 if (!regclass(pRExC_state, flagp,depth+1,
13331                               TRUE, /* means parse just the next thing */
13332                               FALSE, /* don't allow multi-char folds */
13333                               FALSE, /* don't silence non-portable warnings.  */
13334                               &current))
13335                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13336                           (UV) *flagp);
13337                 /* regclass() will return with parsing just the \ sequence,
13338                  * leaving the parse pointer at the next thing to parse */
13339                 RExC_parse--;
13340                 goto handle_operand;
13341
13342             case '[':   /* Is a bracketed character class */
13343             {
13344                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13345
13346                 if (! is_posix_class) {
13347                     RExC_parse++;
13348                 }
13349
13350                 /* regclass() can only return RESTART_UTF8 if multi-char
13351                    folds are allowed.  */
13352                 if(!regclass(pRExC_state, flagp,depth+1,
13353                              is_posix_class, /* parse the whole char class
13354                                                 only if not a posix class */
13355                              FALSE, /* don't allow multi-char folds */
13356                              FALSE, /* don't silence non-portable warnings.  */
13357                              &current))
13358                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13359                           (UV) *flagp);
13360                 /* function call leaves parse pointing to the ']', except if we
13361                  * faked it */
13362                 if (is_posix_class) {
13363                     RExC_parse--;
13364                 }
13365
13366                 goto handle_operand;
13367             }
13368
13369             case '&':
13370             case '|':
13371             case '+':
13372             case '-':
13373             case '^':
13374                 if (top_index < 0
13375                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13376                     || ! IS_OPERAND(*top_ptr))
13377                 {
13378                     RExC_parse++;
13379                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13380                 }
13381                 av_push(stack, newSVuv(curchar));
13382                 break;
13383
13384             case '!':
13385                 av_push(stack, newSVuv(curchar));
13386                 break;
13387
13388             case '(':
13389                 if (top_index >= 0) {
13390                     top_ptr = av_fetch(stack, top_index, FALSE);
13391                     assert(top_ptr);
13392                     if (IS_OPERAND(*top_ptr)) {
13393                         RExC_parse++;
13394                         vFAIL("Unexpected '(' with no preceding operator");
13395                     }
13396                 }
13397                 av_push(stack, newSVuv(curchar));
13398                 break;
13399
13400             case ')':
13401             {
13402                 SV* lparen;
13403                 if (top_index < 1
13404                     || ! (current = av_pop(stack))
13405                     || ! IS_OPERAND(current)
13406                     || ! (lparen = av_pop(stack))
13407                     || IS_OPERAND(lparen)
13408                     || SvUV(lparen) != '(')
13409                 {
13410                     SvREFCNT_dec(current);
13411                     RExC_parse++;
13412                     vFAIL("Unexpected ')'");
13413                 }
13414                 top_index -= 2;
13415                 SvREFCNT_dec_NN(lparen);
13416
13417                 /* FALLTHROUGH */
13418             }
13419
13420               handle_operand:
13421
13422                 /* Here, we have an operand to process, in 'current' */
13423
13424                 if (top_index < 0) {    /* Just push if stack is empty */
13425                     av_push(stack, current);
13426                 }
13427                 else {
13428                     SV* top = av_pop(stack);
13429                     SV *prev = NULL;
13430                     char current_operator;
13431
13432                     if (IS_OPERAND(top)) {
13433                         SvREFCNT_dec_NN(top);
13434                         SvREFCNT_dec_NN(current);
13435                         vFAIL("Operand with no preceding operator");
13436                     }
13437                     current_operator = (char) SvUV(top);
13438                     switch (current_operator) {
13439                         case '(':   /* Push the '(' back on followed by the new
13440                                        operand */
13441                             av_push(stack, top);
13442                             av_push(stack, current);
13443                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13444                                                    just after the 'break', so
13445                                                    it doesn't get wrongly freed
13446                                                  */
13447                             break;
13448
13449                         case '!':
13450                             _invlist_invert(current);
13451
13452                             /* Unlike binary operators, the top of the stack,
13453                              * now that this unary one has been popped off, may
13454                              * legally be an operator, and we now have operand
13455                              * for it. */
13456                             top_index--;
13457                             SvREFCNT_dec_NN(top);
13458                             goto handle_operand;
13459
13460                         case '&':
13461                             prev = av_pop(stack);
13462                             _invlist_intersection(prev,
13463                                                    current,
13464                                                    &current);
13465                             av_push(stack, current);
13466                             break;
13467
13468                         case '|':
13469                         case '+':
13470                             prev = av_pop(stack);
13471                             _invlist_union(prev, current, &current);
13472                             av_push(stack, current);
13473                             break;
13474
13475                         case '-':
13476                             prev = av_pop(stack);;
13477                             _invlist_subtract(prev, current, &current);
13478                             av_push(stack, current);
13479                             break;
13480
13481                         case '^':   /* The union minus the intersection */
13482                         {
13483                             SV* i = NULL;
13484                             SV* u = NULL;
13485                             SV* element;
13486
13487                             prev = av_pop(stack);
13488                             _invlist_union(prev, current, &u);
13489                             _invlist_intersection(prev, current, &i);
13490                             /* _invlist_subtract will overwrite current
13491                                 without freeing what it already contains */
13492                             element = current;
13493                             _invlist_subtract(u, i, &current);
13494                             av_push(stack, current);
13495                             SvREFCNT_dec_NN(i);
13496                             SvREFCNT_dec_NN(u);
13497                             SvREFCNT_dec_NN(element);
13498                             break;
13499                         }
13500
13501                         default:
13502                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13503                 }
13504                 SvREFCNT_dec_NN(top);
13505                 SvREFCNT_dec(prev);
13506             }
13507         }
13508
13509         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13510     }
13511
13512     if (av_tindex(stack) < 0   /* Was empty */
13513         || ((final = av_pop(stack)) == NULL)
13514         || ! IS_OPERAND(final)
13515         || av_tindex(stack) >= 0)  /* More left on stack */
13516     {
13517         vFAIL("Incomplete expression within '(?[ ])'");
13518     }
13519
13520     /* Here, 'final' is the resultant inversion list from evaluating the
13521      * expression.  Return it if so requested */
13522     if (return_invlist) {
13523         *return_invlist = final;
13524         return END;
13525     }
13526
13527     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13528      * expecting a string of ranges and individual code points */
13529     invlist_iterinit(final);
13530     result_string = newSVpvs("");
13531     while (invlist_iternext(final, &start, &end)) {
13532         if (start == end) {
13533             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13534         }
13535         else {
13536             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13537                                                      start,          end);
13538         }
13539     }
13540
13541     save_parse = RExC_parse;
13542     RExC_parse = SvPV(result_string, len);
13543     save_end = RExC_end;
13544     RExC_end = RExC_parse + len;
13545
13546     /* We turn off folding around the call, as the class we have constructed
13547      * already has all folding taken into consideration, and we don't want
13548      * regclass() to add to that */
13549     RExC_flags &= ~RXf_PMf_FOLD;
13550     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13551      */
13552     node = regclass(pRExC_state, flagp,depth+1,
13553                     FALSE, /* means parse the whole char class */
13554                     FALSE, /* don't allow multi-char folds */
13555                     TRUE, /* silence non-portable warnings.  The above may very
13556                              well have generated non-portable code points, but
13557                              they're valid on this machine */
13558                     NULL);
13559     if (!node)
13560         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13561                     PTR2UV(flagp));
13562     if (save_fold) {
13563         RExC_flags |= RXf_PMf_FOLD;
13564     }
13565     RExC_parse = save_parse + 1;
13566     RExC_end = save_end;
13567     SvREFCNT_dec_NN(final);
13568     SvREFCNT_dec_NN(result_string);
13569
13570     nextchar(pRExC_state);
13571     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13572     return node;
13573 }
13574 #undef IS_OPERAND
13575
13576 STATIC void
13577 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13578 {
13579     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13580      * innocent-looking character class, like /[ks]/i won't have to go out to
13581      * disk to find the possible matches.
13582      *
13583      * This should be called only for a Latin1-range code points, cp, which is
13584      * known to be involved in a simple fold with other code points above
13585      * Latin1.  It would give false results if /aa has been specified.
13586      * Multi-char folds are outside the scope of this, and must be handled
13587      * specially.
13588      *
13589      * XXX It would be better to generate these via regen, in case a new
13590      * version of the Unicode standard adds new mappings, though that is not
13591      * really likely, and may be caught by the default: case of the switch
13592      * below. */
13593
13594     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13595
13596     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13597
13598     switch (cp) {
13599         case 'k':
13600         case 'K':
13601           *invlist =
13602              add_cp_to_invlist(*invlist, KELVIN_SIGN);
13603             break;
13604         case 's':
13605         case 'S':
13606           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13607             break;
13608         case MICRO_SIGN:
13609           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13610           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13611             break;
13612         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13613         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13614           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13615             break;
13616         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13617           *invlist = add_cp_to_invlist(*invlist,
13618                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13619             break;
13620         case LATIN_SMALL_LETTER_SHARP_S:
13621           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13622             break;
13623         default:
13624             /* Use deprecated warning to increase the chances of this being
13625              * output */
13626             if (PASS2) {
13627                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13628             }
13629             break;
13630     }
13631 }
13632
13633 STATIC AV *
13634 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13635 {
13636     /* This adds the string scalar <multi_string> to the array
13637      * <multi_char_matches>.  <multi_string> is known to have exactly
13638      * <cp_count> code points in it.  This is used when constructing a
13639      * bracketed character class and we find something that needs to match more
13640      * than a single character.
13641      *
13642      * <multi_char_matches> is actually an array of arrays.  Each top-level
13643      * element is an array that contains all the strings known so far that are
13644      * the same length.  And that length (in number of code points) is the same
13645      * as the index of the top-level array.  Hence, the [2] element is an
13646      * array, each element thereof is a string containing TWO code points;
13647      * while element [3] is for strings of THREE characters, and so on.  Since
13648      * this is for multi-char strings there can never be a [0] nor [1] element.
13649      *
13650      * When we rewrite the character class below, we will do so such that the
13651      * longest strings are written first, so that it prefers the longest
13652      * matching strings first.  This is done even if it turns out that any
13653      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
13654      * Christiansen has agreed that this is ok.  This makes the test for the
13655      * ligature 'ffi' come before the test for 'ff', for example */
13656
13657     AV* this_array;
13658     AV** this_array_ptr;
13659
13660     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13661
13662     if (! multi_char_matches) {
13663         multi_char_matches = newAV();
13664     }
13665
13666     if (av_exists(multi_char_matches, cp_count)) {
13667         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13668         this_array = *this_array_ptr;
13669     }
13670     else {
13671         this_array = newAV();
13672         av_store(multi_char_matches, cp_count,
13673                  (SV*) this_array);
13674     }
13675     av_push(this_array, multi_string);
13676
13677     return multi_char_matches;
13678 }
13679
13680 /* The names of properties whose definitions are not known at compile time are
13681  * stored in this SV, after a constant heading.  So if the length has been
13682  * changed since initialization, then there is a run-time definition. */
13683 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13684                                         (SvCUR(listsv) != initial_listsv_len)
13685
13686 STATIC regnode *
13687 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13688                  const bool stop_at_1,  /* Just parse the next thing, don't
13689                                            look for a full character class */
13690                  bool allow_multi_folds,
13691                  const bool silence_non_portable,   /* Don't output warnings
13692                                                        about too large
13693                                                        characters */
13694                  SV** ret_invlist)  /* Return an inversion list, not a node */
13695 {
13696     /* parse a bracketed class specification.  Most of these will produce an
13697      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13698      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13699      * under /i with multi-character folds: it will be rewritten following the
13700      * paradigm of this example, where the <multi-fold>s are characters which
13701      * fold to multiple character sequences:
13702      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13703      * gets effectively rewritten as:
13704      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13705      * reg() gets called (recursively) on the rewritten version, and this
13706      * function will return what it constructs.  (Actually the <multi-fold>s
13707      * aren't physically removed from the [abcdefghi], it's just that they are
13708      * ignored in the recursion by means of a flag:
13709      * <RExC_in_multi_char_class>.)
13710      *
13711      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13712      * characters, with the corresponding bit set if that character is in the
13713      * list.  For characters above this, a range list or swash is used.  There
13714      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13715      * determinable at compile time
13716      *
13717      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13718      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13719      */
13720
13721     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13722     IV range = 0;
13723     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13724     regnode *ret;
13725     STRLEN numlen;
13726     IV namedclass = OOB_NAMEDCLASS;
13727     char *rangebegin = NULL;
13728     bool need_class = 0;
13729     SV *listsv = NULL;
13730     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13731                                       than just initialized.  */
13732     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13733     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13734                                extended beyond the Latin1 range.  These have to
13735                                be kept separate from other code points for much
13736                                of this function because their handling  is
13737                                different under /i, and for most classes under
13738                                /d as well */
13739     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13740                                separate for a while from the non-complemented
13741                                versions because of complications with /d
13742                                matching */
13743     UV element_count = 0;   /* Number of distinct elements in the class.
13744                                Optimizations may be possible if this is tiny */
13745     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13746                                        character; used under /i */
13747     UV n;
13748     char * stop_ptr = RExC_end;    /* where to stop parsing */
13749     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13750                                                    space? */
13751     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13752
13753     /* Unicode properties are stored in a swash; this holds the current one
13754      * being parsed.  If this swash is the only above-latin1 component of the
13755      * character class, an optimization is to pass it directly on to the
13756      * execution engine.  Otherwise, it is set to NULL to indicate that there
13757      * are other things in the class that have to be dealt with at execution
13758      * time */
13759     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13760
13761     /* Set if a component of this character class is user-defined; just passed
13762      * on to the engine */
13763     bool has_user_defined_property = FALSE;
13764
13765     /* inversion list of code points this node matches only when the target
13766      * string is in UTF-8.  (Because is under /d) */
13767     SV* depends_list = NULL;
13768
13769     /* Inversion list of code points this node matches regardless of things
13770      * like locale, folding, utf8ness of the target string */
13771     SV* cp_list = NULL;
13772
13773     /* Like cp_list, but code points on this list need to be checked for things
13774      * that fold to/from them under /i */
13775     SV* cp_foldable_list = NULL;
13776
13777     /* Like cp_list, but code points on this list are valid only when the
13778      * runtime locale is UTF-8 */
13779     SV* only_utf8_locale_list = NULL;
13780
13781 #ifdef EBCDIC
13782     /* In a range, counts how many 0-2 of the ends of it came from literals,
13783      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13784     UV literal_endpoint = 0;
13785
13786     /* Is the range unicode? which means on a platform that isn't 1-1 native
13787      * to Unicode (i.e. non-ASCII), each code point in it should be considered
13788      * to be a Unicode value.  */
13789     bool unicode_range = FALSE;
13790 #endif
13791     bool invert = FALSE;    /* Is this class to be complemented */
13792
13793     bool warn_super = ALWAYS_WARN_SUPER;
13794
13795     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13796         case we need to change the emitted regop to an EXACT. */
13797     const char * orig_parse = RExC_parse;
13798     const SSize_t orig_size = RExC_size;
13799     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13800     GET_RE_DEBUG_FLAGS_DECL;
13801
13802     PERL_ARGS_ASSERT_REGCLASS;
13803 #ifndef DEBUGGING
13804     PERL_UNUSED_ARG(depth);
13805 #endif
13806
13807     DEBUG_PARSE("clas");
13808
13809     /* Assume we are going to generate an ANYOF node. */
13810     ret = reganode(pRExC_state,
13811                    (LOC)
13812                     ? ANYOFL
13813                     : ANYOF,
13814                    0);
13815
13816     if (SIZE_ONLY) {
13817         RExC_size += ANYOF_SKIP;
13818         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13819     }
13820     else {
13821         ANYOF_FLAGS(ret) = 0;
13822
13823         RExC_emit += ANYOF_SKIP;
13824         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13825         initial_listsv_len = SvCUR(listsv);
13826         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13827     }
13828
13829     if (skip_white) {
13830         RExC_parse = regpatws(pRExC_state, RExC_parse,
13831                               FALSE /* means don't recognize comments */ );
13832     }
13833
13834     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13835         RExC_parse++;
13836         invert = TRUE;
13837         allow_multi_folds = FALSE;
13838         MARK_NAUGHTY(1);
13839         if (skip_white) {
13840             RExC_parse = regpatws(pRExC_state, RExC_parse,
13841                                   FALSE /* means don't recognize comments */ );
13842         }
13843     }
13844
13845     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13846     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13847         const char *s = RExC_parse;
13848         const char  c = *s++;
13849
13850         while (isWORDCHAR(*s))
13851             s++;
13852         if (*s && c == *s && s[1] == ']') {
13853             SAVEFREESV(RExC_rx_sv);
13854             ckWARN3reg(s+2,
13855                        "POSIX syntax [%c %c] belongs inside character classes",
13856                        c, c);
13857             (void)ReREFCNT_inc(RExC_rx_sv);
13858         }
13859     }
13860
13861     /* If the caller wants us to just parse a single element, accomplish this
13862      * by faking the loop ending condition */
13863     if (stop_at_1 && RExC_end > RExC_parse) {
13864         stop_ptr = RExC_parse + 1;
13865     }
13866
13867     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13868     if (UCHARAT(RExC_parse) == ']')
13869         goto charclassloop;
13870
13871     while (1) {
13872         if  (RExC_parse >= stop_ptr) {
13873             break;
13874         }
13875
13876         if (skip_white) {
13877             RExC_parse = regpatws(pRExC_state, RExC_parse,
13878                                   FALSE /* means don't recognize comments */ );
13879         }
13880
13881         if  (UCHARAT(RExC_parse) == ']') {
13882             break;
13883         }
13884
13885     charclassloop:
13886
13887         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13888         save_value = value;
13889         save_prevvalue = prevvalue;
13890
13891         if (!range) {
13892             rangebegin = RExC_parse;
13893             element_count++;
13894 #ifdef EBCDIC
13895             literal_endpoint = 0;
13896 #endif
13897         }
13898         if (UTF) {
13899             value = utf8n_to_uvchr((U8*)RExC_parse,
13900                                    RExC_end - RExC_parse,
13901                                    &numlen, UTF8_ALLOW_DEFAULT);
13902             RExC_parse += numlen;
13903         }
13904         else
13905             value = UCHARAT(RExC_parse++);
13906
13907         if (value == '['
13908             && RExC_parse < RExC_end
13909             && POSIXCC(UCHARAT(RExC_parse)))
13910         {
13911             namedclass = regpposixcc(pRExC_state, value, strict);
13912         }
13913         else if (value != '\\') {
13914 #ifdef EBCDIC
13915             literal_endpoint++;
13916 #endif
13917         }
13918         else {
13919             /* Is a backslash; get the code point of the char after it */
13920             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
13921                 value = utf8n_to_uvchr((U8*)RExC_parse,
13922                                    RExC_end - RExC_parse,
13923                                    &numlen, UTF8_ALLOW_DEFAULT);
13924                 RExC_parse += numlen;
13925             }
13926             else
13927                 value = UCHARAT(RExC_parse++);
13928
13929             /* Some compilers cannot handle switching on 64-bit integer
13930              * values, therefore value cannot be an UV.  Yes, this will
13931              * be a problem later if we want switch on Unicode.
13932              * A similar issue a little bit later when switching on
13933              * namedclass. --jhi */
13934
13935             /* If the \ is escaping white space when white space is being
13936              * skipped, it means that that white space is wanted literally, and
13937              * is already in 'value'.  Otherwise, need to translate the escape
13938              * into what it signifies. */
13939             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13940
13941             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13942             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13943             case 's':   namedclass = ANYOF_SPACE;       break;
13944             case 'S':   namedclass = ANYOF_NSPACE;      break;
13945             case 'd':   namedclass = ANYOF_DIGIT;       break;
13946             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13947             case 'v':   namedclass = ANYOF_VERTWS;      break;
13948             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13949             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13950             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13951             case 'N':  /* Handle \N{NAME} in class */
13952                 {
13953                     SV *as_text;
13954                     STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13955                                                     flagp, depth, &as_text);
13956                     if (*flagp & RESTART_UTF8)
13957                         FAIL("panic: grok_bslash_N set RESTART_UTF8");
13958                     if (cp_count != 1) {    /* The typical case drops through */
13959                         assert(cp_count != (STRLEN) -1);
13960                         if (cp_count == 0) {
13961                             if (strict) {
13962                                 RExC_parse++;   /* Position after the "}" */
13963                                 vFAIL("Zero length \\N{}");
13964                             }
13965                             else if (PASS2) {
13966                                 ckWARNreg(RExC_parse,
13967                                         "Ignoring zero length \\N{} in character class");
13968                             }
13969                         }
13970                         else { /* cp_count > 1 */
13971                             if (! RExC_in_multi_char_class) {
13972                                 if (invert || range || *RExC_parse == '-') {
13973                                     if (strict) {
13974                                         RExC_parse--;
13975                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13976                                     }
13977                                     else if (PASS2) {
13978                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13979                                     }
13980                                 }
13981                                 else {
13982                                     multi_char_matches
13983                                         = add_multi_match(multi_char_matches,
13984                                                           as_text,
13985                                                           cp_count);
13986                                 }
13987                                 break; /* <value> contains the first code
13988                                           point. Drop out of the switch to
13989                                           process it */
13990                             }
13991                         } /* End of cp_count != 1 */
13992
13993                         /* This element should not be processed further in this
13994                          * class */
13995                         element_count--;
13996                         value = save_value;
13997                         prevvalue = save_prevvalue;
13998                         continue;   /* Back to top of loop to get next char */
13999                     }
14000                     /* Here, is a single code point, and <value> contains it */
14001 #ifdef EBCDIC
14002                     /* We consider named characters to be literal characters,
14003                      * and they are Unicode */
14004                     literal_endpoint++;
14005                     unicode_range = TRUE;
14006 #endif
14007                 }
14008                 break;
14009             case 'p':
14010             case 'P':
14011                 {
14012                 char *e;
14013
14014                 /* We will handle any undefined properties ourselves */
14015                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14016                                        /* And we actually would prefer to get
14017                                         * the straight inversion list of the
14018                                         * swash, since we will be accessing it
14019                                         * anyway, to save a little time */
14020                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14021
14022                 if (RExC_parse >= RExC_end)
14023                     vFAIL2("Empty \\%c{}", (U8)value);
14024                 if (*RExC_parse == '{') {
14025                     const U8 c = (U8)value;
14026                     e = strchr(RExC_parse++, '}');
14027                     if (!e)
14028                         vFAIL2("Missing right brace on \\%c{}", c);
14029                     while (isSPACE(*RExC_parse))
14030                         RExC_parse++;
14031                     if (e == RExC_parse)
14032                         vFAIL2("Empty \\%c{}", c);
14033                     n = e - RExC_parse;
14034                     while (isSPACE(*(RExC_parse + n - 1)))
14035                         n--;
14036                 }
14037                 else {
14038                     e = RExC_parse;
14039                     n = 1;
14040                 }
14041                 if (!SIZE_ONLY) {
14042                     SV* invlist;
14043                     char* name;
14044
14045                     if (UCHARAT(RExC_parse) == '^') {
14046                          RExC_parse++;
14047                          n--;
14048                          /* toggle.  (The rhs xor gets the single bit that
14049                           * differs between P and p; the other xor inverts just
14050                           * that bit) */
14051                          value ^= 'P' ^ 'p';
14052
14053                          while (isSPACE(*RExC_parse)) {
14054                               RExC_parse++;
14055                               n--;
14056                          }
14057                     }
14058                     /* Try to get the definition of the property into
14059                      * <invlist>.  If /i is in effect, the effective property
14060                      * will have its name be <__NAME_i>.  The design is
14061                      * discussed in commit
14062                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14063                     name = savepv(Perl_form(aTHX_
14064                                           "%s%.*s%s\n",
14065                                           (FOLD) ? "__" : "",
14066                                           (int)n,
14067                                           RExC_parse,
14068                                           (FOLD) ? "_i" : ""
14069                                 ));
14070
14071                     /* Look up the property name, and get its swash and
14072                      * inversion list, if the property is found  */
14073                     if (swash) {
14074                         SvREFCNT_dec_NN(swash);
14075                     }
14076                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
14077                                              1, /* binary */
14078                                              0, /* not tr/// */
14079                                              NULL, /* No inversion list */
14080                                              &swash_init_flags
14081                                             );
14082                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14083                         HV* curpkg = (IN_PERL_COMPILETIME)
14084                                       ? PL_curstash
14085                                       : CopSTASH(PL_curcop);
14086                         if (swash) {
14087                             SvREFCNT_dec_NN(swash);
14088                             swash = NULL;
14089                         }
14090
14091                         /* Here didn't find it.  It could be a user-defined
14092                          * property that will be available at run-time.  If we
14093                          * accept only compile-time properties, is an error;
14094                          * otherwise add it to the list for run-time look up */
14095                         if (ret_invlist) {
14096                             RExC_parse = e + 1;
14097                             vFAIL2utf8f(
14098                                 "Property '%"UTF8f"' is unknown",
14099                                 UTF8fARG(UTF, n, name));
14100                         }
14101
14102                         /* If the property name doesn't already have a package
14103                          * name, add the current one to it so that it can be
14104                          * referred to outside it. [perl #121777] */
14105                         if (curpkg && ! instr(name, "::")) {
14106                             char* pkgname = HvNAME(curpkg);
14107                             if (strNE(pkgname, "main")) {
14108                                 char* full_name = Perl_form(aTHX_
14109                                                             "%s::%s",
14110                                                             pkgname,
14111                                                             name);
14112                                 n = strlen(full_name);
14113                                 Safefree(name);
14114                                 name = savepvn(full_name, n);
14115                             }
14116                         }
14117                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14118                                         (value == 'p' ? '+' : '!'),
14119                                         UTF8fARG(UTF, n, name));
14120                         has_user_defined_property = TRUE;
14121
14122                         /* We don't know yet, so have to assume that the
14123                          * property could match something in the Latin1 range,
14124                          * hence something that isn't utf8.  Note that this
14125                          * would cause things in <depends_list> to match
14126                          * inappropriately, except that any \p{}, including
14127                          * this one forces Unicode semantics, which means there
14128                          * is no <depends_list> */
14129                         ANYOF_FLAGS(ret)
14130                                       |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14131                     }
14132                     else {
14133
14134                         /* Here, did get the swash and its inversion list.  If
14135                          * the swash is from a user-defined property, then this
14136                          * whole character class should be regarded as such */
14137                         if (swash_init_flags
14138                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14139                         {
14140                             has_user_defined_property = TRUE;
14141                         }
14142                         else if
14143                             /* We warn on matching an above-Unicode code point
14144                              * if the match would return true, except don't
14145                              * warn for \p{All}, which has exactly one element
14146                              * = 0 */
14147                             (_invlist_contains_cp(invlist, 0x110000)
14148                                 && (! (_invlist_len(invlist) == 1
14149                                        && *invlist_array(invlist) == 0)))
14150                         {
14151                             warn_super = TRUE;
14152                         }
14153
14154
14155                         /* Invert if asking for the complement */
14156                         if (value == 'P') {
14157                             _invlist_union_complement_2nd(properties,
14158                                                           invlist,
14159                                                           &properties);
14160
14161                             /* The swash can't be used as-is, because we've
14162                              * inverted things; delay removing it to here after
14163                              * have copied its invlist above */
14164                             SvREFCNT_dec_NN(swash);
14165                             swash = NULL;
14166                         }
14167                         else {
14168                             _invlist_union(properties, invlist, &properties);
14169                         }
14170                     }
14171                     Safefree(name);
14172                 }
14173                 RExC_parse = e + 1;
14174                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
14175                                                 named */
14176
14177                 /* \p means they want Unicode semantics */
14178                 RExC_uni_semantics = 1;
14179                 }
14180                 break;
14181             case 'n':   value = '\n';                   break;
14182             case 'r':   value = '\r';                   break;
14183             case 't':   value = '\t';                   break;
14184             case 'f':   value = '\f';                   break;
14185             case 'b':   value = '\b';                   break;
14186             case 'e':   value = ESC_NATIVE;             break;
14187             case 'a':   value = '\a';                   break;
14188             case 'o':
14189                 RExC_parse--;   /* function expects to be pointed at the 'o' */
14190                 {
14191                     const char* error_msg;
14192                     bool valid = grok_bslash_o(&RExC_parse,
14193                                                &value,
14194                                                &error_msg,
14195                                                PASS2,   /* warnings only in
14196                                                            pass 2 */
14197                                                strict,
14198                                                silence_non_portable,
14199                                                UTF);
14200                     if (! valid) {
14201                         vFAIL(error_msg);
14202                     }
14203                 }
14204                 if (IN_ENCODING && value < 0x100) {
14205                     goto recode_encoding;
14206                 }
14207                 break;
14208             case 'x':
14209                 RExC_parse--;   /* function expects to be pointed at the 'x' */
14210                 {
14211                     const char* error_msg;
14212                     bool valid = grok_bslash_x(&RExC_parse,
14213                                                &value,
14214                                                &error_msg,
14215                                                PASS2, /* Output warnings */
14216                                                strict,
14217                                                silence_non_portable,
14218                                                UTF);
14219                     if (! valid) {
14220                         vFAIL(error_msg);
14221                     }
14222                 }
14223                 if (IN_ENCODING && value < 0x100)
14224                     goto recode_encoding;
14225                 break;
14226             case 'c':
14227                 value = grok_bslash_c(*RExC_parse++, PASS2);
14228                 break;
14229             case '0': case '1': case '2': case '3': case '4':
14230             case '5': case '6': case '7':
14231                 {
14232                     /* Take 1-3 octal digits */
14233                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14234                     numlen = (strict) ? 4 : 3;
14235                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14236                     RExC_parse += numlen;
14237                     if (numlen != 3) {
14238                         if (strict) {
14239                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14240                             vFAIL("Need exactly 3 octal digits");
14241                         }
14242                         else if (! SIZE_ONLY /* like \08, \178 */
14243                                  && numlen < 3
14244                                  && RExC_parse < RExC_end
14245                                  && isDIGIT(*RExC_parse)
14246                                  && ckWARN(WARN_REGEXP))
14247                         {
14248                             SAVEFREESV(RExC_rx_sv);
14249                             reg_warn_non_literal_string(
14250                                  RExC_parse + 1,
14251                                  form_short_octal_warning(RExC_parse, numlen));
14252                             (void)ReREFCNT_inc(RExC_rx_sv);
14253                         }
14254                     }
14255                     if (IN_ENCODING && value < 0x100)
14256                         goto recode_encoding;
14257                     break;
14258                 }
14259             recode_encoding:
14260                 if (! RExC_override_recoding) {
14261                     SV* enc = _get_encoding();
14262                     value = reg_recode((const char)(U8)value, &enc);
14263                     if (!enc) {
14264                         if (strict) {
14265                             vFAIL("Invalid escape in the specified encoding");
14266                         }
14267                         else if (PASS2) {
14268                             ckWARNreg(RExC_parse,
14269                                   "Invalid escape in the specified encoding");
14270                         }
14271                     }
14272                     break;
14273                 }
14274             default:
14275                 /* Allow \_ to not give an error */
14276                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14277                     if (strict) {
14278                         vFAIL2("Unrecognized escape \\%c in character class",
14279                                (int)value);
14280                     }
14281                     else {
14282                         SAVEFREESV(RExC_rx_sv);
14283                         ckWARN2reg(RExC_parse,
14284                             "Unrecognized escape \\%c in character class passed through",
14285                             (int)value);
14286                         (void)ReREFCNT_inc(RExC_rx_sv);
14287                     }
14288                 }
14289                 break;
14290             }   /* End of switch on char following backslash */
14291         } /* end of handling backslash escape sequences */
14292
14293         /* Here, we have the current token in 'value' */
14294
14295         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14296             U8 classnum;
14297
14298             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
14299              * literal, as is the character that began the false range, i.e.
14300              * the 'a' in the examples */
14301             if (range) {
14302                 if (!SIZE_ONLY) {
14303                     const int w = (RExC_parse >= rangebegin)
14304                                   ? RExC_parse - rangebegin
14305                                   : 0;
14306                     if (strict) {
14307                         vFAIL2utf8f(
14308                             "False [] range \"%"UTF8f"\"",
14309                             UTF8fARG(UTF, w, rangebegin));
14310                     }
14311                     else {
14312                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14313                         ckWARN2reg(RExC_parse,
14314                             "False [] range \"%"UTF8f"\"",
14315                             UTF8fARG(UTF, w, rangebegin));
14316                         (void)ReREFCNT_inc(RExC_rx_sv);
14317                         cp_list = add_cp_to_invlist(cp_list, '-');
14318                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14319                                                              prevvalue);
14320                     }
14321                 }
14322
14323                 range = 0; /* this was not a true range */
14324                 element_count += 2; /* So counts for three values */
14325             }
14326
14327             classnum = namedclass_to_classnum(namedclass);
14328
14329             if (LOC && namedclass < ANYOF_POSIXL_MAX
14330 #ifndef HAS_ISASCII
14331                 && classnum != _CC_ASCII
14332 #endif
14333             ) {
14334                 /* What the Posix classes (like \w, [:space:]) match in locale
14335                  * isn't knowable under locale until actual match time.  Room
14336                  * must be reserved (one time per outer bracketed class) to
14337                  * store such classes.  The space will contain a bit for each
14338                  * named class that is to be matched against.  This isn't
14339                  * needed for \p{} and pseudo-classes, as they are not affected
14340                  * by locale, and hence are dealt with separately */
14341                 if (! need_class) {
14342                     need_class = 1;
14343                     if (SIZE_ONLY) {
14344                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14345                     }
14346                     else {
14347                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14348                     }
14349                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14350                     ANYOF_POSIXL_ZERO(ret);
14351                 }
14352
14353                 /* Coverity thinks it is possible for this to be negative; both
14354                  * jhi and khw think it's not, but be safer */
14355                 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14356                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14357
14358                 /* See if it already matches the complement of this POSIX
14359                  * class */
14360                 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14361                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14362                                                             ? -1
14363                                                             : 1)))
14364                 {
14365                     posixl_matches_all = TRUE;
14366                     break;  /* No need to continue.  Since it matches both
14367                                e.g., \w and \W, it matches everything, and the
14368                                bracketed class can be optimized into qr/./s */
14369                 }
14370
14371                 /* Add this class to those that should be checked at runtime */
14372                 ANYOF_POSIXL_SET(ret, namedclass);
14373
14374                 /* The above-Latin1 characters are not subject to locale rules.
14375                  * Just add them, in the second pass, to the
14376                  * unconditionally-matched list */
14377                 if (! SIZE_ONLY) {
14378                     SV* scratch_list = NULL;
14379
14380                     /* Get the list of the above-Latin1 code points this
14381                      * matches */
14382                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14383                                           PL_XPosix_ptrs[classnum],
14384
14385                                           /* Odd numbers are complements, like
14386                                            * NDIGIT, NASCII, ... */
14387                                           namedclass % 2 != 0,
14388                                           &scratch_list);
14389                     /* Checking if 'cp_list' is NULL first saves an extra
14390                      * clone.  Its reference count will be decremented at the
14391                      * next union, etc, or if this is the only instance, at the
14392                      * end of the routine */
14393                     if (! cp_list) {
14394                         cp_list = scratch_list;
14395                     }
14396                     else {
14397                         _invlist_union(cp_list, scratch_list, &cp_list);
14398                         SvREFCNT_dec_NN(scratch_list);
14399                     }
14400                     continue;   /* Go get next character */
14401                 }
14402             }
14403             else if (! SIZE_ONLY) {
14404
14405                 /* Here, not in pass1 (in that pass we skip calculating the
14406                  * contents of this class), and is /l, or is a POSIX class for
14407                  * which /l doesn't matter (or is a Unicode property, which is
14408                  * skipped here). */
14409                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
14410                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14411
14412                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
14413                          * nor /l make a difference in what these match,
14414                          * therefore we just add what they match to cp_list. */
14415                         if (classnum != _CC_VERTSPACE) {
14416                             assert(   namedclass == ANYOF_HORIZWS
14417                                    || namedclass == ANYOF_NHORIZWS);
14418
14419                             /* It turns out that \h is just a synonym for
14420                              * XPosixBlank */
14421                             classnum = _CC_BLANK;
14422                         }
14423
14424                         _invlist_union_maybe_complement_2nd(
14425                                 cp_list,
14426                                 PL_XPosix_ptrs[classnum],
14427                                 namedclass % 2 != 0,    /* Complement if odd
14428                                                           (NHORIZWS, NVERTWS)
14429                                                         */
14430                                 &cp_list);
14431                     }
14432                 }
14433                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
14434                            complement and use nposixes */
14435                     SV** posixes_ptr = namedclass % 2 == 0
14436                                        ? &posixes
14437                                        : &nposixes;
14438                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
14439                     _invlist_union_maybe_complement_2nd(
14440                                                      *posixes_ptr,
14441                                                      *source_ptr,
14442                                                      namedclass % 2 != 0,
14443                                                      posixes_ptr);
14444                 }
14445             }
14446         } /* end of namedclass \blah */
14447
14448         if (skip_white) {
14449             RExC_parse = regpatws(pRExC_state, RExC_parse,
14450                                 FALSE /* means don't recognize comments */ );
14451         }
14452
14453         /* If 'range' is set, 'value' is the ending of a range--check its
14454          * validity.  (If value isn't a single code point in the case of a
14455          * range, we should have figured that out above in the code that
14456          * catches false ranges).  Later, we will handle each individual code
14457          * point in the range.  If 'range' isn't set, this could be the
14458          * beginning of a range, so check for that by looking ahead to see if
14459          * the next real character to be processed is the range indicator--the
14460          * minus sign */
14461
14462         if (range) {
14463 #ifdef EBCDIC
14464             /* For unicode ranges, we have to test that the Unicode as opposed
14465              * to the native values are not decreasing.  (Above 255, and there
14466              * is no difference between native and Unicode) */
14467             if (unicode_range && prevvalue < 255 && value < 255) {
14468                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
14469                     goto backwards_range;
14470                 }
14471             }
14472             else
14473 #endif
14474             if (prevvalue > value) /* b-a */ {
14475                 int w;
14476 #ifdef EBCDIC
14477               backwards_range:
14478 #endif
14479                 w = RExC_parse - rangebegin;
14480                 vFAIL2utf8f(
14481                     "Invalid [] range \"%"UTF8f"\"",
14482                     UTF8fARG(UTF, w, rangebegin));
14483                 NOT_REACHED; /* NOT REACHED */
14484             }
14485         }
14486         else {
14487             prevvalue = value; /* save the beginning of the potential range */
14488             if (! stop_at_1     /* Can't be a range if parsing just one thing */
14489                 && *RExC_parse == '-')
14490             {
14491                 char* next_char_ptr = RExC_parse + 1;
14492                 if (skip_white) {   /* Get the next real char after the '-' */
14493                     next_char_ptr = regpatws(pRExC_state,
14494                                              RExC_parse + 1,
14495                                              FALSE); /* means don't recognize
14496                                                         comments */
14497                 }
14498
14499                 /* If the '-' is at the end of the class (just before the ']',
14500                  * it is a literal minus; otherwise it is a range */
14501                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14502                     RExC_parse = next_char_ptr;
14503
14504                     /* a bad range like \w-, [:word:]- ? */
14505                     if (namedclass > OOB_NAMEDCLASS) {
14506                         if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14507                             const int w = RExC_parse >= rangebegin
14508                                           ?  RExC_parse - rangebegin
14509                                           : 0;
14510                             if (strict) {
14511                                 vFAIL4("False [] range \"%*.*s\"",
14512                                     w, w, rangebegin);
14513                             }
14514                             else if (PASS2) {
14515                                 vWARN4(RExC_parse,
14516                                     "False [] range \"%*.*s\"",
14517                                     w, w, rangebegin);
14518                             }
14519                         }
14520                         if (!SIZE_ONLY) {
14521                             cp_list = add_cp_to_invlist(cp_list, '-');
14522                         }
14523                         element_count++;
14524                     } else
14525                         range = 1;      /* yeah, it's a range! */
14526                     continue;   /* but do it the next time */
14527                 }
14528             }
14529         }
14530
14531         if (namedclass > OOB_NAMEDCLASS) {
14532             continue;
14533         }
14534
14535         /* Here, we have a single value this time through the loop, and
14536          * <prevvalue> is the beginning of the range, if any; or <value> if
14537          * not. */
14538
14539         /* non-Latin1 code point implies unicode semantics.  Must be set in
14540          * pass1 so is there for the whole of pass 2 */
14541         if (value > 255) {
14542             RExC_uni_semantics = 1;
14543         }
14544
14545         /* Ready to process either the single value, or the completed range.
14546          * For single-valued non-inverted ranges, we consider the possibility
14547          * of multi-char folds.  (We made a conscious decision to not do this
14548          * for the other cases because it can often lead to non-intuitive
14549          * results.  For example, you have the peculiar case that:
14550          *  "s s" =~ /^[^\xDF]+$/i => Y
14551          *  "ss"  =~ /^[^\xDF]+$/i => N
14552          *
14553          * See [perl #89750] */
14554         if (FOLD && allow_multi_folds && value == prevvalue) {
14555             if (value == LATIN_SMALL_LETTER_SHARP_S
14556                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14557                                                         value)))
14558             {
14559                 /* Here <value> is indeed a multi-char fold.  Get what it is */
14560
14561                 U8 foldbuf[UTF8_MAXBYTES_CASE];
14562                 STRLEN foldlen;
14563
14564                 UV folded = _to_uni_fold_flags(
14565                                 value,
14566                                 foldbuf,
14567                                 &foldlen,
14568                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14569                                                    ? FOLD_FLAGS_NOMIX_ASCII
14570                                                    : 0)
14571                                 );
14572
14573                 /* Here, <folded> should be the first character of the
14574                  * multi-char fold of <value>, with <foldbuf> containing the
14575                  * whole thing.  But, if this fold is not allowed (because of
14576                  * the flags), <fold> will be the same as <value>, and should
14577                  * be processed like any other character, so skip the special
14578                  * handling */
14579                 if (folded != value) {
14580
14581                     /* Skip if we are recursed, currently parsing the class
14582                      * again.  Otherwise add this character to the list of
14583                      * multi-char folds. */
14584                     if (! RExC_in_multi_char_class) {
14585                         STRLEN cp_count = utf8_length(foldbuf,
14586                                                       foldbuf + foldlen);
14587                         SV* multi_fold = sv_2mortal(newSVpvs(""));
14588
14589                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14590
14591                         multi_char_matches
14592                                         = add_multi_match(multi_char_matches,
14593                                                           multi_fold,
14594                                                           cp_count);
14595
14596                     }
14597
14598                     /* This element should not be processed further in this
14599                      * class */
14600                     element_count--;
14601                     value = save_value;
14602                     prevvalue = save_prevvalue;
14603                     continue;
14604                 }
14605             }
14606         }
14607
14608         /* Deal with this element of the class */
14609         if (! SIZE_ONLY) {
14610 #ifndef EBCDIC
14611             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14612                                                      prevvalue, value);
14613 #else
14614             /* On non-ASCII platforms, for ranges that span all of 0..255, and
14615              * ones that don't require special handling, we can just add the
14616              * range like we do for ASCII platforms */
14617             if ((UNLIKELY(prevvalue == 0) && value >= 255)
14618                 || ! (prevvalue < 256
14619                       && (unicode_range
14620                           || (literal_endpoint == 2
14621                               && ((isLOWER_A(prevvalue) && isLOWER_A(value))
14622                                   || (isUPPER_A(prevvalue)
14623                                       && isUPPER_A(value)))))))
14624             {
14625                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14626                                                          prevvalue, value);
14627             }
14628             else {
14629                 /* Here, requires special handling.  This can be because it is
14630                  * a range whose code points are considered to be Unicode, and
14631                  * so must be individually translated into native, or because
14632                  * its a subrange of 'A-Z' or 'a-z' which each aren't
14633                  * contiguous in EBCDIC, but we have defined them to include
14634                  * only the "expected" upper or lower case ASCII alphabetics.
14635                  * Subranges above 255 are the same in native and Unicode, so
14636                  * can be added as a range */
14637                 U8 start = NATIVE_TO_LATIN1(prevvalue);
14638                 unsigned j;
14639                 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
14640                 for (j = start; j <= end; j++) {
14641                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
14642                 }
14643                 if (value > 255) {
14644                     cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14645                                                              256, value);
14646                 }
14647             }
14648 #endif
14649         }
14650
14651         range = 0; /* this range (if it was one) is done now */
14652     } /* End of loop through all the text within the brackets */
14653
14654     /* If anything in the class expands to more than one character, we have to
14655      * deal with them by building up a substitute parse string, and recursively
14656      * calling reg() on it, instead of proceeding */
14657     if (multi_char_matches) {
14658         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14659         I32 cp_count;
14660         STRLEN len;
14661         char *save_end = RExC_end;
14662         char *save_parse = RExC_parse;
14663         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14664                                        a "|" */
14665         I32 reg_flags;
14666
14667         assert(! invert);
14668 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14669            because too confusing */
14670         if (invert) {
14671             sv_catpv(substitute_parse, "(?:");
14672         }
14673 #endif
14674
14675         /* Look at the longest folds first */
14676         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14677
14678             if (av_exists(multi_char_matches, cp_count)) {
14679                 AV** this_array_ptr;
14680                 SV* this_sequence;
14681
14682                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14683                                                  cp_count, FALSE);
14684                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14685                                                                 &PL_sv_undef)
14686                 {
14687                     if (! first_time) {
14688                         sv_catpv(substitute_parse, "|");
14689                     }
14690                     first_time = FALSE;
14691
14692                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14693                 }
14694             }
14695         }
14696
14697         /* If the character class contains anything else besides these
14698          * multi-character folds, have to include it in recursive parsing */
14699         if (element_count) {
14700             sv_catpv(substitute_parse, "|[");
14701             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14702             sv_catpv(substitute_parse, "]");
14703         }
14704
14705         sv_catpv(substitute_parse, ")");
14706 #if 0
14707         if (invert) {
14708             /* This is a way to get the parse to skip forward a whole named
14709              * sequence instead of matching the 2nd character when it fails the
14710              * first */
14711             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14712         }
14713 #endif
14714
14715         RExC_parse = SvPV(substitute_parse, len);
14716         RExC_end = RExC_parse + len;
14717         RExC_in_multi_char_class = 1;
14718         RExC_override_recoding = 1;
14719         RExC_emit = (regnode *)orig_emit;
14720
14721         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14722
14723         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14724
14725         RExC_parse = save_parse;
14726         RExC_end = save_end;
14727         RExC_in_multi_char_class = 0;
14728         RExC_override_recoding = 0;
14729         SvREFCNT_dec_NN(multi_char_matches);
14730         return ret;
14731     }
14732
14733     /* Here, we've gone through the entire class and dealt with multi-char
14734      * folds.  We are now in a position that we can do some checks to see if we
14735      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14736      * Currently we only do two checks:
14737      * 1) is in the unlikely event that the user has specified both, eg. \w and
14738      *    \W under /l, then the class matches everything.  (This optimization
14739      *    is done only to make the optimizer code run later work.)
14740      * 2) if the character class contains only a single element (including a
14741      *    single range), we see if there is an equivalent node for it.
14742      * Other checks are possible */
14743     if (! ret_invlist   /* Can't optimize if returning the constructed
14744                            inversion list */
14745         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14746     {
14747         U8 op = END;
14748         U8 arg = 0;
14749
14750         if (UNLIKELY(posixl_matches_all)) {
14751             op = SANY;
14752         }
14753         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14754                                                    \w or [:digit:] or \p{foo}
14755                                                  */
14756
14757             /* All named classes are mapped into POSIXish nodes, with its FLAG
14758              * argument giving which class it is */
14759             switch ((I32)namedclass) {
14760                 case ANYOF_UNIPROP:
14761                     break;
14762
14763                 /* These don't depend on the charset modifiers.  They always
14764                  * match under /u rules */
14765                 case ANYOF_NHORIZWS:
14766                 case ANYOF_HORIZWS:
14767                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14768                     /* FALLTHROUGH */
14769
14770                 case ANYOF_NVERTWS:
14771                 case ANYOF_VERTWS:
14772                     op = POSIXU;
14773                     goto join_posix;
14774
14775                 /* The actual POSIXish node for all the rest depends on the
14776                  * charset modifier.  The ones in the first set depend only on
14777                  * ASCII or, if available on this platform, locale */
14778                 case ANYOF_ASCII:
14779                 case ANYOF_NASCII:
14780 #ifdef HAS_ISASCII
14781                     op = (LOC) ? POSIXL : POSIXA;
14782 #else
14783                     op = POSIXA;
14784 #endif
14785                     goto join_posix;
14786
14787                 case ANYOF_NCASED:
14788                 case ANYOF_LOWER:
14789                 case ANYOF_NLOWER:
14790                 case ANYOF_UPPER:
14791                 case ANYOF_NUPPER:
14792                     /* under /a could be alpha */
14793                     if (FOLD) {
14794                         if (ASCII_RESTRICTED) {
14795                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14796                         }
14797                         else if (! LOC) {
14798                             break;
14799                         }
14800                     }
14801                     /* FALLTHROUGH */
14802
14803                 /* The rest have more possibilities depending on the charset.
14804                  * We take advantage of the enum ordering of the charset
14805                  * modifiers to get the exact node type, */
14806                 default:
14807                     op = POSIXD + get_regex_charset(RExC_flags);
14808                     if (op > POSIXA) { /* /aa is same as /a */
14809                         op = POSIXA;
14810                     }
14811
14812                 join_posix:
14813                     /* The odd numbered ones are the complements of the
14814                      * next-lower even number one */
14815                     if (namedclass % 2 == 1) {
14816                         invert = ! invert;
14817                         namedclass--;
14818                     }
14819                     arg = namedclass_to_classnum(namedclass);
14820                     break;
14821             }
14822         }
14823         else if (value == prevvalue) {
14824
14825             /* Here, the class consists of just a single code point */
14826
14827             if (invert) {
14828                 if (! LOC && value == '\n') {
14829                     op = REG_ANY; /* Optimize [^\n] */
14830                     *flagp |= HASWIDTH|SIMPLE;
14831                     MARK_NAUGHTY(1);
14832                 }
14833             }
14834             else if (value < 256 || UTF) {
14835
14836                 /* Optimize a single value into an EXACTish node, but not if it
14837                  * would require converting the pattern to UTF-8. */
14838                 op = compute_EXACTish(pRExC_state);
14839             }
14840         } /* Otherwise is a range */
14841         else if (! LOC) {   /* locale could vary these */
14842             if (prevvalue == '0') {
14843                 if (value == '9') {
14844                     arg = _CC_DIGIT;
14845                     op = POSIXA;
14846                 }
14847             }
14848             else if (prevvalue == 'A') {
14849                 if (value == 'Z'
14850 #ifdef EBCDIC
14851                     && literal_endpoint == 2
14852 #endif
14853                 ) {
14854                     arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14855                     op = POSIXA;
14856                 }
14857             }
14858             else if (prevvalue == 'a') {
14859                 if (value == 'z'
14860 #ifdef EBCDIC
14861                     && literal_endpoint == 2
14862 #endif
14863                 ) {
14864                     arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14865                     op = POSIXA;
14866                 }
14867             }
14868         }
14869
14870         /* Here, we have changed <op> away from its initial value iff we found
14871          * an optimization */
14872         if (op != END) {
14873
14874             /* Throw away this ANYOF regnode, and emit the calculated one,
14875              * which should correspond to the beginning, not current, state of
14876              * the parse */
14877             const char * cur_parse = RExC_parse;
14878             RExC_parse = (char *)orig_parse;
14879             if ( SIZE_ONLY) {
14880                 if (! LOC) {
14881
14882                     /* To get locale nodes to not use the full ANYOF size would
14883                      * require moving the code above that writes the portions
14884                      * of it that aren't in other nodes to after this point.
14885                      * e.g.  ANYOF_POSIXL_SET */
14886                     RExC_size = orig_size;
14887                 }
14888             }
14889             else {
14890                 RExC_emit = (regnode *)orig_emit;
14891                 if (PL_regkind[op] == POSIXD) {
14892                     if (op == POSIXL) {
14893                         RExC_contains_locale = 1;
14894                     }
14895                     if (invert) {
14896                         op += NPOSIXD - POSIXD;
14897                     }
14898                 }
14899             }
14900
14901             ret = reg_node(pRExC_state, op);
14902
14903             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14904                 if (! SIZE_ONLY) {
14905                     FLAGS(ret) = arg;
14906                 }
14907                 *flagp |= HASWIDTH|SIMPLE;
14908             }
14909             else if (PL_regkind[op] == EXACT) {
14910                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14911                                            TRUE /* downgradable to EXACT */
14912                                            );
14913             }
14914
14915             RExC_parse = (char *) cur_parse;
14916
14917             SvREFCNT_dec(posixes);
14918             SvREFCNT_dec(nposixes);
14919             SvREFCNT_dec(cp_list);
14920             SvREFCNT_dec(cp_foldable_list);
14921             return ret;
14922         }
14923     }
14924
14925     if (SIZE_ONLY)
14926         return ret;
14927     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14928
14929     /* If folding, we calculate all characters that could fold to or from the
14930      * ones already on the list */
14931     if (cp_foldable_list) {
14932         if (FOLD) {
14933             UV start, end;      /* End points of code point ranges */
14934
14935             SV* fold_intersection = NULL;
14936             SV** use_list;
14937
14938             /* Our calculated list will be for Unicode rules.  For locale
14939              * matching, we have to keep a separate list that is consulted at
14940              * runtime only when the locale indicates Unicode rules.  For
14941              * non-locale, we just use to the general list */
14942             if (LOC) {
14943                 use_list = &only_utf8_locale_list;
14944             }
14945             else {
14946                 use_list = &cp_list;
14947             }
14948
14949             /* Only the characters in this class that participate in folds need
14950              * be checked.  Get the intersection of this class and all the
14951              * possible characters that are foldable.  This can quickly narrow
14952              * down a large class */
14953             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14954                                   &fold_intersection);
14955
14956             /* The folds for all the Latin1 characters are hard-coded into this
14957              * program, but we have to go out to disk to get the others. */
14958             if (invlist_highest(cp_foldable_list) >= 256) {
14959
14960                 /* This is a hash that for a particular fold gives all
14961                  * characters that are involved in it */
14962                 if (! PL_utf8_foldclosures) {
14963                     _load_PL_utf8_foldclosures();
14964                 }
14965             }
14966
14967             /* Now look at the foldable characters in this class individually */
14968             invlist_iterinit(fold_intersection);
14969             while (invlist_iternext(fold_intersection, &start, &end)) {
14970                 UV j;
14971
14972                 /* Look at every character in the range */
14973                 for (j = start; j <= end; j++) {
14974                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14975                     STRLEN foldlen;
14976                     SV** listp;
14977
14978                     if (j < 256) {
14979
14980                         if (IS_IN_SOME_FOLD_L1(j)) {
14981
14982                             /* ASCII is always matched; non-ASCII is matched
14983                              * only under Unicode rules (which could happen
14984                              * under /l if the locale is a UTF-8 one */
14985                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14986                                 *use_list = add_cp_to_invlist(*use_list,
14987                                                             PL_fold_latin1[j]);
14988                             }
14989                             else {
14990                                 depends_list =
14991                                  add_cp_to_invlist(depends_list,
14992                                                    PL_fold_latin1[j]);
14993                             }
14994                         }
14995
14996                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14997                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14998                         {
14999                             add_above_Latin1_folds(pRExC_state,
15000                                                    (U8) j,
15001                                                    use_list);
15002                         }
15003                         continue;
15004                     }
15005
15006                     /* Here is an above Latin1 character.  We don't have the
15007                      * rules hard-coded for it.  First, get its fold.  This is
15008                      * the simple fold, as the multi-character folds have been
15009                      * handled earlier and separated out */
15010                     _to_uni_fold_flags(j, foldbuf, &foldlen,
15011                                                         (ASCII_FOLD_RESTRICTED)
15012                                                         ? FOLD_FLAGS_NOMIX_ASCII
15013                                                         : 0);
15014
15015                     /* Single character fold of above Latin1.  Add everything in
15016                     * its fold closure to the list that this node should match.
15017                     * The fold closures data structure is a hash with the keys
15018                     * being the UTF-8 of every character that is folded to, like
15019                     * 'k', and the values each an array of all code points that
15020                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
15021                     * Multi-character folds are not included */
15022                     if ((listp = hv_fetch(PL_utf8_foldclosures,
15023                                         (char *) foldbuf, foldlen, FALSE)))
15024                     {
15025                         AV* list = (AV*) *listp;
15026                         IV k;
15027                         for (k = 0; k <= av_tindex(list); k++) {
15028                             SV** c_p = av_fetch(list, k, FALSE);
15029                             UV c;
15030                             assert(c_p);
15031
15032                             c = SvUV(*c_p);
15033
15034                             /* /aa doesn't allow folds between ASCII and non- */
15035                             if ((ASCII_FOLD_RESTRICTED
15036                                 && (isASCII(c) != isASCII(j))))
15037                             {
15038                                 continue;
15039                             }
15040
15041                             /* Folds under /l which cross the 255/256 boundary
15042                              * are added to a separate list.  (These are valid
15043                              * only when the locale is UTF-8.) */
15044                             if (c < 256 && LOC) {
15045                                 *use_list = add_cp_to_invlist(*use_list, c);
15046                                 continue;
15047                             }
15048
15049                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15050                             {
15051                                 cp_list = add_cp_to_invlist(cp_list, c);
15052                             }
15053                             else {
15054                                 /* Similarly folds involving non-ascii Latin1
15055                                 * characters under /d are added to their list */
15056                                 depends_list = add_cp_to_invlist(depends_list,
15057                                                                  c);
15058                             }
15059                         }
15060                     }
15061                 }
15062             }
15063             SvREFCNT_dec_NN(fold_intersection);
15064         }
15065
15066         /* Now that we have finished adding all the folds, there is no reason
15067          * to keep the foldable list separate */
15068         _invlist_union(cp_list, cp_foldable_list, &cp_list);
15069         SvREFCNT_dec_NN(cp_foldable_list);
15070     }
15071
15072     /* And combine the result (if any) with any inversion list from posix
15073      * classes.  The lists are kept separate up to now because we don't want to
15074      * fold the classes (folding of those is automatically handled by the swash
15075      * fetching code) */
15076     if (posixes || nposixes) {
15077         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15078             /* Under /a and /aa, nothing above ASCII matches these */
15079             _invlist_intersection(posixes,
15080                                   PL_XPosix_ptrs[_CC_ASCII],
15081                                   &posixes);
15082         }
15083         if (nposixes) {
15084             if (DEPENDS_SEMANTICS) {
15085                 /* Under /d, everything in the upper half of the Latin1 range
15086                  * matches these complements */
15087                 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15088             }
15089             else if (AT_LEAST_ASCII_RESTRICTED) {
15090                 /* Under /a and /aa, everything above ASCII matches these
15091                  * complements */
15092                 _invlist_union_complement_2nd(nposixes,
15093                                               PL_XPosix_ptrs[_CC_ASCII],
15094                                               &nposixes);
15095             }
15096             if (posixes) {
15097                 _invlist_union(posixes, nposixes, &posixes);
15098                 SvREFCNT_dec_NN(nposixes);
15099             }
15100             else {
15101                 posixes = nposixes;
15102             }
15103         }
15104         if (! DEPENDS_SEMANTICS) {
15105             if (cp_list) {
15106                 _invlist_union(cp_list, posixes, &cp_list);
15107                 SvREFCNT_dec_NN(posixes);
15108             }
15109             else {
15110                 cp_list = posixes;
15111             }
15112         }
15113         else {
15114             /* Under /d, we put into a separate list the Latin1 things that
15115              * match only when the target string is utf8 */
15116             SV* nonascii_but_latin1_properties = NULL;
15117             _invlist_intersection(posixes, PL_UpperLatin1,
15118                                   &nonascii_but_latin1_properties);
15119             _invlist_subtract(posixes, nonascii_but_latin1_properties,
15120                               &posixes);
15121             if (cp_list) {
15122                 _invlist_union(cp_list, posixes, &cp_list);
15123                 SvREFCNT_dec_NN(posixes);
15124             }
15125             else {
15126                 cp_list = posixes;
15127             }
15128
15129             if (depends_list) {
15130                 _invlist_union(depends_list, nonascii_but_latin1_properties,
15131                                &depends_list);
15132                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15133             }
15134             else {
15135                 depends_list = nonascii_but_latin1_properties;
15136             }
15137         }
15138     }
15139
15140     /* And combine the result (if any) with any inversion list from properties.
15141      * The lists are kept separate up to now so that we can distinguish the two
15142      * in regards to matching above-Unicode.  A run-time warning is generated
15143      * if a Unicode property is matched against a non-Unicode code point. But,
15144      * we allow user-defined properties to match anything, without any warning,
15145      * and we also suppress the warning if there is a portion of the character
15146      * class that isn't a Unicode property, and which matches above Unicode, \W
15147      * or [\x{110000}] for example.
15148      * (Note that in this case, unlike the Posix one above, there is no
15149      * <depends_list>, because having a Unicode property forces Unicode
15150      * semantics */
15151     if (properties) {
15152         if (cp_list) {
15153
15154             /* If it matters to the final outcome, see if a non-property
15155              * component of the class matches above Unicode.  If so, the
15156              * warning gets suppressed.  This is true even if just a single
15157              * such code point is specified, as though not strictly correct if
15158              * another such code point is matched against, the fact that they
15159              * are using above-Unicode code points indicates they should know
15160              * the issues involved */
15161             if (warn_super) {
15162                 warn_super = ! (invert
15163                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15164             }
15165
15166             _invlist_union(properties, cp_list, &cp_list);
15167             SvREFCNT_dec_NN(properties);
15168         }
15169         else {
15170             cp_list = properties;
15171         }
15172
15173         if (warn_super) {
15174             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15175         }
15176     }
15177
15178     /* Here, we have calculated what code points should be in the character
15179      * class.
15180      *
15181      * Now we can see about various optimizations.  Fold calculation (which we
15182      * did above) needs to take place before inversion.  Otherwise /[^k]/i
15183      * would invert to include K, which under /i would match k, which it
15184      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
15185      * folded until runtime */
15186
15187     /* If we didn't do folding, it's because some information isn't available
15188      * until runtime; set the run-time fold flag for these.  (We don't have to
15189      * worry about properties folding, as that is taken care of by the swash
15190      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
15191      * locales, or the class matches at least one 0-255 range code point */
15192     if (LOC && FOLD) {
15193         if (only_utf8_locale_list) {
15194             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15195         }
15196         else if (cp_list) { /* Look to see if there a 0-255 code point is in
15197                                the list */
15198             UV start, end;
15199             invlist_iterinit(cp_list);
15200             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15201                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15202             }
15203             invlist_iterfinish(cp_list);
15204         }
15205     }
15206
15207     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15208      * at compile time.  Besides not inverting folded locale now, we can't
15209      * invert if there are things such as \w, which aren't known until runtime
15210      * */
15211     if (cp_list
15212         && invert
15213         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15214         && ! depends_list
15215         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15216     {
15217         _invlist_invert(cp_list);
15218
15219         /* Any swash can't be used as-is, because we've inverted things */
15220         if (swash) {
15221             SvREFCNT_dec_NN(swash);
15222             swash = NULL;
15223         }
15224
15225         /* Clear the invert flag since have just done it here */
15226         invert = FALSE;
15227     }
15228
15229     if (ret_invlist) {
15230         *ret_invlist = cp_list;
15231         SvREFCNT_dec(swash);
15232
15233         /* Discard the generated node */
15234         if (SIZE_ONLY) {
15235             RExC_size = orig_size;
15236         }
15237         else {
15238             RExC_emit = orig_emit;
15239         }
15240         return orig_emit;
15241     }
15242
15243     /* Some character classes are equivalent to other nodes.  Such nodes take
15244      * up less room and generally fewer operations to execute than ANYOF nodes.
15245      * Above, we checked for and optimized into some such equivalents for
15246      * certain common classes that are easy to test.  Getting to this point in
15247      * the code means that the class didn't get optimized there.  Since this
15248      * code is only executed in Pass 2, it is too late to save space--it has
15249      * been allocated in Pass 1, and currently isn't given back.  But turning
15250      * things into an EXACTish node can allow the optimizer to join it to any
15251      * adjacent such nodes.  And if the class is equivalent to things like /./,
15252      * expensive run-time swashes can be avoided.  Now that we have more
15253      * complete information, we can find things necessarily missed by the
15254      * earlier code.  I (khw) am not sure how much to look for here.  It would
15255      * be easy, but perhaps too slow, to check any candidates against all the
15256      * node types they could possibly match using _invlistEQ(). */
15257
15258     if (cp_list
15259         && ! invert
15260         && ! depends_list
15261         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15262         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15263
15264            /* We don't optimize if we are supposed to make sure all non-Unicode
15265             * code points raise a warning, as only ANYOF nodes have this check.
15266             * */
15267         && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15268     {
15269         UV start, end;
15270         U8 op = END;  /* The optimzation node-type */
15271         const char * cur_parse= RExC_parse;
15272
15273         invlist_iterinit(cp_list);
15274         if (! invlist_iternext(cp_list, &start, &end)) {
15275
15276             /* Here, the list is empty.  This happens, for example, when a
15277              * Unicode property is the only thing in the character class, and
15278              * it doesn't match anything.  (perluniprops.pod notes such
15279              * properties) */
15280             op = OPFAIL;
15281             *flagp |= HASWIDTH|SIMPLE;
15282         }
15283         else if (start == end) {    /* The range is a single code point */
15284             if (! invlist_iternext(cp_list, &start, &end)
15285
15286                     /* Don't do this optimization if it would require changing
15287                      * the pattern to UTF-8 */
15288                 && (start < 256 || UTF))
15289             {
15290                 /* Here, the list contains a single code point.  Can optimize
15291                  * into an EXACTish node */
15292
15293                 value = start;
15294
15295                 if (! FOLD) {
15296                     op = (LOC)
15297                          ? EXACTL
15298                          : EXACT;
15299                 }
15300                 else if (LOC) {
15301
15302                     /* A locale node under folding with one code point can be
15303                      * an EXACTFL, as its fold won't be calculated until
15304                      * runtime */
15305                     op = EXACTFL;
15306                 }
15307                 else {
15308
15309                     /* Here, we are generally folding, but there is only one
15310                      * code point to match.  If we have to, we use an EXACT
15311                      * node, but it would be better for joining with adjacent
15312                      * nodes in the optimization pass if we used the same
15313                      * EXACTFish node that any such are likely to be.  We can
15314                      * do this iff the code point doesn't participate in any
15315                      * folds.  For example, an EXACTF of a colon is the same as
15316                      * an EXACT one, since nothing folds to or from a colon. */
15317                     if (value < 256) {
15318                         if (IS_IN_SOME_FOLD_L1(value)) {
15319                             op = EXACT;
15320                         }
15321                     }
15322                     else {
15323                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15324                             op = EXACT;
15325                         }
15326                     }
15327
15328                     /* If we haven't found the node type, above, it means we
15329                      * can use the prevailing one */
15330                     if (op == END) {
15331                         op = compute_EXACTish(pRExC_state);
15332                     }
15333                 }
15334             }
15335         }
15336         else if (start == 0) {
15337             if (end == UV_MAX) {
15338                 op = SANY;
15339                 *flagp |= HASWIDTH|SIMPLE;
15340                 MARK_NAUGHTY(1);
15341             }
15342             else if (end == '\n' - 1
15343                     && invlist_iternext(cp_list, &start, &end)
15344                     && start == '\n' + 1 && end == UV_MAX)
15345             {
15346                 op = REG_ANY;
15347                 *flagp |= HASWIDTH|SIMPLE;
15348                 MARK_NAUGHTY(1);
15349             }
15350         }
15351         invlist_iterfinish(cp_list);
15352
15353         if (op != END) {
15354             RExC_parse = (char *)orig_parse;
15355             RExC_emit = (regnode *)orig_emit;
15356
15357             ret = reg_node(pRExC_state, op);
15358
15359             RExC_parse = (char *)cur_parse;
15360
15361             if (PL_regkind[op] == EXACT) {
15362                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15363                                            TRUE /* downgradable to EXACT */
15364                                           );
15365             }
15366
15367             SvREFCNT_dec_NN(cp_list);
15368             return ret;
15369         }
15370     }
15371
15372     /* Here, <cp_list> contains all the code points we can determine at
15373      * compile time that match under all conditions.  Go through it, and
15374      * for things that belong in the bitmap, put them there, and delete from
15375      * <cp_list>.  While we are at it, see if everything above 255 is in the
15376      * list, and if so, set a flag to speed up execution */
15377
15378     populate_ANYOF_from_invlist(ret, &cp_list);
15379
15380     if (invert) {
15381         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15382     }
15383
15384     /* Here, the bitmap has been populated with all the Latin1 code points that
15385      * always match.  Can now add to the overall list those that match only
15386      * when the target string is UTF-8 (<depends_list>). */
15387     if (depends_list) {
15388         if (cp_list) {
15389             _invlist_union(cp_list, depends_list, &cp_list);
15390             SvREFCNT_dec_NN(depends_list);
15391         }
15392         else {
15393             cp_list = depends_list;
15394         }
15395         ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15396     }
15397
15398     /* If there is a swash and more than one element, we can't use the swash in
15399      * the optimization below. */
15400     if (swash && element_count > 1) {
15401         SvREFCNT_dec_NN(swash);
15402         swash = NULL;
15403     }
15404
15405     /* Note that the optimization of using 'swash' if it is the only thing in
15406      * the class doesn't have us change swash at all, so it can include things
15407      * that are also in the bitmap; otherwise we have purposely deleted that
15408      * duplicate information */
15409     set_ANYOF_arg(pRExC_state, ret, cp_list,
15410                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15411                    ? listsv : NULL,
15412                   only_utf8_locale_list,
15413                   swash, has_user_defined_property);
15414
15415     *flagp |= HASWIDTH|SIMPLE;
15416
15417     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15418         RExC_contains_locale = 1;
15419     }
15420
15421     return ret;
15422 }
15423
15424 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15425
15426 STATIC void
15427 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15428                 regnode* const node,
15429                 SV* const cp_list,
15430                 SV* const runtime_defns,
15431                 SV* const only_utf8_locale_list,
15432                 SV* const swash,
15433                 const bool has_user_defined_property)
15434 {
15435     /* Sets the arg field of an ANYOF-type node 'node', using information about
15436      * the node passed-in.  If there is nothing outside the node's bitmap, the
15437      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
15438      * the count returned by add_data(), having allocated and stored an array,
15439      * av, that that count references, as follows:
15440      *  av[0] stores the character class description in its textual form.
15441      *        This is used later (regexec.c:Perl_regclass_swash()) to
15442      *        initialize the appropriate swash, and is also useful for dumping
15443      *        the regnode.  This is set to &PL_sv_undef if the textual
15444      *        description is not needed at run-time (as happens if the other
15445      *        elements completely define the class)
15446      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15447      *        computed from av[0].  But if no further computation need be done,
15448      *        the swash is stored here now (and av[0] is &PL_sv_undef).
15449      *  av[2] stores the inversion list of code points that match only if the
15450      *        current locale is UTF-8
15451      *  av[3] stores the cp_list inversion list for use in addition or instead
15452      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15453      *        (Otherwise everything needed is already in av[0] and av[1])
15454      *  av[4] is set if any component of the class is from a user-defined
15455      *        property; used only if av[3] exists */
15456
15457     UV n;
15458
15459     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15460
15461     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15462         assert(! (ANYOF_FLAGS(node)
15463                   & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15464                      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15465         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15466     }
15467     else {
15468         AV * const av = newAV();
15469         SV *rv;
15470
15471         assert(ANYOF_FLAGS(node)
15472                & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15473                   |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15474
15475         av_store(av, 0, (runtime_defns)
15476                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15477         if (swash) {
15478             assert(cp_list);
15479             av_store(av, 1, swash);
15480             SvREFCNT_dec_NN(cp_list);
15481         }
15482         else {
15483             av_store(av, 1, &PL_sv_undef);
15484             if (cp_list) {
15485                 av_store(av, 3, cp_list);
15486                 av_store(av, 4, newSVuv(has_user_defined_property));
15487             }
15488         }
15489
15490         if (only_utf8_locale_list) {
15491             av_store(av, 2, only_utf8_locale_list);
15492         }
15493         else {
15494             av_store(av, 2, &PL_sv_undef);
15495         }
15496
15497         rv = newRV_noinc(MUTABLE_SV(av));
15498         n = add_data(pRExC_state, STR_WITH_LEN("s"));
15499         RExC_rxi->data->data[n] = (void*)rv;
15500         ARG_SET(node, n);
15501     }
15502 }
15503
15504 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15505 SV *
15506 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15507                                         const regnode* node,
15508                                         bool doinit,
15509                                         SV** listsvp,
15510                                         SV** only_utf8_locale_ptr,
15511                                         SV*  exclude_list)
15512
15513 {
15514     /* For internal core use only.
15515      * Returns the swash for the input 'node' in the regex 'prog'.
15516      * If <doinit> is 'true', will attempt to create the swash if not already
15517      *    done.
15518      * If <listsvp> is non-null, will return the printable contents of the
15519      *    swash.  This can be used to get debugging information even before the
15520      *    swash exists, by calling this function with 'doinit' set to false, in
15521      *    which case the components that will be used to eventually create the
15522      *    swash are returned  (in a printable form).
15523      * If <exclude_list> is not NULL, it is an inversion list of things to
15524      *    exclude from what's returned in <listsvp>.
15525      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
15526      * that, in spite of this function's name, the swash it returns may include
15527      * the bitmap data as well */
15528
15529     SV *sw  = NULL;
15530     SV *si  = NULL;         /* Input swash initialization string */
15531     SV*  invlist = NULL;
15532
15533     RXi_GET_DECL(prog,progi);
15534     const struct reg_data * const data = prog ? progi->data : NULL;
15535
15536     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15537
15538     assert(ANYOF_FLAGS(node)
15539         & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15540            |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15541
15542     if (data && data->count) {
15543         const U32 n = ARG(node);
15544
15545         if (data->what[n] == 's') {
15546             SV * const rv = MUTABLE_SV(data->data[n]);
15547             AV * const av = MUTABLE_AV(SvRV(rv));
15548             SV **const ary = AvARRAY(av);
15549             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15550
15551             si = *ary;  /* ary[0] = the string to initialize the swash with */
15552
15553             /* Elements 3 and 4 are either both present or both absent. [3] is
15554              * any inversion list generated at compile time; [4] indicates if
15555              * that inversion list has any user-defined properties in it. */
15556             if (av_tindex(av) >= 2) {
15557                 if (only_utf8_locale_ptr
15558                     && ary[2]
15559                     && ary[2] != &PL_sv_undef)
15560                 {
15561                     *only_utf8_locale_ptr = ary[2];
15562                 }
15563                 else {
15564                     assert(only_utf8_locale_ptr);
15565                     *only_utf8_locale_ptr = NULL;
15566                 }
15567
15568                 if (av_tindex(av) >= 3) {
15569                     invlist = ary[3];
15570                     if (SvUV(ary[4])) {
15571                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15572                     }
15573                 }
15574                 else {
15575                     invlist = NULL;
15576                 }
15577             }
15578
15579             /* Element [1] is reserved for the set-up swash.  If already there,
15580              * return it; if not, create it and store it there */
15581             if (ary[1] && SvROK(ary[1])) {
15582                 sw = ary[1];
15583             }
15584             else if (doinit && ((si && si != &PL_sv_undef)
15585                                  || (invlist && invlist != &PL_sv_undef))) {
15586                 assert(si);
15587                 sw = _core_swash_init("utf8", /* the utf8 package */
15588                                       "", /* nameless */
15589                                       si,
15590                                       1, /* binary */
15591                                       0, /* not from tr/// */
15592                                       invlist,
15593                                       &swash_init_flags);
15594                 (void)av_store(av, 1, sw);
15595             }
15596         }
15597     }
15598
15599     /* If requested, return a printable version of what this swash matches */
15600     if (listsvp) {
15601         SV* matches_string = newSVpvs("");
15602
15603         /* The swash should be used, if possible, to get the data, as it
15604          * contains the resolved data.  But this function can be called at
15605          * compile-time, before everything gets resolved, in which case we
15606          * return the currently best available information, which is the string
15607          * that will eventually be used to do that resolving, 'si' */
15608         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15609             && (si && si != &PL_sv_undef))
15610         {
15611             sv_catsv(matches_string, si);
15612         }
15613
15614         /* Add the inversion list to whatever we have.  This may have come from
15615          * the swash, or from an input parameter */
15616         if (invlist) {
15617             if (exclude_list) {
15618                 SV* clone = invlist_clone(invlist);
15619                 _invlist_subtract(clone, exclude_list, &clone);
15620                 sv_catsv(matches_string, _invlist_contents(clone));
15621                 SvREFCNT_dec_NN(clone);
15622             }
15623             else {
15624                 sv_catsv(matches_string, _invlist_contents(invlist));
15625             }
15626         }
15627         *listsvp = matches_string;
15628     }
15629
15630     return sw;
15631 }
15632 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15633
15634 /* reg_skipcomment()
15635
15636    Absorbs an /x style # comment from the input stream,
15637    returning a pointer to the first character beyond the comment, or if the
15638    comment terminates the pattern without anything following it, this returns
15639    one past the final character of the pattern (in other words, RExC_end) and
15640    sets the REG_RUN_ON_COMMENT_SEEN flag.
15641
15642    Note it's the callers responsibility to ensure that we are
15643    actually in /x mode
15644
15645 */
15646
15647 PERL_STATIC_INLINE char*
15648 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15649 {
15650     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15651
15652     assert(*p == '#');
15653
15654     while (p < RExC_end) {
15655         if (*(++p) == '\n') {
15656             return p+1;
15657         }
15658     }
15659
15660     /* we ran off the end of the pattern without ending the comment, so we have
15661      * to add an \n when wrapping */
15662     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15663     return p;
15664 }
15665
15666 /* nextchar()
15667
15668    Advances the parse position, and optionally absorbs
15669    "whitespace" from the inputstream.
15670
15671    Without /x "whitespace" means (?#...) style comments only,
15672    with /x this means (?#...) and # comments and whitespace proper.
15673
15674    Returns the RExC_parse point from BEFORE the scan occurs.
15675
15676    This is the /x friendly way of saying RExC_parse++.
15677 */
15678
15679 STATIC char*
15680 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15681 {
15682     char* const retval = RExC_parse++;
15683
15684     PERL_ARGS_ASSERT_NEXTCHAR;
15685
15686     for (;;) {
15687         if (RExC_end - RExC_parse >= 3
15688             && *RExC_parse == '('
15689             && RExC_parse[1] == '?'
15690             && RExC_parse[2] == '#')
15691         {
15692             while (*RExC_parse != ')') {
15693                 if (RExC_parse == RExC_end)
15694                     FAIL("Sequence (?#... not terminated");
15695                 RExC_parse++;
15696             }
15697             RExC_parse++;
15698             continue;
15699         }
15700         if (RExC_flags & RXf_PMf_EXTENDED) {
15701             char * p = regpatws(pRExC_state, RExC_parse,
15702                                           TRUE); /* means recognize comments */
15703             if (p != RExC_parse) {
15704                 RExC_parse = p;
15705                 continue;
15706             }
15707         }
15708         return retval;
15709     }
15710 }
15711
15712 STATIC regnode *
15713 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
15714 {
15715     /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
15716      * space.  In pass1, it aligns and increments RExC_size; in pass2,
15717      * RExC_emit */
15718
15719     regnode * const ret = RExC_emit;
15720     GET_RE_DEBUG_FLAGS_DECL;
15721
15722     PERL_ARGS_ASSERT_REGNODE_GUTS;
15723
15724     assert(extra_size >= regarglen[op]);
15725
15726     if (SIZE_ONLY) {
15727         SIZE_ALIGN(RExC_size);
15728         RExC_size += 1 + extra_size;
15729         return(ret);
15730     }
15731     if (RExC_emit >= RExC_emit_bound)
15732         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15733                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
15734
15735     NODE_ALIGN_FILL(ret);
15736 #ifndef RE_TRACK_PATTERN_OFFSETS
15737     PERL_UNUSED_ARG(name);
15738 #else
15739     if (RExC_offsets) {         /* MJD */
15740         MJD_OFFSET_DEBUG(
15741               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15742               name, __LINE__,
15743               PL_reg_name[op],
15744               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15745                 ? "Overwriting end of array!\n" : "OK",
15746               (UV)(RExC_emit - RExC_emit_start),
15747               (UV)(RExC_parse - RExC_start),
15748               (UV)RExC_offsets[0]));
15749         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15750     }
15751 #endif
15752     return(ret);
15753 }
15754
15755 /*
15756 - reg_node - emit a node
15757 */
15758 STATIC regnode *                        /* Location. */
15759 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15760 {
15761     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
15762
15763     PERL_ARGS_ASSERT_REG_NODE;
15764
15765     assert(regarglen[op] == 0);
15766
15767     if (PASS2) {
15768         regnode *ptr = ret;
15769         FILL_ADVANCE_NODE(ptr, op);
15770         RExC_emit = ptr;
15771     }
15772     return(ret);
15773 }
15774
15775 /*
15776 - reganode - emit a node with an argument
15777 */
15778 STATIC regnode *                        /* Location. */
15779 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15780 {
15781     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
15782
15783     PERL_ARGS_ASSERT_REGANODE;
15784
15785     assert(regarglen[op] == 1);
15786
15787     if (PASS2) {
15788         regnode *ptr = ret;
15789         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15790         RExC_emit = ptr;
15791     }
15792     return(ret);
15793 }
15794
15795 STATIC regnode *
15796 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
15797 {
15798     /* emit a node with U32 and I32 arguments */
15799
15800     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
15801
15802     PERL_ARGS_ASSERT_REG2LANODE;
15803
15804     assert(regarglen[op] == 2);
15805
15806     if (PASS2) {
15807         regnode *ptr = ret;
15808         FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
15809         RExC_emit = ptr;
15810     }
15811     return(ret);
15812 }
15813
15814 /*
15815 - reguni - emit (if appropriate) a Unicode character
15816 */
15817 PERL_STATIC_INLINE STRLEN
15818 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15819 {
15820     PERL_ARGS_ASSERT_REGUNI;
15821
15822     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15823 }
15824
15825 /*
15826 - reginsert - insert an operator in front of already-emitted operand
15827 *
15828 * Means relocating the operand.
15829 */
15830 STATIC void
15831 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15832 {
15833     regnode *src;
15834     regnode *dst;
15835     regnode *place;
15836     const int offset = regarglen[(U8)op];
15837     const int size = NODE_STEP_REGNODE + offset;
15838     GET_RE_DEBUG_FLAGS_DECL;
15839
15840     PERL_ARGS_ASSERT_REGINSERT;
15841     PERL_UNUSED_CONTEXT;
15842     PERL_UNUSED_ARG(depth);
15843 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15844     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15845     if (SIZE_ONLY) {
15846         RExC_size += size;
15847         return;
15848     }
15849
15850     src = RExC_emit;
15851     RExC_emit += size;
15852     dst = RExC_emit;
15853     if (RExC_open_parens) {
15854         int paren;
15855         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15856         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15857             if ( RExC_open_parens[paren] >= opnd ) {
15858                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15859                 RExC_open_parens[paren] += size;
15860             } else {
15861                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15862             }
15863             if ( RExC_close_parens[paren] >= opnd ) {
15864                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15865                 RExC_close_parens[paren] += size;
15866             } else {
15867                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15868             }
15869         }
15870     }
15871
15872     while (src > opnd) {
15873         StructCopy(--src, --dst, regnode);
15874 #ifdef RE_TRACK_PATTERN_OFFSETS
15875         if (RExC_offsets) {     /* MJD 20010112 */
15876             MJD_OFFSET_DEBUG(
15877                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15878                   "reg_insert",
15879                   __LINE__,
15880                   PL_reg_name[op],
15881                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15882                     ? "Overwriting end of array!\n" : "OK",
15883                   (UV)(src - RExC_emit_start),
15884                   (UV)(dst - RExC_emit_start),
15885                   (UV)RExC_offsets[0]));
15886             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15887             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15888         }
15889 #endif
15890     }
15891
15892
15893     place = opnd;               /* Op node, where operand used to be. */
15894 #ifdef RE_TRACK_PATTERN_OFFSETS
15895     if (RExC_offsets) {         /* MJD */
15896         MJD_OFFSET_DEBUG(
15897               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15898               "reginsert",
15899               __LINE__,
15900               PL_reg_name[op],
15901               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15902               ? "Overwriting end of array!\n" : "OK",
15903               (UV)(place - RExC_emit_start),
15904               (UV)(RExC_parse - RExC_start),
15905               (UV)RExC_offsets[0]));
15906         Set_Node_Offset(place, RExC_parse);
15907         Set_Node_Length(place, 1);
15908     }
15909 #endif
15910     src = NEXTOPER(place);
15911     FILL_ADVANCE_NODE(place, op);
15912     Zero(src, offset, regnode);
15913 }
15914
15915 /*
15916 - regtail - set the next-pointer at the end of a node chain of p to val.
15917 - SEE ALSO: regtail_study
15918 */
15919 /* TODO: All three parms should be const */
15920 STATIC void
15921 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15922                 const regnode *val,U32 depth)
15923 {
15924     regnode *scan;
15925     GET_RE_DEBUG_FLAGS_DECL;
15926
15927     PERL_ARGS_ASSERT_REGTAIL;
15928 #ifndef DEBUGGING
15929     PERL_UNUSED_ARG(depth);
15930 #endif
15931
15932     if (SIZE_ONLY)
15933         return;
15934
15935     /* Find last node. */
15936     scan = p;
15937     for (;;) {
15938         regnode * const temp = regnext(scan);
15939         DEBUG_PARSE_r({
15940             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15941             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
15942             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15943                 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
15944                     (temp == NULL ? "->" : ""),
15945                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15946             );
15947         });
15948         if (temp == NULL)
15949             break;
15950         scan = temp;
15951     }
15952
15953     if (reg_off_by_arg[OP(scan)]) {
15954         ARG_SET(scan, val - scan);
15955     }
15956     else {
15957         NEXT_OFF(scan) = val - scan;
15958     }
15959 }
15960
15961 #ifdef DEBUGGING
15962 /*
15963 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15964 - Look for optimizable sequences at the same time.
15965 - currently only looks for EXACT chains.
15966
15967 This is experimental code. The idea is to use this routine to perform
15968 in place optimizations on branches and groups as they are constructed,
15969 with the long term intention of removing optimization from study_chunk so
15970 that it is purely analytical.
15971
15972 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15973 to control which is which.
15974
15975 */
15976 /* TODO: All four parms should be const */
15977
15978 STATIC U8
15979 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15980                       const regnode *val,U32 depth)
15981 {
15982     regnode *scan;
15983     U8 exact = PSEUDO;
15984 #ifdef EXPERIMENTAL_INPLACESCAN
15985     I32 min = 0;
15986 #endif
15987     GET_RE_DEBUG_FLAGS_DECL;
15988
15989     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15990
15991
15992     if (SIZE_ONLY)
15993         return exact;
15994
15995     /* Find last node. */
15996
15997     scan = p;
15998     for (;;) {
15999         regnode * const temp = regnext(scan);
16000 #ifdef EXPERIMENTAL_INPLACESCAN
16001         if (PL_regkind[OP(scan)] == EXACT) {
16002             bool unfolded_multi_char;   /* Unexamined in this routine */
16003             if (join_exact(pRExC_state, scan, &min,
16004                            &unfolded_multi_char, 1, val, depth+1))
16005                 return EXACT;
16006         }
16007 #endif
16008         if ( exact ) {
16009             switch (OP(scan)) {
16010                 case EXACT:
16011                 case EXACTL:
16012                 case EXACTF:
16013                 case EXACTFA_NO_TRIE:
16014                 case EXACTFA:
16015                 case EXACTFU:
16016                 case EXACTFLU8:
16017                 case EXACTFU_SS:
16018                 case EXACTFL:
16019                         if( exact == PSEUDO )
16020                             exact= OP(scan);
16021                         else if ( exact != OP(scan) )
16022                             exact= 0;
16023                 case NOTHING:
16024                     break;
16025                 default:
16026                     exact= 0;
16027             }
16028         }
16029         DEBUG_PARSE_r({
16030             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
16031             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16032             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
16033                 SvPV_nolen_const(RExC_mysv),
16034                 REG_NODE_NUM(scan),
16035                 PL_reg_name[exact]);
16036         });
16037         if (temp == NULL)
16038             break;
16039         scan = temp;
16040     }
16041     DEBUG_PARSE_r({
16042         DEBUG_PARSE_MSG("");
16043         regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
16044         PerlIO_printf(Perl_debug_log,
16045                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
16046                       SvPV_nolen_const(RExC_mysv),
16047                       (IV)REG_NODE_NUM(val),
16048                       (IV)(val - scan)
16049         );
16050     });
16051     if (reg_off_by_arg[OP(scan)]) {
16052         ARG_SET(scan, val - scan);
16053     }
16054     else {
16055         NEXT_OFF(scan) = val - scan;
16056     }
16057
16058     return exact;
16059 }
16060 #endif
16061
16062 /*
16063  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16064  */
16065 #ifdef DEBUGGING
16066
16067 static void
16068 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16069 {
16070     int bit;
16071     int set=0;
16072
16073     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16074
16075     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16076         if (flags & (1<<bit)) {
16077             if (!set++ && lead)
16078                 PerlIO_printf(Perl_debug_log, "%s",lead);
16079             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16080         }
16081     }
16082     if (lead)  {
16083         if (set)
16084             PerlIO_printf(Perl_debug_log, "\n");
16085         else
16086             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16087     }
16088 }
16089
16090 static void
16091 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16092 {
16093     int bit;
16094     int set=0;
16095     regex_charset cs;
16096
16097     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16098
16099     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16100         if (flags & (1<<bit)) {
16101             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
16102                 continue;
16103             }
16104             if (!set++ && lead)
16105                 PerlIO_printf(Perl_debug_log, "%s",lead);
16106             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16107         }
16108     }
16109     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16110             if (!set++ && lead) {
16111                 PerlIO_printf(Perl_debug_log, "%s",lead);
16112             }
16113             switch (cs) {
16114                 case REGEX_UNICODE_CHARSET:
16115                     PerlIO_printf(Perl_debug_log, "UNICODE");
16116                     break;
16117                 case REGEX_LOCALE_CHARSET:
16118                     PerlIO_printf(Perl_debug_log, "LOCALE");
16119                     break;
16120                 case REGEX_ASCII_RESTRICTED_CHARSET:
16121                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16122                     break;
16123                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16124                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16125                     break;
16126                 default:
16127                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16128                     break;
16129             }
16130     }
16131     if (lead)  {
16132         if (set)
16133             PerlIO_printf(Perl_debug_log, "\n");
16134         else
16135             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16136     }
16137 }
16138 #endif
16139
16140 void
16141 Perl_regdump(pTHX_ const regexp *r)
16142 {
16143 #ifdef DEBUGGING
16144     SV * const sv = sv_newmortal();
16145     SV *dsv= sv_newmortal();
16146     RXi_GET_DECL(r,ri);
16147     GET_RE_DEBUG_FLAGS_DECL;
16148
16149     PERL_ARGS_ASSERT_REGDUMP;
16150
16151     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16152
16153     /* Header fields of interest. */
16154     if (r->anchored_substr) {
16155         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16156             RE_SV_DUMPLEN(r->anchored_substr), 30);
16157         PerlIO_printf(Perl_debug_log,
16158                       "anchored %s%s at %"IVdf" ",
16159                       s, RE_SV_TAIL(r->anchored_substr),
16160                       (IV)r->anchored_offset);
16161     } else if (r->anchored_utf8) {
16162         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16163             RE_SV_DUMPLEN(r->anchored_utf8), 30);
16164         PerlIO_printf(Perl_debug_log,
16165                       "anchored utf8 %s%s at %"IVdf" ",
16166                       s, RE_SV_TAIL(r->anchored_utf8),
16167                       (IV)r->anchored_offset);
16168     }
16169     if (r->float_substr) {
16170         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16171             RE_SV_DUMPLEN(r->float_substr), 30);
16172         PerlIO_printf(Perl_debug_log,
16173                       "floating %s%s at %"IVdf"..%"UVuf" ",
16174                       s, RE_SV_TAIL(r->float_substr),
16175                       (IV)r->float_min_offset, (UV)r->float_max_offset);
16176     } else if (r->float_utf8) {
16177         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16178             RE_SV_DUMPLEN(r->float_utf8), 30);
16179         PerlIO_printf(Perl_debug_log,
16180                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16181                       s, RE_SV_TAIL(r->float_utf8),
16182                       (IV)r->float_min_offset, (UV)r->float_max_offset);
16183     }
16184     if (r->check_substr || r->check_utf8)
16185         PerlIO_printf(Perl_debug_log,
16186                       (const char *)
16187                       (r->check_substr == r->float_substr
16188                        && r->check_utf8 == r->float_utf8
16189                        ? "(checking floating" : "(checking anchored"));
16190     if (r->intflags & PREGf_NOSCAN)
16191         PerlIO_printf(Perl_debug_log, " noscan");
16192     if (r->extflags & RXf_CHECK_ALL)
16193         PerlIO_printf(Perl_debug_log, " isall");
16194     if (r->check_substr || r->check_utf8)
16195         PerlIO_printf(Perl_debug_log, ") ");
16196
16197     if (ri->regstclass) {
16198         regprop(r, sv, ri->regstclass, NULL, NULL);
16199         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16200     }
16201     if (r->intflags & PREGf_ANCH) {
16202         PerlIO_printf(Perl_debug_log, "anchored");
16203         if (r->intflags & PREGf_ANCH_MBOL)
16204             PerlIO_printf(Perl_debug_log, "(MBOL)");
16205         if (r->intflags & PREGf_ANCH_SBOL)
16206             PerlIO_printf(Perl_debug_log, "(SBOL)");
16207         if (r->intflags & PREGf_ANCH_GPOS)
16208             PerlIO_printf(Perl_debug_log, "(GPOS)");
16209         PerlIO_putc(Perl_debug_log, ' ');
16210     }
16211     if (r->intflags & PREGf_GPOS_SEEN)
16212         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16213     if (r->intflags & PREGf_SKIP)
16214         PerlIO_printf(Perl_debug_log, "plus ");
16215     if (r->intflags & PREGf_IMPLICIT)
16216         PerlIO_printf(Perl_debug_log, "implicit ");
16217     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16218     if (r->extflags & RXf_EVAL_SEEN)
16219         PerlIO_printf(Perl_debug_log, "with eval ");
16220     PerlIO_printf(Perl_debug_log, "\n");
16221     DEBUG_FLAGS_r({
16222         regdump_extflags("r->extflags: ",r->extflags);
16223         regdump_intflags("r->intflags: ",r->intflags);
16224     });
16225 #else
16226     PERL_ARGS_ASSERT_REGDUMP;
16227     PERL_UNUSED_CONTEXT;
16228     PERL_UNUSED_ARG(r);
16229 #endif  /* DEBUGGING */
16230 }
16231
16232 /*
16233 - regprop - printable representation of opcode, with run time support
16234 */
16235
16236 void
16237 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16238 {
16239 #ifdef DEBUGGING
16240     int k;
16241
16242     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16243     static const char * const anyofs[] = {
16244 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16245     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
16246     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
16247     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
16248     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
16249     || _CC_VERTSPACE != 16
16250   #error Need to adjust order of anyofs[]
16251 #endif
16252         "\\w",
16253         "\\W",
16254         "\\d",
16255         "\\D",
16256         "[:alpha:]",
16257         "[:^alpha:]",
16258         "[:lower:]",
16259         "[:^lower:]",
16260         "[:upper:]",
16261         "[:^upper:]",
16262         "[:punct:]",
16263         "[:^punct:]",
16264         "[:print:]",
16265         "[:^print:]",
16266         "[:alnum:]",
16267         "[:^alnum:]",
16268         "[:graph:]",
16269         "[:^graph:]",
16270         "[:cased:]",
16271         "[:^cased:]",
16272         "\\s",
16273         "\\S",
16274         "[:blank:]",
16275         "[:^blank:]",
16276         "[:xdigit:]",
16277         "[:^xdigit:]",
16278         "[:space:]",
16279         "[:^space:]",
16280         "[:cntrl:]",
16281         "[:^cntrl:]",
16282         "[:ascii:]",
16283         "[:^ascii:]",
16284         "\\v",
16285         "\\V"
16286     };
16287     RXi_GET_DECL(prog,progi);
16288     GET_RE_DEBUG_FLAGS_DECL;
16289
16290     PERL_ARGS_ASSERT_REGPROP;
16291
16292     sv_setpvn(sv, "", 0);
16293
16294     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
16295         /* It would be nice to FAIL() here, but this may be called from
16296            regexec.c, and it would be hard to supply pRExC_state. */
16297         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16298                                               (int)OP(o), (int)REGNODE_MAX);
16299     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16300
16301     k = PL_regkind[OP(o)];
16302
16303     if (k == EXACT) {
16304         sv_catpvs(sv, " ");
16305         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16306          * is a crude hack but it may be the best for now since
16307          * we have no flag "this EXACTish node was UTF-8"
16308          * --jhi */
16309         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16310                   PERL_PV_ESCAPE_UNI_DETECT |
16311                   PERL_PV_ESCAPE_NONASCII   |
16312                   PERL_PV_PRETTY_ELLIPSES   |
16313                   PERL_PV_PRETTY_LTGT       |
16314                   PERL_PV_PRETTY_NOCLEAR
16315                   );
16316     } else if (k == TRIE) {
16317         /* print the details of the trie in dumpuntil instead, as
16318          * progi->data isn't available here */
16319         const char op = OP(o);
16320         const U32 n = ARG(o);
16321         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16322                (reg_ac_data *)progi->data->data[n] :
16323                NULL;
16324         const reg_trie_data * const trie
16325             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16326
16327         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16328         DEBUG_TRIE_COMPILE_r(
16329           Perl_sv_catpvf(aTHX_ sv,
16330             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16331             (UV)trie->startstate,
16332             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16333             (UV)trie->wordcount,
16334             (UV)trie->minlen,
16335             (UV)trie->maxlen,
16336             (UV)TRIE_CHARCOUNT(trie),
16337             (UV)trie->uniquecharcount
16338           );
16339         );
16340         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16341             sv_catpvs(sv, "[");
16342             (void) put_charclass_bitmap_innards(sv,
16343                                                 (IS_ANYOF_TRIE(op))
16344                                                  ? ANYOF_BITMAP(o)
16345                                                  : TRIE_BITMAP(trie),
16346                                                 NULL);
16347             sv_catpvs(sv, "]");
16348         }
16349
16350     } else if (k == CURLY) {
16351         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16352             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16353         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16354     }
16355     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
16356         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16357     else if (k == REF || k == OPEN || k == CLOSE
16358              || k == GROUPP || OP(o)==ACCEPT)
16359     {
16360         AV *name_list= NULL;
16361         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
16362         if ( RXp_PAREN_NAMES(prog) ) {
16363             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16364         } else if ( pRExC_state ) {
16365             name_list= RExC_paren_name_list;
16366         }
16367         if (name_list) {
16368             if ( k != REF || (OP(o) < NREF)) {
16369                 SV **name= av_fetch(name_list, ARG(o), 0 );
16370                 if (name)
16371                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16372             }
16373             else {
16374                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16375                 I32 *nums=(I32*)SvPVX(sv_dat);
16376                 SV **name= av_fetch(name_list, nums[0], 0 );
16377                 I32 n;
16378                 if (name) {
16379                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
16380                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16381                                     (n ? "," : ""), (IV)nums[n]);
16382                     }
16383                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16384                 }
16385             }
16386         }
16387         if ( k == REF && reginfo) {
16388             U32 n = ARG(o);  /* which paren pair */
16389             I32 ln = prog->offs[n].start;
16390             if (prog->lastparen < n || ln == -1)
16391                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16392             else if (ln == prog->offs[n].end)
16393                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16394             else {
16395                 const char *s = reginfo->strbeg + ln;
16396                 Perl_sv_catpvf(aTHX_ sv, ": ");
16397                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16398                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16399             }
16400         }
16401     } else if (k == GOSUB) {
16402         AV *name_list= NULL;
16403         if ( RXp_PAREN_NAMES(prog) ) {
16404             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16405         } else if ( pRExC_state ) {
16406             name_list= RExC_paren_name_list;
16407         }
16408
16409         /* Paren and offset */
16410         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16411         if (name_list) {
16412             SV **name= av_fetch(name_list, ARG(o), 0 );
16413             if (name)
16414                 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16415         }
16416     }
16417     else if (k == VERB) {
16418         if (!o->flags)
16419             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16420                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16421     } else if (k == LOGICAL)
16422         /* 2: embedded, otherwise 1 */
16423         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16424     else if (k == ANYOF) {
16425         const U8 flags = ANYOF_FLAGS(o);
16426         int do_sep = 0;
16427         SV* bitmap_invlist;  /* Will hold what the bit map contains */
16428
16429
16430         if (OP(o) == ANYOFL)
16431             sv_catpvs(sv, "{loc}");
16432         if (flags & ANYOF_LOC_FOLD)
16433             sv_catpvs(sv, "{i}");
16434         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16435         if (flags & ANYOF_INVERT)
16436             sv_catpvs(sv, "^");
16437
16438         /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16439          * */
16440         do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16441                                                             &bitmap_invlist);
16442
16443         /* output any special charclass tests (used entirely under use
16444          * locale) * */
16445         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16446             int i;
16447             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16448                 if (ANYOF_POSIXL_TEST(o,i)) {
16449                     sv_catpv(sv, anyofs[i]);
16450                     do_sep = 1;
16451                 }
16452             }
16453         }
16454
16455         if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16456                       |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16457                       |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16458                       |ANYOF_LOC_FOLD)))
16459         {
16460             if (do_sep) {
16461                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16462                 if (flags & ANYOF_INVERT)
16463                     /*make sure the invert info is in each */
16464                     sv_catpvs(sv, "^");
16465             }
16466
16467             if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16468                 sv_catpvs(sv, "{non-utf8-latin1-all}");
16469             }
16470
16471             /* output information about the unicode matching */
16472             if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16473                 sv_catpvs(sv, "{above_bitmap_all}");
16474             else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16475                 SV *lv; /* Set if there is something outside the bit map. */
16476                 bool byte_output = FALSE;   /* If something in the bitmap has
16477                                                been output */
16478                 SV *only_utf8_locale;
16479
16480                 /* Get the stuff that wasn't in the bitmap.  'bitmap_invlist'
16481                  * is used to guarantee that nothing in the bitmap gets
16482                  * returned */
16483                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16484                                                     &lv, &only_utf8_locale,
16485                                                     bitmap_invlist);
16486                 if (lv && lv != &PL_sv_undef) {
16487                     char *s = savesvpv(lv);
16488                     char * const origs = s;
16489
16490                     while (*s && *s != '\n')
16491                         s++;
16492
16493                     if (*s == '\n') {
16494                         const char * const t = ++s;
16495
16496                         if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16497                             sv_catpvs(sv, "{outside bitmap}");
16498                         }
16499                         else {
16500                             sv_catpvs(sv, "{utf8}");
16501                         }
16502
16503                         if (byte_output) {
16504                             sv_catpvs(sv, " ");
16505                         }
16506
16507                         while (*s) {
16508                             if (*s == '\n') {
16509
16510                                 /* Truncate very long output */
16511                                 if (s - origs > 256) {
16512                                     Perl_sv_catpvf(aTHX_ sv,
16513                                                 "%.*s...",
16514                                                 (int) (s - origs - 1),
16515                                                 t);
16516                                     goto out_dump;
16517                                 }
16518                                 *s = ' ';
16519                             }
16520                             else if (*s == '\t') {
16521                                 *s = '-';
16522                             }
16523                             s++;
16524                         }
16525                         if (s[-1] == ' ')
16526                             s[-1] = 0;
16527
16528                         sv_catpv(sv, t);
16529                     }
16530
16531                 out_dump:
16532
16533                     Safefree(origs);
16534                     SvREFCNT_dec_NN(lv);
16535                 }
16536
16537                 if ((flags & ANYOF_LOC_FOLD)
16538                      && only_utf8_locale
16539                      && only_utf8_locale != &PL_sv_undef)
16540                 {
16541                     UV start, end;
16542                     int max_entries = 256;
16543
16544                     sv_catpvs(sv, "{utf8 locale}");
16545                     invlist_iterinit(only_utf8_locale);
16546                     while (invlist_iternext(only_utf8_locale,
16547                                             &start, &end)) {
16548                         put_range(sv, start, end, FALSE);
16549                         max_entries --;
16550                         if (max_entries < 0) {
16551                             sv_catpvs(sv, "...");
16552                             break;
16553                         }
16554                     }
16555                     invlist_iterfinish(only_utf8_locale);
16556                 }
16557             }
16558         }
16559         SvREFCNT_dec(bitmap_invlist);
16560
16561
16562         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16563     }
16564     else if (k == POSIXD || k == NPOSIXD) {
16565         U8 index = FLAGS(o) * 2;
16566         if (index < C_ARRAY_LENGTH(anyofs)) {
16567             if (*anyofs[index] != '[')  {
16568                 sv_catpv(sv, "[");
16569             }
16570             sv_catpv(sv, anyofs[index]);
16571             if (*anyofs[index] != '[')  {
16572                 sv_catpv(sv, "]");
16573             }
16574         }
16575         else {
16576             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16577         }
16578     }
16579     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16580         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16581     else if (OP(o) == SBOL)
16582         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16583 #else
16584     PERL_UNUSED_CONTEXT;
16585     PERL_UNUSED_ARG(sv);
16586     PERL_UNUSED_ARG(o);
16587     PERL_UNUSED_ARG(prog);
16588     PERL_UNUSED_ARG(reginfo);
16589     PERL_UNUSED_ARG(pRExC_state);
16590 #endif  /* DEBUGGING */
16591 }
16592
16593
16594
16595 SV *
16596 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16597 {                               /* Assume that RE_INTUIT is set */
16598     struct regexp *const prog = ReANY(r);
16599     GET_RE_DEBUG_FLAGS_DECL;
16600
16601     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16602     PERL_UNUSED_CONTEXT;
16603
16604     DEBUG_COMPILE_r(
16605         {
16606             const char * const s = SvPV_nolen_const(prog->check_substr
16607                       ? prog->check_substr : prog->check_utf8);
16608
16609             if (!PL_colorset) reginitcolors();
16610             PerlIO_printf(Perl_debug_log,
16611                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16612                       PL_colors[4],
16613                       prog->check_substr ? "" : "utf8 ",
16614                       PL_colors[5],PL_colors[0],
16615                       s,
16616                       PL_colors[1],
16617                       (strlen(s) > 60 ? "..." : ""));
16618         } );
16619
16620     return prog->check_substr ? prog->check_substr : prog->check_utf8;
16621 }
16622
16623 /*
16624    pregfree()
16625
16626    handles refcounting and freeing the perl core regexp structure. When
16627    it is necessary to actually free the structure the first thing it
16628    does is call the 'free' method of the regexp_engine associated to
16629    the regexp, allowing the handling of the void *pprivate; member
16630    first. (This routine is not overridable by extensions, which is why
16631    the extensions free is called first.)
16632
16633    See regdupe and regdupe_internal if you change anything here.
16634 */
16635 #ifndef PERL_IN_XSUB_RE
16636 void
16637 Perl_pregfree(pTHX_ REGEXP *r)
16638 {
16639     SvREFCNT_dec(r);
16640 }
16641
16642 void
16643 Perl_pregfree2(pTHX_ REGEXP *rx)
16644 {
16645     struct regexp *const r = ReANY(rx);
16646     GET_RE_DEBUG_FLAGS_DECL;
16647
16648     PERL_ARGS_ASSERT_PREGFREE2;
16649
16650     if (r->mother_re) {
16651         ReREFCNT_dec(r->mother_re);
16652     } else {
16653         CALLREGFREE_PVT(rx); /* free the private data */
16654         SvREFCNT_dec(RXp_PAREN_NAMES(r));
16655         Safefree(r->xpv_len_u.xpvlenu_pv);
16656     }
16657     if (r->substrs) {
16658         SvREFCNT_dec(r->anchored_substr);
16659         SvREFCNT_dec(r->anchored_utf8);
16660         SvREFCNT_dec(r->float_substr);
16661         SvREFCNT_dec(r->float_utf8);
16662         Safefree(r->substrs);
16663     }
16664     RX_MATCH_COPY_FREE(rx);
16665 #ifdef PERL_ANY_COW
16666     SvREFCNT_dec(r->saved_copy);
16667 #endif
16668     Safefree(r->offs);
16669     SvREFCNT_dec(r->qr_anoncv);
16670     rx->sv_u.svu_rx = 0;
16671 }
16672
16673 /*  reg_temp_copy()
16674
16675     This is a hacky workaround to the structural issue of match results
16676     being stored in the regexp structure which is in turn stored in
16677     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16678     could be PL_curpm in multiple contexts, and could require multiple
16679     result sets being associated with the pattern simultaneously, such
16680     as when doing a recursive match with (??{$qr})
16681
16682     The solution is to make a lightweight copy of the regexp structure
16683     when a qr// is returned from the code executed by (??{$qr}) this
16684     lightweight copy doesn't actually own any of its data except for
16685     the starp/end and the actual regexp structure itself.
16686
16687 */
16688
16689
16690 REGEXP *
16691 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16692 {
16693     struct regexp *ret;
16694     struct regexp *const r = ReANY(rx);
16695     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16696
16697     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16698
16699     if (!ret_x)
16700         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16701     else {
16702         SvOK_off((SV *)ret_x);
16703         if (islv) {
16704             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16705                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16706                made both spots point to the same regexp body.) */
16707             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16708             assert(!SvPVX(ret_x));
16709             ret_x->sv_u.svu_rx = temp->sv_any;
16710             temp->sv_any = NULL;
16711             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16712             SvREFCNT_dec_NN(temp);
16713             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16714                ing below will not set it. */
16715             SvCUR_set(ret_x, SvCUR(rx));
16716         }
16717     }
16718     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16719        sv_force_normal(sv) is called.  */
16720     SvFAKE_on(ret_x);
16721     ret = ReANY(ret_x);
16722
16723     SvFLAGS(ret_x) |= SvUTF8(rx);
16724     /* We share the same string buffer as the original regexp, on which we
16725        hold a reference count, incremented when mother_re is set below.
16726        The string pointer is copied here, being part of the regexp struct.
16727      */
16728     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16729            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16730     if (r->offs) {
16731         const I32 npar = r->nparens+1;
16732         Newx(ret->offs, npar, regexp_paren_pair);
16733         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16734     }
16735     if (r->substrs) {
16736         Newx(ret->substrs, 1, struct reg_substr_data);
16737         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16738
16739         SvREFCNT_inc_void(ret->anchored_substr);
16740         SvREFCNT_inc_void(ret->anchored_utf8);
16741         SvREFCNT_inc_void(ret->float_substr);
16742         SvREFCNT_inc_void(ret->float_utf8);
16743
16744         /* check_substr and check_utf8, if non-NULL, point to either their
16745            anchored or float namesakes, and don't hold a second reference.  */
16746     }
16747     RX_MATCH_COPIED_off(ret_x);
16748 #ifdef PERL_ANY_COW
16749     ret->saved_copy = NULL;
16750 #endif
16751     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16752     SvREFCNT_inc_void(ret->qr_anoncv);
16753
16754     return ret_x;
16755 }
16756 #endif
16757
16758 /* regfree_internal()
16759
16760    Free the private data in a regexp. This is overloadable by
16761    extensions. Perl takes care of the regexp structure in pregfree(),
16762    this covers the *pprivate pointer which technically perl doesn't
16763    know about, however of course we have to handle the
16764    regexp_internal structure when no extension is in use.
16765
16766    Note this is called before freeing anything in the regexp
16767    structure.
16768  */
16769
16770 void
16771 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16772 {
16773     struct regexp *const r = ReANY(rx);
16774     RXi_GET_DECL(r,ri);
16775     GET_RE_DEBUG_FLAGS_DECL;
16776
16777     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16778
16779     DEBUG_COMPILE_r({
16780         if (!PL_colorset)
16781             reginitcolors();
16782         {
16783             SV *dsv= sv_newmortal();
16784             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16785                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16786             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16787                 PL_colors[4],PL_colors[5],s);
16788         }
16789     });
16790 #ifdef RE_TRACK_PATTERN_OFFSETS
16791     if (ri->u.offsets)
16792         Safefree(ri->u.offsets);             /* 20010421 MJD */
16793 #endif
16794     if (ri->code_blocks) {
16795         int n;
16796         for (n = 0; n < ri->num_code_blocks; n++)
16797             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16798         Safefree(ri->code_blocks);
16799     }
16800
16801     if (ri->data) {
16802         int n = ri->data->count;
16803
16804         while (--n >= 0) {
16805           /* If you add a ->what type here, update the comment in regcomp.h */
16806             switch (ri->data->what[n]) {
16807             case 'a':
16808             case 'r':
16809             case 's':
16810             case 'S':
16811             case 'u':
16812                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16813                 break;
16814             case 'f':
16815                 Safefree(ri->data->data[n]);
16816                 break;
16817             case 'l':
16818             case 'L':
16819                 break;
16820             case 'T':
16821                 { /* Aho Corasick add-on structure for a trie node.
16822                      Used in stclass optimization only */
16823                     U32 refcount;
16824                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16825 #ifdef USE_ITHREADS
16826                     dVAR;
16827 #endif
16828                     OP_REFCNT_LOCK;
16829                     refcount = --aho->refcount;
16830                     OP_REFCNT_UNLOCK;
16831                     if ( !refcount ) {
16832                         PerlMemShared_free(aho->states);
16833                         PerlMemShared_free(aho->fail);
16834                          /* do this last!!!! */
16835                         PerlMemShared_free(ri->data->data[n]);
16836                         /* we should only ever get called once, so
16837                          * assert as much, and also guard the free
16838                          * which /might/ happen twice. At the least
16839                          * it will make code anlyzers happy and it
16840                          * doesn't cost much. - Yves */
16841                         assert(ri->regstclass);
16842                         if (ri->regstclass) {
16843                             PerlMemShared_free(ri->regstclass);
16844                             ri->regstclass = 0;
16845                         }
16846                     }
16847                 }
16848                 break;
16849             case 't':
16850                 {
16851                     /* trie structure. */
16852                     U32 refcount;
16853                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16854 #ifdef USE_ITHREADS
16855                     dVAR;
16856 #endif
16857                     OP_REFCNT_LOCK;
16858                     refcount = --trie->refcount;
16859                     OP_REFCNT_UNLOCK;
16860                     if ( !refcount ) {
16861                         PerlMemShared_free(trie->charmap);
16862                         PerlMemShared_free(trie->states);
16863                         PerlMemShared_free(trie->trans);
16864                         if (trie->bitmap)
16865                             PerlMemShared_free(trie->bitmap);
16866                         if (trie->jump)
16867                             PerlMemShared_free(trie->jump);
16868                         PerlMemShared_free(trie->wordinfo);
16869                         /* do this last!!!! */
16870                         PerlMemShared_free(ri->data->data[n]);
16871                     }
16872                 }
16873                 break;
16874             default:
16875                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16876                                                     ri->data->what[n]);
16877             }
16878         }
16879         Safefree(ri->data->what);
16880         Safefree(ri->data);
16881     }
16882
16883     Safefree(ri);
16884 }
16885
16886 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16887 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16888 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16889
16890 /*
16891    re_dup - duplicate a regexp.
16892
16893    This routine is expected to clone a given regexp structure. It is only
16894    compiled under USE_ITHREADS.
16895
16896    After all of the core data stored in struct regexp is duplicated
16897    the regexp_engine.dupe method is used to copy any private data
16898    stored in the *pprivate pointer. This allows extensions to handle
16899    any duplication it needs to do.
16900
16901    See pregfree() and regfree_internal() if you change anything here.
16902 */
16903 #if defined(USE_ITHREADS)
16904 #ifndef PERL_IN_XSUB_RE
16905 void
16906 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16907 {
16908     dVAR;
16909     I32 npar;
16910     const struct regexp *r = ReANY(sstr);
16911     struct regexp *ret = ReANY(dstr);
16912
16913     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16914
16915     npar = r->nparens+1;
16916     Newx(ret->offs, npar, regexp_paren_pair);
16917     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16918
16919     if (ret->substrs) {
16920         /* Do it this way to avoid reading from *r after the StructCopy().
16921            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16922            cache, it doesn't matter.  */
16923         const bool anchored = r->check_substr
16924             ? r->check_substr == r->anchored_substr
16925             : r->check_utf8 == r->anchored_utf8;
16926         Newx(ret->substrs, 1, struct reg_substr_data);
16927         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16928
16929         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16930         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16931         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16932         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16933
16934         /* check_substr and check_utf8, if non-NULL, point to either their
16935            anchored or float namesakes, and don't hold a second reference.  */
16936
16937         if (ret->check_substr) {
16938             if (anchored) {
16939                 assert(r->check_utf8 == r->anchored_utf8);
16940                 ret->check_substr = ret->anchored_substr;
16941                 ret->check_utf8 = ret->anchored_utf8;
16942             } else {
16943                 assert(r->check_substr == r->float_substr);
16944                 assert(r->check_utf8 == r->float_utf8);
16945                 ret->check_substr = ret->float_substr;
16946                 ret->check_utf8 = ret->float_utf8;
16947             }
16948         } else if (ret->check_utf8) {
16949             if (anchored) {
16950                 ret->check_utf8 = ret->anchored_utf8;
16951             } else {
16952                 ret->check_utf8 = ret->float_utf8;
16953             }
16954         }
16955     }
16956
16957     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16958     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16959
16960     if (ret->pprivate)
16961         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16962
16963     if (RX_MATCH_COPIED(dstr))
16964         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16965     else
16966         ret->subbeg = NULL;
16967 #ifdef PERL_ANY_COW
16968     ret->saved_copy = NULL;
16969 #endif
16970
16971     /* Whether mother_re be set or no, we need to copy the string.  We
16972        cannot refrain from copying it when the storage points directly to
16973        our mother regexp, because that's
16974                1: a buffer in a different thread
16975                2: something we no longer hold a reference on
16976                so we need to copy it locally.  */
16977     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16978     ret->mother_re   = NULL;
16979 }
16980 #endif /* PERL_IN_XSUB_RE */
16981
16982 /*
16983    regdupe_internal()
16984
16985    This is the internal complement to regdupe() which is used to copy
16986    the structure pointed to by the *pprivate pointer in the regexp.
16987    This is the core version of the extension overridable cloning hook.
16988    The regexp structure being duplicated will be copied by perl prior
16989    to this and will be provided as the regexp *r argument, however
16990    with the /old/ structures pprivate pointer value. Thus this routine
16991    may override any copying normally done by perl.
16992
16993    It returns a pointer to the new regexp_internal structure.
16994 */
16995
16996 void *
16997 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16998 {
16999     dVAR;
17000     struct regexp *const r = ReANY(rx);
17001     regexp_internal *reti;
17002     int len;
17003     RXi_GET_DECL(r,ri);
17004
17005     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
17006
17007     len = ProgLen(ri);
17008
17009     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
17010           char, regexp_internal);
17011     Copy(ri->program, reti->program, len+1, regnode);
17012
17013     reti->num_code_blocks = ri->num_code_blocks;
17014     if (ri->code_blocks) {
17015         int n;
17016         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
17017                 struct reg_code_block);
17018         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
17019                 struct reg_code_block);
17020         for (n = 0; n < ri->num_code_blocks; n++)
17021              reti->code_blocks[n].src_regex = (REGEXP*)
17022                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
17023     }
17024     else
17025         reti->code_blocks = NULL;
17026
17027     reti->regstclass = NULL;
17028
17029     if (ri->data) {
17030         struct reg_data *d;
17031         const int count = ri->data->count;
17032         int i;
17033
17034         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
17035                 char, struct reg_data);
17036         Newx(d->what, count, U8);
17037
17038         d->count = count;
17039         for (i = 0; i < count; i++) {
17040             d->what[i] = ri->data->what[i];
17041             switch (d->what[i]) {
17042                 /* see also regcomp.h and regfree_internal() */
17043             case 'a': /* actually an AV, but the dup function is identical.  */
17044             case 'r':
17045             case 's':
17046             case 'S':
17047             case 'u': /* actually an HV, but the dup function is identical.  */
17048                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17049                 break;
17050             case 'f':
17051                 /* This is cheating. */
17052                 Newx(d->data[i], 1, regnode_ssc);
17053                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17054                 reti->regstclass = (regnode*)d->data[i];
17055                 break;
17056             case 'T':
17057                 /* Trie stclasses are readonly and can thus be shared
17058                  * without duplication. We free the stclass in pregfree
17059                  * when the corresponding reg_ac_data struct is freed.
17060                  */
17061                 reti->regstclass= ri->regstclass;
17062                 /* FALLTHROUGH */
17063             case 't':
17064                 OP_REFCNT_LOCK;
17065                 ((reg_trie_data*)ri->data->data[i])->refcount++;
17066                 OP_REFCNT_UNLOCK;
17067                 /* FALLTHROUGH */
17068             case 'l':
17069             case 'L':
17070                 d->data[i] = ri->data->data[i];
17071                 break;
17072             default:
17073                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17074                                                            ri->data->what[i]);
17075             }
17076         }
17077
17078         reti->data = d;
17079     }
17080     else
17081         reti->data = NULL;
17082
17083     reti->name_list_idx = ri->name_list_idx;
17084
17085 #ifdef RE_TRACK_PATTERN_OFFSETS
17086     if (ri->u.offsets) {
17087         Newx(reti->u.offsets, 2*len+1, U32);
17088         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17089     }
17090 #else
17091     SetProgLen(reti,len);
17092 #endif
17093
17094     return (void*)reti;
17095 }
17096
17097 #endif    /* USE_ITHREADS */
17098
17099 #ifndef PERL_IN_XSUB_RE
17100
17101 /*
17102  - regnext - dig the "next" pointer out of a node
17103  */
17104 regnode *
17105 Perl_regnext(pTHX_ regnode *p)
17106 {
17107     I32 offset;
17108
17109     if (!p)
17110         return(NULL);
17111
17112     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
17113         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17114                                                 (int)OP(p), (int)REGNODE_MAX);
17115     }
17116
17117     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17118     if (offset == 0)
17119         return(NULL);
17120
17121     return(p+offset);
17122 }
17123 #endif
17124
17125 STATIC void
17126 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17127 {
17128     va_list args;
17129     STRLEN l1 = strlen(pat1);
17130     STRLEN l2 = strlen(pat2);
17131     char buf[512];
17132     SV *msv;
17133     const char *message;
17134
17135     PERL_ARGS_ASSERT_RE_CROAK2;
17136
17137     if (l1 > 510)
17138         l1 = 510;
17139     if (l1 + l2 > 510)
17140         l2 = 510 - l1;
17141     Copy(pat1, buf, l1 , char);
17142     Copy(pat2, buf + l1, l2 , char);
17143     buf[l1 + l2] = '\n';
17144     buf[l1 + l2 + 1] = '\0';
17145     va_start(args, pat2);
17146     msv = vmess(buf, &args);
17147     va_end(args);
17148     message = SvPV_const(msv,l1);
17149     if (l1 > 512)
17150         l1 = 512;
17151     Copy(message, buf, l1 , char);
17152     /* l1-1 to avoid \n */
17153     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17154 }
17155
17156 #ifdef DEBUGGING
17157 /* Certain characters are output as a sequence with the first being a
17158  * backslash. */
17159 #define isBACKSLASHED_PUNCT(c)                                              \
17160                     ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
17161
17162 STATIC void
17163 S_put_code_point(pTHX_ SV *sv, UV c)
17164 {
17165     PERL_ARGS_ASSERT_PUT_CODE_POINT;
17166
17167     if (c > 255) {
17168         Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17169     }
17170     else if (isPRINT(c)) {
17171         const char string = (char) c;
17172         if (isBACKSLASHED_PUNCT(c))
17173             sv_catpvs(sv, "\\");
17174         sv_catpvn(sv, &string, 1);
17175     }
17176     else {
17177         const char * const mnemonic = cntrl_to_mnemonic((char) c);
17178         if (mnemonic) {
17179             Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17180         }
17181         else {
17182             Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17183         }
17184     }
17185 }
17186
17187 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17188
17189 STATIC void
17190 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17191 {
17192     /* Appends to 'sv' a displayable version of the range of code points from
17193      * 'start' to 'end'.  It assumes that only ASCII printables are displayable
17194      * as-is (though some of these will be escaped by put_code_point()). */
17195
17196     const unsigned int min_range_count = 3;
17197
17198     assert(start <= end);
17199
17200     PERL_ARGS_ASSERT_PUT_RANGE;
17201
17202     while (start <= end) {
17203         UV this_end;
17204         const char * format;
17205
17206         if (end - start < min_range_count) {
17207
17208             /* Individual chars in short ranges */
17209             for (; start <= end; start++) {
17210                 put_code_point(sv, start);
17211             }
17212             break;
17213         }
17214
17215         /* If permitted by the input options, and there is a possibility that
17216          * this range contains a printable literal, look to see if there is
17217          * one.  */
17218         if (allow_literals && start <= MAX_PRINT_A) {
17219
17220             /* If the range begin isn't an ASCII printable, effectively split
17221              * the range into two parts:
17222              *  1) the portion before the first such printable,
17223              *  2) the rest
17224              * and output them separately. */
17225             if (! isPRINT_A(start)) {
17226                 UV temp_end = start + 1;
17227
17228                 /* There is no point looking beyond the final possible
17229                  * printable, in MAX_PRINT_A */
17230                 UV max = MIN(end, MAX_PRINT_A);
17231
17232                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17233                     temp_end++;
17234                 }
17235
17236                 /* Here, temp_end points to one beyond the first printable if
17237                  * found, or to one beyond 'max' if not.  If none found, make
17238                  * sure that we use the entire range */
17239                 if (temp_end > MAX_PRINT_A) {
17240                     temp_end = end + 1;
17241                 }
17242
17243                 /* Output the first part of the split range, the part that
17244                  * doesn't have printables, with no looking for literals
17245                  * (otherwise we would infinitely recurse) */
17246                 put_range(sv, start, temp_end - 1, FALSE);
17247
17248                 /* The 2nd part of the range (if any) starts here. */
17249                 start = temp_end;
17250
17251                 /* We continue instead of dropping down because even if the 2nd
17252                  * part is non-empty, it could be so short that we want to
17253                  * output it specially, as tested for at the top of this loop.
17254                  * */
17255                 continue;
17256             }
17257
17258             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
17259              * output a sub-range of just the digits or letters, then process
17260              * the remaining portion as usual. */
17261             if (isALPHANUMERIC_A(start)) {
17262                 UV mask = (isDIGIT_A(start))
17263                            ? _CC_DIGIT
17264                              : isUPPER_A(start)
17265                                ? _CC_UPPER
17266                                : _CC_LOWER;
17267                 UV temp_end = start + 1;
17268
17269                 /* Find the end of the sub-range that includes just the
17270                  * characters in the same class as the first character in it */
17271                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17272                     temp_end++;
17273                 }
17274                 temp_end--;
17275
17276                 /* For short ranges, don't duplicate the code above to output
17277                  * them; just call recursively */
17278                 if (temp_end - start < min_range_count) {
17279                     put_range(sv, start, temp_end, FALSE);
17280                 }
17281                 else {  /* Output as a range */
17282                     put_code_point(sv, start);
17283                     sv_catpvs(sv, "-");
17284                     put_code_point(sv, temp_end);
17285                 }
17286                 start = temp_end + 1;
17287                 continue;
17288             }
17289
17290             /* We output any other printables as individual characters */
17291             if (isPUNCT_A(start) || isSPACE_A(start)) {
17292                 while (start <= end && (isPUNCT_A(start)
17293                                         || isSPACE_A(start)))
17294                 {
17295                     put_code_point(sv, start);
17296                     start++;
17297                 }
17298                 continue;
17299             }
17300         } /* End of looking for literals */
17301
17302         /* Here is not to output as a literal.  Some control characters have
17303          * mnemonic names.  Split off any of those at the beginning and end of
17304          * the range to print mnemonically.  It isn't possible for many of
17305          * these to be in a row, so this won't overwhelm with output */
17306         while (isMNEMONIC_CNTRL(start) && start <= end) {
17307             put_code_point(sv, start);
17308             start++;
17309         }
17310         if (start < end && isMNEMONIC_CNTRL(end)) {
17311
17312             /* Here, the final character in the range has a mnemonic name.
17313              * Work backwards from the end to find the final non-mnemonic */
17314             UV temp_end = end - 1;
17315             while (isMNEMONIC_CNTRL(temp_end)) {
17316                 temp_end--;
17317             }
17318
17319             /* And separately output the range that doesn't have mnemonics */
17320             put_range(sv, start, temp_end, FALSE);
17321
17322             /* Then output the mnemonic trailing controls */
17323             start = temp_end + 1;
17324             while (start <= end) {
17325                 put_code_point(sv, start);
17326                 start++;
17327             }
17328             break;
17329         }
17330
17331         /* As a final resort, output the range or subrange as hex. */
17332
17333         this_end = (end < NUM_ANYOF_CODE_POINTS)
17334                     ? end
17335                     : NUM_ANYOF_CODE_POINTS - 1;
17336         format = (this_end < 256)
17337                  ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17338                  : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17339         GCC_DIAG_IGNORE(-Wformat-nonliteral);
17340         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17341         GCC_DIAG_RESTORE;
17342         break;
17343     }
17344 }
17345
17346 STATIC bool
17347 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17348 {
17349     /* Appends to 'sv' a displayable version of the innards of the bracketed
17350      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
17351      * output anything, and bitmap_invlist, if not NULL, will point to an
17352      * inversion list of what is in the bit map */
17353
17354     int i;
17355     UV start, end;
17356     unsigned int punct_count = 0;
17357     SV* invlist = NULL;
17358     SV** invlist_ptr;   /* Temporary, in case bitmap_invlist is NULL */
17359     bool allow_literals = TRUE;
17360
17361     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17362
17363     invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17364
17365     /* Worst case is exactly every-other code point is in the list */
17366     *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17367
17368     /* Convert the bit map to an inversion list, keeping track of how many
17369      * ASCII puncts are set, including an extra amount for the backslashed
17370      * ones.  */
17371     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17372         if (BITMAP_TEST(bitmap, i)) {
17373             *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17374             if (isPUNCT_A(i)) {
17375                 punct_count++;
17376                 if isBACKSLASHED_PUNCT(i) {
17377                     punct_count++;
17378                 }
17379             }
17380         }
17381     }
17382
17383     /* Nothing to output */
17384     if (_invlist_len(*invlist_ptr) == 0) {
17385         SvREFCNT_dec(invlist);
17386         return FALSE;
17387     }
17388
17389     /* Generally, it is more readable if printable characters are output as
17390      * literals, but if a range (nearly) spans all of them, it's best to output
17391      * it as a single range.  This code will use a single range if all but 2
17392      * printables are in it */
17393     invlist_iterinit(*invlist_ptr);
17394     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17395
17396         /* If range starts beyond final printable, it doesn't have any in it */
17397         if (start > MAX_PRINT_A) {
17398             break;
17399         }
17400
17401         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
17402          * all but two, the range must start and end no later than 2 from
17403          * either end */
17404         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17405             if (end > MAX_PRINT_A) {
17406                 end = MAX_PRINT_A;
17407             }
17408             if (start < ' ') {
17409                 start = ' ';
17410             }
17411             if (end - start >= MAX_PRINT_A - ' ' - 2) {
17412                 allow_literals = FALSE;
17413             }
17414             break;
17415         }
17416     }
17417     invlist_iterfinish(*invlist_ptr);
17418
17419     /* The legibility of the output depends mostly on how many punctuation
17420      * characters are output.  There are 32 possible ASCII ones, and some have
17421      * an additional backslash, bringing it to currently 36, so if any more
17422      * than 18 are to be output, we can instead output it as its complement,
17423      * yielding fewer puncts, and making it more legible.  But give some weight
17424      * to the fact that outputting it as a complement is less legible than a
17425      * straight output, so don't complement unless we are somewhat over the 18
17426      * mark */
17427     if (allow_literals && punct_count > 22) {
17428         sv_catpvs(sv, "^");
17429
17430         /* Add everything remaining to the list, so when we invert it just
17431          * below, it will be excluded */
17432         _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17433         _invlist_invert(*invlist_ptr);
17434     }
17435
17436     /* Here we have figured things out.  Output each range */
17437     invlist_iterinit(*invlist_ptr);
17438     while (invlist_iternext(*invlist_ptr, &start, &end)) {
17439         if (start >= NUM_ANYOF_CODE_POINTS) {
17440             break;
17441         }
17442         put_range(sv, start, end, allow_literals);
17443     }
17444     invlist_iterfinish(*invlist_ptr);
17445
17446     return TRUE;
17447 }
17448
17449 #define CLEAR_OPTSTART \
17450     if (optstart) STMT_START {                                               \
17451         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
17452                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17453         optstart=NULL;                                                       \
17454     } STMT_END
17455
17456 #define DUMPUNTIL(b,e)                                                       \
17457                     CLEAR_OPTSTART;                                          \
17458                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17459
17460 STATIC const regnode *
17461 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17462             const regnode *last, const regnode *plast,
17463             SV* sv, I32 indent, U32 depth)
17464 {
17465     U8 op = PSEUDO;     /* Arbitrary non-END op. */
17466     const regnode *next;
17467     const regnode *optstart= NULL;
17468
17469     RXi_GET_DECL(r,ri);
17470     GET_RE_DEBUG_FLAGS_DECL;
17471
17472     PERL_ARGS_ASSERT_DUMPUNTIL;
17473
17474 #ifdef DEBUG_DUMPUNTIL
17475     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17476         last ? last-start : 0,plast ? plast-start : 0);
17477 #endif
17478
17479     if (plast && plast < last)
17480         last= plast;
17481
17482     while (PL_regkind[op] != END && (!last || node < last)) {
17483         assert(node);
17484         /* While that wasn't END last time... */
17485         NODE_ALIGN(node);
17486         op = OP(node);
17487         if (op == CLOSE || op == WHILEM)
17488             indent--;
17489         next = regnext((regnode *)node);
17490
17491         /* Where, what. */
17492         if (OP(node) == OPTIMIZED) {
17493             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17494                 optstart = node;
17495             else
17496                 goto after_print;
17497         } else
17498             CLEAR_OPTSTART;
17499
17500         regprop(r, sv, node, NULL, NULL);
17501         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17502                       (int)(2*indent + 1), "", SvPVX_const(sv));
17503
17504         if (OP(node) != OPTIMIZED) {
17505             if (next == NULL)           /* Next ptr. */
17506                 PerlIO_printf(Perl_debug_log, " (0)");
17507             else if (PL_regkind[(U8)op] == BRANCH
17508                      && PL_regkind[OP(next)] != BRANCH )
17509                 PerlIO_printf(Perl_debug_log, " (FAIL)");
17510             else
17511                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17512             (void)PerlIO_putc(Perl_debug_log, '\n');
17513         }
17514
17515       after_print:
17516         if (PL_regkind[(U8)op] == BRANCHJ) {
17517             assert(next);
17518             {
17519                 const regnode *nnode = (OP(next) == LONGJMP
17520                                        ? regnext((regnode *)next)
17521                                        : next);
17522                 if (last && nnode > last)
17523                     nnode = last;
17524                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17525             }
17526         }
17527         else if (PL_regkind[(U8)op] == BRANCH) {
17528             assert(next);
17529             DUMPUNTIL(NEXTOPER(node), next);
17530         }
17531         else if ( PL_regkind[(U8)op]  == TRIE ) {
17532             const regnode *this_trie = node;
17533             const char op = OP(node);
17534             const U32 n = ARG(node);
17535             const reg_ac_data * const ac = op>=AHOCORASICK ?
17536                (reg_ac_data *)ri->data->data[n] :
17537                NULL;
17538             const reg_trie_data * const trie =
17539                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17540 #ifdef DEBUGGING
17541             AV *const trie_words
17542                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17543 #endif
17544             const regnode *nextbranch= NULL;
17545             I32 word_idx;
17546             sv_setpvs(sv, "");
17547             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17548                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17549
17550                 PerlIO_printf(Perl_debug_log, "%*s%s ",
17551                    (int)(2*(indent+3)), "",
17552                     elem_ptr
17553                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17554                                 SvCUR(*elem_ptr), 60,
17555                                 PL_colors[0], PL_colors[1],
17556                                 (SvUTF8(*elem_ptr)
17557                                  ? PERL_PV_ESCAPE_UNI
17558                                  : 0)
17559                                 | PERL_PV_PRETTY_ELLIPSES
17560                                 | PERL_PV_PRETTY_LTGT
17561                             )
17562                     : "???"
17563                 );
17564                 if (trie->jump) {
17565                     U16 dist= trie->jump[word_idx+1];
17566                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17567                                (UV)((dist ? this_trie + dist : next) - start));
17568                     if (dist) {
17569                         if (!nextbranch)
17570                             nextbranch= this_trie + trie->jump[0];
17571                         DUMPUNTIL(this_trie + dist, nextbranch);
17572                     }
17573                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17574                         nextbranch= regnext((regnode *)nextbranch);
17575                 } else {
17576                     PerlIO_printf(Perl_debug_log, "\n");
17577                 }
17578             }
17579             if (last && next > last)
17580                 node= last;
17581             else
17582                 node= next;
17583         }
17584         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
17585             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17586                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17587         }
17588         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17589             assert(next);
17590             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17591         }
17592         else if ( op == PLUS || op == STAR) {
17593             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17594         }
17595         else if (PL_regkind[(U8)op] == ANYOF) {
17596             /* arglen 1 + class block */
17597             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17598                           ? ANYOF_POSIXL_SKIP
17599                           : ANYOF_SKIP);
17600             node = NEXTOPER(node);
17601         }
17602         else if (PL_regkind[(U8)op] == EXACT) {
17603             /* Literal string, where present. */
17604             node += NODE_SZ_STR(node) - 1;
17605             node = NEXTOPER(node);
17606         }
17607         else {
17608             node = NEXTOPER(node);
17609             node += regarglen[(U8)op];
17610         }
17611         if (op == CURLYX || op == OPEN)
17612             indent++;
17613     }
17614     CLEAR_OPTSTART;
17615 #ifdef DEBUG_DUMPUNTIL
17616     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17617 #endif
17618     return node;
17619 }
17620
17621 #endif  /* DEBUGGING */
17622
17623 /*
17624  * Local variables:
17625  * c-indentation-style: bsd
17626  * c-basic-offset: 4
17627  * indent-tabs-mode: nil
17628  * End:
17629  *
17630  * ex: set ts=8 sts=4 sw=4 et:
17631  */