regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_inline.h"
  90 #include "invlist_inline.h"
  91 #include "unicode_constants.h"
  92
  93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  94  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  96  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99
 100 #ifndef STATIC
 101 #define STATIC  static
 102 #endif
 103
 104 /* this is a chain of data about sub patterns we are processing that
 105    need to be handled separately/specially in study_chunk. Its so
 106    we can simulate recursion without losing state.  */
 107 struct scan_frame;
 108 typedef struct scan_frame {
 109     regnode *last_regnode;      /* last node to process in this frame */
 110     regnode *next_regnode;      /* next node to process when last is reached */
 111     U32 prev_recursed_depth;
 112     I32 stopparen;              /* what stopparen do we use */
 113
 114     struct scan_frame *this_prev_frame; /* this previous frame */
 115     struct scan_frame *prev_frame;      /* previous frame */
 116     struct scan_frame *next_frame;      /* next frame */
 117 } scan_frame;
 118
 119 /* Certain characters are output as a sequence with the first being a
 120  * backslash. */
 121 #define isBACKSLASHED_PUNCT(c)  strchr("-[]\\^", c)
 122
 123
 124 struct RExC_state_t {
 125     U32         flags;                  /* RXf_* are we folding, multilining? */
 126     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 127     char        *precomp;               /* uncompiled string. */
 128     char        *precomp_end;           /* pointer to end of uncompiled string. */
 129     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 130     regexp      *rx;                    /* perl core regexp structure */
 131     regexp_internal     *rxi;           /* internal data for regexp object
 132                                            pprivate field */
 133     char        *start;                 /* Start of input for compile */
 134     char        *end;                   /* End of input for compile */
 135     char        *parse;                 /* Input-scan pointer. */
 136     char        *copy_start;            /* start of copy of input within
 137                                            constructed parse string */
 138     char        *copy_start_in_input;   /* Position in input string
 139                                            corresponding to copy_start */
 140     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 141     regnode     *emit_start;            /* Start of emitted-code area */
 142     regnode_offset emit;                /* Code-emit pointer */
 143     I32         naughty;                /* How bad is this pattern? */
 144     I32         sawback;                /* Did we see \1, ...? */
 145     U32         seen;
 146     SSize_t     size;                   /* Number of regnode equivalents in
 147                                            pattern */
 148
 149     /* position beyond 'precomp' of the warning message furthest away from
 150      * 'precomp'.  During the parse, no warnings are raised for any problems
 151      * earlier in the parse than this position.  This works if warnings are
 152      * raised the first time a given spot is parsed, and if only one
 153      * independent warning is raised for any given spot */
 154     Size_t      latest_warn_offset;
 155
 156     I32         npar;                   /* Capture buffer count so far in the
 157                                            parse, (OPEN) plus one. ("par" 0 is
 158                                            the whole pattern)*/
 159     I32         total_par;              /* During initial parse, is either 0,
 160                                            or -1; the latter indicating a
 161                                            reparse is needed.  After that pass,
 162                                            it is what 'npar' became after the
 163                                            pass.  Hence, it being > 0 indicates
 164                                            we are in a reparse situation */
 165     I32         nestroot;               /* root parens we are in - used by
 166                                            accept */
 167     I32         seen_zerolen;
 168     regnode_offset *open_parens;        /* offsets to open parens */
 169     regnode_offset *close_parens;       /* offsets to close parens */
 170     regnode     *end_op;                /* END node in program */
 171     I32         utf8;           /* whether the pattern is utf8 or not */
 172     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 173                                 /* XXX use this for future optimisation of case
 174                                  * where pattern must be upgraded to utf8. */
 175     I32         uni_semantics;  /* If a d charset modifier should use unicode
 176                                    rules, even if the pattern is not in
 177                                    utf8 */
 178     HV          *paren_names;           /* Paren names */
 179
 180     regnode     **recurse;              /* Recurse regops */
 181     I32         recurse_count;          /* Number of recurse regops we have generated */
 182     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 183                                            through */
 184     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 185     I32         in_lookbehind;
 186     I32         contains_locale;
 187     I32         override_recoding;
 188 #ifdef EBCDIC
 189     I32         recode_x_to_native;
 190 #endif
 191     I32         in_multi_char_class;
 192     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
 193                                             within pattern */
 194     int         code_index;             /* next code_blocks[] slot */
 195     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 196     scan_frame *frame_head;
 197     scan_frame *frame_last;
 198     U32         frame_count;
 199     AV         *warn_text;
 200 #ifdef ADD_TO_REGEXEC
 201     char        *starttry;              /* -Dr: where regtry was called. */
 202 #define RExC_starttry   (pRExC_state->starttry)
 203 #endif
 204     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 205 #ifdef DEBUGGING
 206     const char  *lastparse;
 207     I32         lastnum;
 208     AV          *paren_name_list;       /* idx -> name */
 209     U32         study_chunk_recursed_count;
 210     SV          *mysv1;
 211     SV          *mysv2;
 212
 213 #define RExC_lastparse  (pRExC_state->lastparse)
 214 #define RExC_lastnum    (pRExC_state->lastnum)
 215 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 216 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 217 #define RExC_mysv       (pRExC_state->mysv1)
 218 #define RExC_mysv1      (pRExC_state->mysv1)
 219 #define RExC_mysv2      (pRExC_state->mysv2)
 220
 221 #endif
 222     bool        seen_d_op;
 223     bool        strict;
 224     bool        study_started;
 225     bool        in_script_run;
 226     bool        use_BRANCHJ;
 227 };
 228
 229 #define RExC_flags      (pRExC_state->flags)
 230 #define RExC_pm_flags   (pRExC_state->pm_flags)
 231 #define RExC_precomp    (pRExC_state->precomp)
 232 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
 233 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
 234 #define RExC_precomp_end (pRExC_state->precomp_end)
 235 #define RExC_rx_sv      (pRExC_state->rx_sv)
 236 #define RExC_rx         (pRExC_state->rx)
 237 #define RExC_rxi        (pRExC_state->rxi)
 238 #define RExC_start      (pRExC_state->start)
 239 #define RExC_end        (pRExC_state->end)
 240 #define RExC_parse      (pRExC_state->parse)
 241 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
 242 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 243 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
 244                                                    under /d from /u ? */
 245
 246
 247 #ifdef RE_TRACK_PATTERN_OFFSETS
 248 #  define RExC_offsets  (RExC_rxi->u.offsets) /* I am not like the
 249                                                          others */
 250 #endif
 251 #define RExC_emit       (pRExC_state->emit)
 252 #define RExC_emit_start (pRExC_state->emit_start)
 253 #define RExC_sawback    (pRExC_state->sawback)
 254 #define RExC_seen       (pRExC_state->seen)
 255 #define RExC_size       (pRExC_state->size)
 256 #define RExC_maxlen        (pRExC_state->maxlen)
 257 #define RExC_npar       (pRExC_state->npar)
 258 #define RExC_total_parens       (pRExC_state->total_par)
 259 #define RExC_nestroot   (pRExC_state->nestroot)
 260 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 261 #define RExC_utf8       (pRExC_state->utf8)
 262 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 263 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 264 #define RExC_open_parens        (pRExC_state->open_parens)
 265 #define RExC_close_parens       (pRExC_state->close_parens)
 266 #define RExC_end_op     (pRExC_state->end_op)
 267 #define RExC_paren_names        (pRExC_state->paren_names)
 268 #define RExC_recurse    (pRExC_state->recurse)
 269 #define RExC_recurse_count      (pRExC_state->recurse_count)
 270 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 271 #define RExC_study_chunk_recursed_bytes  \
 272                                    (pRExC_state->study_chunk_recursed_bytes)
 273 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 274 #define RExC_contains_locale    (pRExC_state->contains_locale)
 275 #ifdef EBCDIC
 276 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 277 #endif
 278 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 279 #define RExC_frame_head (pRExC_state->frame_head)
 280 #define RExC_frame_last (pRExC_state->frame_last)
 281 #define RExC_frame_count (pRExC_state->frame_count)
 282 #define RExC_strict (pRExC_state->strict)
 283 #define RExC_study_started      (pRExC_state->study_started)
 284 #define RExC_warn_text (pRExC_state->warn_text)
 285 #define RExC_in_script_run      (pRExC_state->in_script_run)
 286 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
 287
 288 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 289  * a flag to disable back-off on the fixed/floating substrings - if it's
 290  * a high complexity pattern we assume the benefit of avoiding a full match
 291  * is worth the cost of checking for the substrings even if they rarely help.
 292  */
 293 #define RExC_naughty    (pRExC_state->naughty)
 294 #define TOO_NAUGHTY (10)
 295 #define MARK_NAUGHTY(add) \
 296     if (RExC_naughty < TOO_NAUGHTY) \
 297         RExC_naughty += (add)
 298 #define MARK_NAUGHTY_EXP(exp, add) \
 299     if (RExC_naughty < TOO_NAUGHTY) \
 300         RExC_naughty += RExC_naughty / (exp) + (add)
 301
 302 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 303 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 304         ((*s) == '{' && regcurly(s)))
 305
 306 /*
 307  * Flags to be passed up and down.
 308  */
 309 #define WORST           0       /* Worst case. */
 310 #define HASWIDTH        0x01    /* Known to not match null strings, could match
 311                                    non-null ones. */
 312
 313 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 314  * character.  (There needs to be a case: in the switch statement in regexec.c
 315  * for any node marked SIMPLE.)  Note that this is not the same thing as
 316  * REGNODE_SIMPLE */
 317 #define SIMPLE          0x02
 318 #define SPSTART         0x04    /* Starts with * or + */
 319 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 320 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 321 #define RESTART_PARSE   0x20    /* Need to redo the parse */
 322 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
 323                                    calcuate sizes as UTF-8 */
 324
 325 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 326
 327 /* whether trie related optimizations are enabled */
 328 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 329 #define TRIE_STUDY_OPT
 330 #define FULL_TRIE_STUDY
 331 #define TRIE_STCLASS
 332 #endif
 333
 334
 335
 336 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 337 #define PBITVAL(paren) (1 << ((paren) & 7))
 338 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 339 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 340 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 341
 342 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 343                                      if (!UTF) {                           \
 344                                          *flagp = RESTART_PARSE|NEED_UTF8; \
 345                                          return 0;                         \
 346                                      }                                     \
 347                              } STMT_END
 348
 349 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
 350  * a flag that indicates we need to override /d with /u as a result of
 351  * something in the pattern.  It should only be used in regards to calling
 352  * set_regex_charset() or get_regex_charse() */
 353 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 354     STMT_START {                                                            \
 355             if (DEPENDS_SEMANTICS) {                                        \
 356                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 357                 RExC_uni_semantics = 1;                                     \
 358                 if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) {     \
 359                     /* No need to restart the parse if we haven't seen      \
 360                      * anything that differs between /u and /d, and no need \
 361                      * to restart immediately if we're going to reparse     \
 362                      * anyway to count parens */                            \
 363                     *flagp |= RESTART_PARSE;                                \
 364                     return restart_retval;                                  \
 365                 }                                                           \
 366             }                                                               \
 367     } STMT_END
 368
 369 #define BRANCH_MAX_OFFSET   U16_MAX
 370 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
 371     STMT_START {                                                            \
 372                 RExC_use_BRANCHJ = 1;                                       \
 373                 if (LIKELY(RExC_total_parens >= 0)) {                       \
 374                     /* No need to restart the parse immediately if we're    \
 375                      * going to reparse anyway to count parens */           \
 376                     *flagp |= RESTART_PARSE;                                \
 377                     return restart_retval;                                  \
 378                 }                                                           \
 379     } STMT_END
 380
 381 #define REQUIRE_PARENS_PASS                                                 \
 382     STMT_START {                                                            \
 383                     if (RExC_total_parens == 0) RExC_total_parens = -1;     \
 384     } STMT_END
 385
 386 /* This is used to return failure (zero) early from the calling function if
 387  * various flags in 'flags' are set.  Two flags always cause a return:
 388  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
 389  * additional flags that should cause a return; 0 if none.  If the return will
 390  * be done, '*flagp' is first set to be all of the flags that caused the
 391  * return. */
 392 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
 393     STMT_START {                                                            \
 394             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
 395                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
 396                 return 0;                                                   \
 397             }                                                               \
 398     } STMT_END
 399
 400 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
 401
 402 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
 403                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
 404 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
 405                                     if (MUST_RESTART(*(flagp))) return 0
 406
 407 /* This converts the named class defined in regcomp.h to its equivalent class
 408  * number defined in handy.h. */
 409 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 410 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 411
 412 #define _invlist_union_complement_2nd(a, b, output) \
 413                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 414 #define _invlist_intersection_complement_2nd(a, b, output) \
 415                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 416
 417 /* About scan_data_t.
 418
 419   During optimisation we recurse through the regexp program performing
 420   various inplace (keyhole style) optimisations. In addition study_chunk
 421   and scan_commit populate this data structure with information about
 422   what strings MUST appear in the pattern. We look for the longest
 423   string that must appear at a fixed location, and we look for the
 424   longest string that may appear at a floating location. So for instance
 425   in the pattern:
 426
 427     /FOO[xX]A.*B[xX]BAR/
 428
 429   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 430   strings (because they follow a .* construct). study_chunk will identify
 431   both FOO and BAR as being the longest fixed and floating strings respectively.
 432
 433   The strings can be composites, for instance
 434
 435      /(f)(o)(o)/
 436
 437   will result in a composite fixed substring 'foo'.
 438
 439   For each string some basic information is maintained:
 440
 441   - min_offset
 442     This is the position the string must appear at, or not before.
 443     It also implicitly (when combined with minlenp) tells us how many
 444     characters must match before the string we are searching for.
 445     Likewise when combined with minlenp and the length of the string it
 446     tells us how many characters must appear after the string we have
 447     found.
 448
 449   - max_offset
 450     Only used for floating strings. This is the rightmost point that
 451     the string can appear at. If set to SSize_t_MAX it indicates that the
 452     string can occur infinitely far to the right.
 453     For fixed strings, it is equal to min_offset.
 454
 455   - minlenp
 456     A pointer to the minimum number of characters of the pattern that the
 457     string was found inside. This is important as in the case of positive
 458     lookahead or positive lookbehind we can have multiple patterns
 459     involved. Consider
 460
 461     /(?=FOO).*F/
 462
 463     The minimum length of the pattern overall is 3, the minimum length
 464     of the lookahead part is 3, but the minimum length of the part that
 465     will actually match is 1. So 'FOO's minimum length is 3, but the
 466     minimum length for the F is 1. This is important as the minimum length
 467     is used to determine offsets in front of and behind the string being
 468     looked for.  Since strings can be composites this is the length of the
 469     pattern at the time it was committed with a scan_commit. Note that
 470     the length is calculated by study_chunk, so that the minimum lengths
 471     are not known until the full pattern has been compiled, thus the
 472     pointer to the value.
 473
 474   - lookbehind
 475
 476     In the case of lookbehind the string being searched for can be
 477     offset past the start point of the final matching string.
 478     If this value was just blithely removed from the min_offset it would
 479     invalidate some of the calculations for how many chars must match
 480     before or after (as they are derived from min_offset and minlen and
 481     the length of the string being searched for).
 482     When the final pattern is compiled and the data is moved from the
 483     scan_data_t structure into the regexp structure the information
 484     about lookbehind is factored in, with the information that would
 485     have been lost precalculated in the end_shift field for the
 486     associated string.
 487
 488   The fields pos_min and pos_delta are used to store the minimum offset
 489   and the delta to the maximum offset at the current point in the pattern.
 490
 491 */
 492
 493 struct scan_data_substrs {
 494     SV      *str;       /* longest substring found in pattern */
 495     SSize_t min_offset; /* earliest point in string it can appear */
 496     SSize_t max_offset; /* latest point in string it can appear */
 497     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
 498     SSize_t lookbehind; /* is the pos of the string modified by LB */
 499     I32 flags;          /* per substring SF_* and SCF_* flags */
 500 };
 501
 502 typedef struct scan_data_t {
 503     /*I32 len_min;      unused */
 504     /*I32 len_delta;    unused */
 505     SSize_t pos_min;
 506     SSize_t pos_delta;
 507     SV *last_found;
 508     SSize_t last_end;       /* min value, <0 unless valid. */
 509     SSize_t last_start_min;
 510     SSize_t last_start_max;
 511     U8      cur_is_floating; /* whether the last_* values should be set as
 512                               * the next fixed (0) or floating (1)
 513                               * substring */
 514
 515     /* [0] is longest fixed substring so far, [1] is longest float so far */
 516     struct scan_data_substrs  substrs[2];
 517
 518     I32 flags;             /* common SF_* and SCF_* flags */
 519     I32 whilem_c;
 520     SSize_t *last_closep;
 521     regnode_ssc *start_class;
 522 } scan_data_t;
 523
 524 /*
 525  * Forward declarations for pregcomp()'s friends.
 526  */
 527
 528 static const scan_data_t zero_scan_data = {
 529     0, 0, NULL, 0, 0, 0, 0,
 530     {
 531         { NULL, 0, 0, 0, 0, 0 },
 532         { NULL, 0, 0, 0, 0, 0 },
 533     },
 534     0, 0, NULL, NULL
 535 };
 536
 537 /* study flags */
 538
 539 #define SF_BEFORE_SEOL          0x0001
 540 #define SF_BEFORE_MEOL          0x0002
 541 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 542
 543 #define SF_IS_INF               0x0040
 544 #define SF_HAS_PAR              0x0080
 545 #define SF_IN_PAR               0x0100
 546 #define SF_HAS_EVAL             0x0200
 547
 548
 549 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 550  * longest substring in the pattern. When it is not set the optimiser keeps
 551  * track of position, but does not keep track of the actual strings seen,
 552  *
 553  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 554  * /foo/i will not.
 555  *
 556  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 557  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 558  * turned off because of the alternation (BRANCH). */
 559 #define SCF_DO_SUBSTR           0x0400
 560
 561 #define SCF_DO_STCLASS_AND      0x0800
 562 #define SCF_DO_STCLASS_OR       0x1000
 563 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 564 #define SCF_WHILEM_VISITED_POS  0x2000
 565
 566 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 567 #define SCF_SEEN_ACCEPT         0x8000
 568 #define SCF_TRIE_DOING_RESTUDY 0x10000
 569 #define SCF_IN_DEFINE          0x20000
 570
 571
 572
 573
 574 #define UTF cBOOL(RExC_utf8)
 575
 576 /* The enums for all these are ordered so things work out correctly */
 577 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 578 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 579                                                      == REGEX_DEPENDS_CHARSET)
 580 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 581 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 582                                                      >= REGEX_UNICODE_CHARSET)
 583 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 584                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 585 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 586                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 587 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 588                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 589
 590 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 591
 592 /* For programs that want to be strictly Unicode compatible by dying if any
 593  * attempt is made to match a non-Unicode code point against a Unicode
 594  * property.  */
 595 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 596
 597 #define OOB_NAMEDCLASS          -1
 598
 599 /* There is no code point that is out-of-bounds, so this is problematic.  But
 600  * its only current use is to initialize a variable that is always set before
 601  * looked at. */
 602 #define OOB_UNICODE             0xDEADBEEF
 603
 604 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 605
 606
 607 /* length of regex to show in messages that don't mark a position within */
 608 #define RegexLengthToShowInErrorMessages 127
 609
 610 /*
 611  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 612  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 613  * op/pragma/warn/regcomp.
 614  */
 615 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 616 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 617
 618 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 619                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
 620
 621 /* The code in this file in places uses one level of recursion with parsing
 622  * rebased to an alternate string constructed by us in memory.  This can take
 623  * the form of something that is completely different from the input, or
 624  * something that uses the input as part of the alternate.  In the first case,
 625  * there should be no possibility of an error, as we are in complete control of
 626  * the alternate string.  But in the second case we don't completely control
 627  * the input portion, so there may be errors in that.  Here's an example:
 628  *      /[abc\x{DF}def]/ui
 629  * is handled specially because \x{df} folds to a sequence of more than one
 630  * character: 'ss'.  What is done is to create and parse an alternate string,
 631  * which looks like this:
 632  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 633  * where it uses the input unchanged in the middle of something it constructs,
 634  * which is a branch for the DF outside the character class, and clustering
 635  * parens around the whole thing. (It knows enough to skip the DF inside the
 636  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 637  * need to be reported.  The general situation looks like this:
 638  *
 639  *                                       |<------- identical ------>|
 640  *              sI                       tI               xI       eI
 641  * Input:       ---------------------------------------------------------------
 642  * Constructed:         ---------------------------------------------------
 643  *                      sC               tC               xC       eC     EC
 644  *                                       |<------- identical ------>|
 645  *
 646  * sI..eI   is the portion of the input pattern we are concerned with here.
 647  * sC..EC   is the constructed substitute parse string.
 648  *  sC..tC  is constructed by us
 649  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
 650  *          In the diagram, these are vertically aligned.
 651  *  eC..EC  is also constructed by us.
 652  * xC       is the position in the substitute parse string where we found a
 653  *          problem.
 654  * xI       is the position in the original pattern corresponding to xC.
 655  *
 656  * We want to display a message showing the real input string.  Thus we need to
 657  * translate from xC to xI.  We know that xC >= tC, since the portion of the
 658  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
 659  * get:
 660  *      xI = tI + (xC - tC)
 661  *
 662  * When the substitute parse is constructed, the code needs to set:
 663  *      RExC_start (sC)
 664  *      RExC_end (eC)
 665  *      RExC_copy_start_in_input  (tI)
 666  *      RExC_copy_start_in_constructed (tC)
 667  * and restore them when done.
 668  *
 669  * During normal processing of the input pattern, both
 670  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
 671  * sI, so that xC equals xI.
 672  */
 673
 674 #define sI              RExC_precomp
 675 #define eI              RExC_precomp_end
 676 #define sC              RExC_start
 677 #define eC              RExC_end
 678 #define tI              RExC_copy_start_in_input
 679 #define tC              RExC_copy_start_in_constructed
 680 #define xI(xC)          (tI + (xC - tC))
 681 #define xI_offset(xC)   (xI(xC) - sI)
 682
 683 #define REPORT_LOCATION_ARGS(xC)                                            \
 684     UTF8fARG(UTF,                                                           \
 685              (xI(xC) > eI) /* Don't run off end */                          \
 686               ? eI - sI   /* Length before the <--HERE */                   \
 687               : ((xI_offset(xC) >= 0)                                       \
 688                  ? xI_offset(xC)                                            \
 689                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
 690                                     IVdf " trying to output message for "   \
 691                                     " pattern %.*s",                        \
 692                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
 693                                     ((int) (eC - sC)), sC), 0)),            \
 694              sI),         /* The input pattern printed up to the <--HERE */ \
 695     UTF8fARG(UTF,                                                           \
 696              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
 697              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
 698
 699 /* Used to point after bad bytes for an error message, but avoid skipping
 700  * past a nul byte. */
 701 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
 702
 703 /* Set up to clean up after our imminent demise */
 704 #define PREPARE_TO_DIE                                                      \
 705     STMT_START {                                                            \
 706         if (RExC_rx_sv)                                                     \
 707             SAVEFREESV(RExC_rx_sv);                                         \
 708         if (RExC_open_parens)                                               \
 709             SAVEFREEPV(RExC_open_parens);                                   \
 710         if (RExC_close_parens)                                              \
 711             SAVEFREEPV(RExC_close_parens);                                  \
 712     } STMT_END
 713
 714 /*
 715  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 716  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 717  * "...".
 718  */
 719 #define _FAIL(code) STMT_START {                                        \
 720     const char *ellipses = "";                                          \
 721     IV len = RExC_precomp_end - RExC_precomp;                           \
 722                                                                         \
 723     PREPARE_TO_DIE;                                                     \
 724     if (len > RegexLengthToShowInErrorMessages) {                       \
 725         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 726         len = RegexLengthToShowInErrorMessages - 10;                    \
 727         ellipses = "...";                                               \
 728     }                                                                   \
 729     code;                                                               \
 730 } STMT_END
 731
 732 #define FAIL(msg) _FAIL(                            \
 733     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
 734             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 735
 736 #define FAIL2(msg,arg) _FAIL(                       \
 737     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 738             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 739
 740 /*
 741  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 742  */
 743 #define Simple_vFAIL(m) STMT_START {                                    \
 744     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 745             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 746 } STMT_END
 747
 748 /*
 749  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 750  */
 751 #define vFAIL(m) STMT_START {                           \
 752     PREPARE_TO_DIE;                                     \
 753     Simple_vFAIL(m);                                    \
 754 } STMT_END
 755
 756 /*
 757  * Like Simple_vFAIL(), but accepts two arguments.
 758  */
 759 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 760     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,              \
 761                       REPORT_LOCATION_ARGS(RExC_parse));        \
 762 } STMT_END
 763
 764 /*
 765  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 766  */
 767 #define vFAIL2(m,a1) STMT_START {                       \
 768     PREPARE_TO_DIE;                                     \
 769     Simple_vFAIL2(m, a1);                               \
 770 } STMT_END
 771
 772
 773 /*
 774  * Like Simple_vFAIL(), but accepts three arguments.
 775  */
 776 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 777     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 778             REPORT_LOCATION_ARGS(RExC_parse));                  \
 779 } STMT_END
 780
 781 /*
 782  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 783  */
 784 #define vFAIL3(m,a1,a2) STMT_START {                    \
 785     PREPARE_TO_DIE;                                     \
 786     Simple_vFAIL3(m, a1, a2);                           \
 787 } STMT_END
 788
 789 /*
 790  * Like Simple_vFAIL(), but accepts four arguments.
 791  */
 792 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 793     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,      \
 794             REPORT_LOCATION_ARGS(RExC_parse));                  \
 795 } STMT_END
 796
 797 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 798     PREPARE_TO_DIE;                                     \
 799     Simple_vFAIL4(m, a1, a2, a3);                       \
 800 } STMT_END
 801
 802 /* A specialized version of vFAIL2 that works with UTF8f */
 803 #define vFAIL2utf8f(m, a1) STMT_START {             \
 804     PREPARE_TO_DIE;                                 \
 805     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
 806             REPORT_LOCATION_ARGS(RExC_parse));      \
 807 } STMT_END
 808
 809 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 810     PREPARE_TO_DIE;                                     \
 811     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 812             REPORT_LOCATION_ARGS(RExC_parse));          \
 813 } STMT_END
 814
 815 /* Setting this to NULL is a signal to not output warnings */
 816 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
 817 #define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
 818
 819 /* Since a warning can be generated multiple times as the input is reparsed, we
 820  * output it the first time we come to that point in the parse, but suppress it
 821  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
 822  * generate any warnings */
 823 #define TO_OUTPUT_WARNINGS(loc)                                         \
 824   (   RExC_copy_start_in_constructed                                    \
 825    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
 826
 827 /* After we've emitted a warning, we save the position in the input so we don't
 828  * output it again */
 829 #define UPDATE_WARNINGS_LOC(loc)                                        \
 830     STMT_START {                                                        \
 831         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 832             RExC_latest_warn_offset = (xI(loc)) - RExC_precomp;         \
 833         }                                                               \
 834     } STMT_END
 835
 836 /* 'warns' is the output of the packWARNx macro used in 'code' */
 837 #define _WARN_HELPER(loc, warns, code)                                  \
 838     STMT_START {                                                        \
 839         if (! RExC_copy_start_in_constructed) {                         \
 840             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
 841                               " expected at '%s'",                      \
 842                               __FILE__, __LINE__, loc);                 \
 843         }                                                               \
 844         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 845             if (ckDEAD(warns))                                          \
 846                 PREPARE_TO_DIE;                                         \
 847             code;                                                       \
 848             UPDATE_WARNINGS_LOC(loc);                                   \
 849         }                                                               \
 850     } STMT_END
 851
 852 /* m is not necessarily a "literal string", in this macro */
 853 #define reg_warn_non_literal_string(loc, m)                             \
 854     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 855                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 856                                        "%s" REPORT_LOCATION,            \
 857                                   m, REPORT_LOCATION_ARGS(loc)))
 858
 859 #define ckWARNreg(loc,m)                                                \
 860     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 861                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 862                                           m REPORT_LOCATION,            \
 863                                           REPORT_LOCATION_ARGS(loc)))
 864
 865 #define vWARN(loc, m)                                                   \
 866     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 867                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 868                                        m REPORT_LOCATION,               \
 869                                        REPORT_LOCATION_ARGS(loc)))      \
 870
 871 #define vWARN_dep(loc, m)                                               \
 872     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 873                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
 874                                        m REPORT_LOCATION,               \
 875                                        REPORT_LOCATION_ARGS(loc)))
 876
 877 #define ckWARNdep(loc,m)                                                \
 878     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 879                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
 880                                             m REPORT_LOCATION,          \
 881                                             REPORT_LOCATION_ARGS(loc)))
 882
 883 #define ckWARNregdep(loc,m)                                                 \
 884     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
 885                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
 886                                                       WARN_REGEXP),         \
 887                                              m REPORT_LOCATION,             \
 888                                              REPORT_LOCATION_ARGS(loc)))
 889
 890 #define ckWARN2reg_d(loc,m, a1)                                             \
 891     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 892                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
 893                                             m REPORT_LOCATION,              \
 894                                             a1, REPORT_LOCATION_ARGS(loc)))
 895
 896 #define ckWARN2reg(loc, m, a1)                                              \
 897     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 898                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
 899                                           m REPORT_LOCATION,                \
 900                                           a1, REPORT_LOCATION_ARGS(loc)))
 901
 902 #define vWARN3(loc, m, a1, a2)                                              \
 903     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 904                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
 905                                        m REPORT_LOCATION,                   \
 906                                        a1, a2, REPORT_LOCATION_ARGS(loc)))
 907
 908 #define ckWARN3reg(loc, m, a1, a2)                                          \
 909     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 910                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
 911                                           m REPORT_LOCATION,                \
 912                                           a1, a2,                           \
 913                                           REPORT_LOCATION_ARGS(loc)))
 914
 915 #define vWARN4(loc, m, a1, a2, a3)                                      \
 916     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 917                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 918                                        m REPORT_LOCATION,               \
 919                                        a1, a2, a3,                      \
 920                                        REPORT_LOCATION_ARGS(loc)))
 921
 922 #define ckWARN4reg(loc, m, a1, a2, a3)                                  \
 923     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 924                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 925                                           m REPORT_LOCATION,            \
 926                                           a1, a2, a3,                   \
 927                                           REPORT_LOCATION_ARGS(loc)))
 928
 929 #define vWARN5(loc, m, a1, a2, a3, a4)                                  \
 930     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 931                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 932                                        m REPORT_LOCATION,               \
 933                                        a1, a2, a3, a4,                  \
 934                                        REPORT_LOCATION_ARGS(loc)))
 935
 936 #define ckWARNexperimental(loc, class, m)                               \
 937     _WARN_HELPER(loc, packWARN(class),                                  \
 938                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
 939                                             m REPORT_LOCATION,          \
 940                                             REPORT_LOCATION_ARGS(loc)))
 941
 942 /* Convert between a pointer to a node and its offset from the beginning of the
 943  * program */
 944 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
 945 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
 946
 947 /* Macros for recording node offsets.   20001227 mjd@plover.com
 948  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 949  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 950  * Element 0 holds the number n.
 951  * Position is 1 indexed.
 952  */
 953 #ifndef RE_TRACK_PATTERN_OFFSETS
 954 #define Set_Node_Offset_To_R(offset,byte)
 955 #define Set_Node_Offset(node,byte)
 956 #define Set_Cur_Node_Offset
 957 #define Set_Node_Length_To_R(node,len)
 958 #define Set_Node_Length(node,len)
 959 #define Set_Node_Cur_Length(node,start)
 960 #define Node_Offset(n)
 961 #define Node_Length(n)
 962 #define Set_Node_Offset_Length(node,offset,len)
 963 #define ProgLen(ri) ri->u.proglen
 964 #define SetProgLen(ri,x) ri->u.proglen = x
 965 #define Track_Code(code)
 966 #else
 967 #define ProgLen(ri) ri->u.offsets[0]
 968 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 969 #define Set_Node_Offset_To_R(offset,byte) STMT_START {                  \
 970         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 971                     __LINE__, (int)(offset), (int)(byte)));             \
 972         if((offset) < 0) {                                              \
 973             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 974                                          (int)(offset));                \
 975         } else {                                                        \
 976             RExC_offsets[2*(offset)-1] = (byte);                        \
 977         }                                                               \
 978 } STMT_END
 979
 980 #define Set_Node_Offset(node,byte)                                      \
 981     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
 982 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 983
 984 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 985         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 986                 __LINE__, (int)(node), (int)(len)));                    \
 987         if((node) < 0) {                                                \
 988             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 989                                          (int)(node));                  \
 990         } else {                                                        \
 991             RExC_offsets[2*(node)] = (len);                             \
 992         }                                                               \
 993 } STMT_END
 994
 995 #define Set_Node_Length(node,len) \
 996     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
 997 #define Set_Node_Cur_Length(node, start)                \
 998     Set_Node_Length(node, RExC_parse - start)
 999
1000 /* Get offsets and lengths */
1001 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1002 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1003
1004 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
1005     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));       \
1006     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));  \
1007 } STMT_END
1008
1009 #define Track_Code(code) STMT_START { code } STMT_END
1010 #endif
1011
1012 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1013 #define EXPERIMENTAL_INPLACESCAN
1014 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1015
1016 #ifdef DEBUGGING
1017 int
1018 Perl_re_printf(pTHX_ const char *fmt, ...)
1019 {
1020     va_list ap;
1021     int result;
1022     PerlIO *f= Perl_debug_log;
1023     PERL_ARGS_ASSERT_RE_PRINTF;
1024     va_start(ap, fmt);
1025     result = PerlIO_vprintf(f, fmt, ap);
1026     va_end(ap);
1027     return result;
1028 }
1029
1030 int
1031 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1032 {
1033     va_list ap;
1034     int result;
1035     PerlIO *f= Perl_debug_log;
1036     PERL_ARGS_ASSERT_RE_INDENTF;
1037     va_start(ap, depth);
1038     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1039     result = PerlIO_vprintf(f, fmt, ap);
1040     va_end(ap);
1041     return result;
1042 }
1043 #endif /* DEBUGGING */
1044
1045 #define DEBUG_RExC_seen()                                                   \
1046         DEBUG_OPTIMISE_MORE_r({                                             \
1047             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1048                                                                             \
1049             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1050                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1051                                                                             \
1052             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1053                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1054                                                                             \
1055             if (RExC_seen & REG_GPOS_SEEN)                                  \
1056                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1057                                                                             \
1058             if (RExC_seen & REG_RECURSE_SEEN)                               \
1059                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1060                                                                             \
1061             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1062                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1063                                                                             \
1064             if (RExC_seen & REG_VERBARG_SEEN)                               \
1065                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1066                                                                             \
1067             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1068                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1069                                                                             \
1070             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1071                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1072                                                                             \
1073             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1074                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1075                                                                             \
1076             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1077                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1078                                                                             \
1079             Perl_re_printf( aTHX_ "\n");                                    \
1080         });
1081
1082 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1083   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
1084
1085
1086 #ifdef DEBUGGING
1087 static void
1088 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1089                                     const char *close_str)
1090 {
1091     if (!flags)
1092         return;
1093
1094     Perl_re_printf( aTHX_  "%s", open_str);
1095     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1096     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1097     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1098     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1099     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1100     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1101     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1102     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1103     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1104     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1105     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1106     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1107     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1108     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1109     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1110     Perl_re_printf( aTHX_  "%s", close_str);
1111 }
1112
1113
1114 static void
1115 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1116                     U32 depth, int is_inf)
1117 {
1118     GET_RE_DEBUG_FLAGS_DECL;
1119
1120     DEBUG_OPTIMISE_MORE_r({
1121         if (!data)
1122             return;
1123         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1124             depth,
1125             where,
1126             (IV)data->pos_min,
1127             (IV)data->pos_delta,
1128             (UV)data->flags
1129         );
1130
1131         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1132
1133         Perl_re_printf( aTHX_
1134             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1135             (IV)data->whilem_c,
1136             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1137             is_inf ? "INF " : ""
1138         );
1139
1140         if (data->last_found) {
1141             int i;
1142             Perl_re_printf(aTHX_
1143                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1144                     SvPVX_const(data->last_found),
1145                     (IV)data->last_end,
1146                     (IV)data->last_start_min,
1147                     (IV)data->last_start_max
1148             );
1149
1150             for (i = 0; i < 2; i++) {
1151                 Perl_re_printf(aTHX_
1152                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1153                     data->cur_is_floating == i ? "*" : "",
1154                     i ? "Float" : "Fixed",
1155                     SvPVX_const(data->substrs[i].str),
1156                     (IV)data->substrs[i].min_offset,
1157                     (IV)data->substrs[i].max_offset
1158                 );
1159                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1160             }
1161         }
1162
1163         Perl_re_printf( aTHX_ "\n");
1164     });
1165 }
1166
1167
1168 static void
1169 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1170                 regnode *scan, U32 depth, U32 flags)
1171 {
1172     GET_RE_DEBUG_FLAGS_DECL;
1173
1174     DEBUG_OPTIMISE_r({
1175         regnode *Next;
1176
1177         if (!scan)
1178             return;
1179         Next = regnext(scan);
1180         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1181         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1182             depth,
1183             str,
1184             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1185             Next ? (REG_NODE_NUM(Next)) : 0 );
1186         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1187         Perl_re_printf( aTHX_  "\n");
1188    });
1189 }
1190
1191
1192 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1193                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1194
1195 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1196                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1197
1198 #else
1199 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1200 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1201 #endif
1202
1203
1204 /* =========================================================
1205  * BEGIN edit_distance stuff.
1206  *
1207  * This calculates how many single character changes of any type are needed to
1208  * transform a string into another one.  It is taken from version 3.1 of
1209  *
1210  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1211  */
1212
1213 /* Our unsorted dictionary linked list.   */
1214 /* Note we use UVs, not chars. */
1215
1216 struct dictionary{
1217   UV key;
1218   UV value;
1219   struct dictionary* next;
1220 };
1221 typedef struct dictionary item;
1222
1223
1224 PERL_STATIC_INLINE item*
1225 push(UV key, item* curr)
1226 {
1227     item* head;
1228     Newx(head, 1, item);
1229     head->key = key;
1230     head->value = 0;
1231     head->next = curr;
1232     return head;
1233 }
1234
1235
1236 PERL_STATIC_INLINE item*
1237 find(item* head, UV key)
1238 {
1239     item* iterator = head;
1240     while (iterator){
1241         if (iterator->key == key){
1242             return iterator;
1243         }
1244         iterator = iterator->next;
1245     }
1246
1247     return NULL;
1248 }
1249
1250 PERL_STATIC_INLINE item*
1251 uniquePush(item* head, UV key)
1252 {
1253     item* iterator = head;
1254
1255     while (iterator){
1256         if (iterator->key == key) {
1257             return head;
1258         }
1259         iterator = iterator->next;
1260     }
1261
1262     return push(key, head);
1263 }
1264
1265 PERL_STATIC_INLINE void
1266 dict_free(item* head)
1267 {
1268     item* iterator = head;
1269
1270     while (iterator) {
1271         item* temp = iterator;
1272         iterator = iterator->next;
1273         Safefree(temp);
1274     }
1275
1276     head = NULL;
1277 }
1278
1279 /* End of Dictionary Stuff */
1280
1281 /* All calculations/work are done here */
1282 STATIC int
1283 S_edit_distance(const UV* src,
1284                 const UV* tgt,
1285                 const STRLEN x,             /* length of src[] */
1286                 const STRLEN y,             /* length of tgt[] */
1287                 const SSize_t maxDistance
1288 )
1289 {
1290     item *head = NULL;
1291     UV swapCount, swapScore, targetCharCount, i, j;
1292     UV *scores;
1293     UV score_ceil = x + y;
1294
1295     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1296
1297     /* intialize matrix start values */
1298     Newx(scores, ( (x + 2) * (y + 2)), UV);
1299     scores[0] = score_ceil;
1300     scores[1 * (y + 2) + 0] = score_ceil;
1301     scores[0 * (y + 2) + 1] = score_ceil;
1302     scores[1 * (y + 2) + 1] = 0;
1303     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1304
1305     /* work loops    */
1306     /* i = src index */
1307     /* j = tgt index */
1308     for (i=1;i<=x;i++) {
1309         if (i < x)
1310             head = uniquePush(head, src[i]);
1311         scores[(i+1) * (y + 2) + 1] = i;
1312         scores[(i+1) * (y + 2) + 0] = score_ceil;
1313         swapCount = 0;
1314
1315         for (j=1;j<=y;j++) {
1316             if (i == 1) {
1317                 if(j < y)
1318                 head = uniquePush(head, tgt[j]);
1319                 scores[1 * (y + 2) + (j + 1)] = j;
1320                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1321             }
1322
1323             targetCharCount = find(head, tgt[j-1])->value;
1324             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1325
1326             if (src[i-1] != tgt[j-1]){
1327                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1328             }
1329             else {
1330                 swapCount = j;
1331                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1332             }
1333         }
1334
1335         find(head, src[i-1])->value = i;
1336     }
1337
1338     {
1339         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1340         dict_free(head);
1341         Safefree(scores);
1342         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1343     }
1344 }
1345
1346 /* END of edit_distance() stuff
1347  * ========================================================= */
1348
1349 /* is c a control character for which we have a mnemonic? */
1350 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1351
1352 STATIC const char *
1353 S_cntrl_to_mnemonic(const U8 c)
1354 {
1355     /* Returns the mnemonic string that represents character 'c', if one
1356      * exists; NULL otherwise.  The only ones that exist for the purposes of
1357      * this routine are a few control characters */
1358
1359     switch (c) {
1360         case '\a':       return "\\a";
1361         case '\b':       return "\\b";
1362         case ESC_NATIVE: return "\\e";
1363         case '\f':       return "\\f";
1364         case '\n':       return "\\n";
1365         case '\r':       return "\\r";
1366         case '\t':       return "\\t";
1367     }
1368
1369     return NULL;
1370 }
1371
1372 /* Mark that we cannot extend a found fixed substring at this point.
1373    Update the longest found anchored substring or the longest found
1374    floating substrings if needed. */
1375
1376 STATIC void
1377 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1378                     SSize_t *minlenp, int is_inf)
1379 {
1380     const STRLEN l = CHR_SVLEN(data->last_found);
1381     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1382     const STRLEN old_l = CHR_SVLEN(longest_sv);
1383     GET_RE_DEBUG_FLAGS_DECL;
1384
1385     PERL_ARGS_ASSERT_SCAN_COMMIT;
1386
1387     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1388         const U8 i = data->cur_is_floating;
1389         SvSetMagicSV(longest_sv, data->last_found);
1390         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1391
1392         if (!i) /* fixed */
1393             data->substrs[0].max_offset = data->substrs[0].min_offset;
1394         else { /* float */
1395             data->substrs[1].max_offset = (l
1396                           ? data->last_start_max
1397                           : (data->pos_delta > SSize_t_MAX - data->pos_min
1398                                          ? SSize_t_MAX
1399                                          : data->pos_min + data->pos_delta));
1400             if (is_inf
1401                  || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1402                 data->substrs[1].max_offset = SSize_t_MAX;
1403         }
1404
1405         if (data->flags & SF_BEFORE_EOL)
1406             data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1407         else
1408             data->substrs[i].flags &= ~SF_BEFORE_EOL;
1409         data->substrs[i].minlenp = minlenp;
1410         data->substrs[i].lookbehind = 0;
1411     }
1412
1413     SvCUR_set(data->last_found, 0);
1414     {
1415         SV * const sv = data->last_found;
1416         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1417             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1418             if (mg)
1419                 mg->mg_len = 0;
1420         }
1421     }
1422     data->last_end = -1;
1423     data->flags &= ~SF_BEFORE_EOL;
1424     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1425 }
1426
1427 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1428  * list that describes which code points it matches */
1429
1430 STATIC void
1431 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1432 {
1433     /* Set the SSC 'ssc' to match an empty string or any code point */
1434
1435     PERL_ARGS_ASSERT_SSC_ANYTHING;
1436
1437     assert(is_ANYOF_SYNTHETIC(ssc));
1438
1439     /* mortalize so won't leak */
1440     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1441     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1442 }
1443
1444 STATIC int
1445 S_ssc_is_anything(const regnode_ssc *ssc)
1446 {
1447     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1448      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1449      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1450      * in any way, so there's no point in using it */
1451
1452     UV start, end;
1453     bool ret;
1454
1455     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1456
1457     assert(is_ANYOF_SYNTHETIC(ssc));
1458
1459     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1460         return FALSE;
1461     }
1462
1463     /* See if the list consists solely of the range 0 - Infinity */
1464     invlist_iterinit(ssc->invlist);
1465     ret = invlist_iternext(ssc->invlist, &start, &end)
1466           && start == 0
1467           && end == UV_MAX;
1468
1469     invlist_iterfinish(ssc->invlist);
1470
1471     if (ret) {
1472         return TRUE;
1473     }
1474
1475     /* If e.g., both \w and \W are set, matches everything */
1476     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1477         int i;
1478         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1479             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1480                 return TRUE;
1481             }
1482         }
1483     }
1484
1485     return FALSE;
1486 }
1487
1488 STATIC void
1489 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1490 {
1491     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1492      * string, any code point, or any posix class under locale */
1493
1494     PERL_ARGS_ASSERT_SSC_INIT;
1495
1496     Zero(ssc, 1, regnode_ssc);
1497     set_ANYOF_SYNTHETIC(ssc);
1498     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1499     ssc_anything(ssc);
1500
1501     /* If any portion of the regex is to operate under locale rules that aren't
1502      * fully known at compile time, initialization includes it.  The reason
1503      * this isn't done for all regexes is that the optimizer was written under
1504      * the assumption that locale was all-or-nothing.  Given the complexity and
1505      * lack of documentation in the optimizer, and that there are inadequate
1506      * test cases for locale, many parts of it may not work properly, it is
1507      * safest to avoid locale unless necessary. */
1508     if (RExC_contains_locale) {
1509         ANYOF_POSIXL_SETALL(ssc);
1510     }
1511     else {
1512         ANYOF_POSIXL_ZERO(ssc);
1513     }
1514 }
1515
1516 STATIC int
1517 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1518                         const regnode_ssc *ssc)
1519 {
1520     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1521      * to the list of code points matched, and locale posix classes; hence does
1522      * not check its flags) */
1523
1524     UV start, end;
1525     bool ret;
1526
1527     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1528
1529     assert(is_ANYOF_SYNTHETIC(ssc));
1530
1531     invlist_iterinit(ssc->invlist);
1532     ret = invlist_iternext(ssc->invlist, &start, &end)
1533           && start == 0
1534           && end == UV_MAX;
1535
1536     invlist_iterfinish(ssc->invlist);
1537
1538     if (! ret) {
1539         return FALSE;
1540     }
1541
1542     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1543         return FALSE;
1544     }
1545
1546     return TRUE;
1547 }
1548
1549 STATIC SV*
1550 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1551                                const regnode_charclass* const node)
1552 {
1553     /* Returns a mortal inversion list defining which code points are matched
1554      * by 'node', which is of type ANYOF.  Handles complementing the result if
1555      * appropriate.  If some code points aren't knowable at this time, the
1556      * returned list must, and will, contain every code point that is a
1557      * possibility. */
1558
1559     SV* invlist = NULL;
1560     SV* only_utf8_locale_invlist = NULL;
1561     unsigned int i;
1562     const U32 n = ARG(node);
1563     bool new_node_has_latin1 = FALSE;
1564
1565     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1566
1567     /* Look at the data structure created by S_set_ANYOF_arg() */
1568     if (n != ANYOF_ONLY_HAS_BITMAP) {
1569         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1570         AV * const av = MUTABLE_AV(SvRV(rv));
1571         SV **const ary = AvARRAY(av);
1572         assert(RExC_rxi->data->what[n] == 's');
1573
1574         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1575             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]), NULL));
1576         }
1577         else if (ary[0] && ary[0] != &PL_sv_undef) {
1578
1579             /* Here, no compile-time swash, and there are things that won't be
1580              * known until runtime -- we have to assume it could be anything */
1581             invlist = sv_2mortal(_new_invlist(1));
1582             return _add_range_to_invlist(invlist, 0, UV_MAX);
1583         }
1584         else if (ary[3] && ary[3] != &PL_sv_undef) {
1585
1586             /* Here no compile-time swash, and no run-time only data.  Use the
1587              * node's inversion list */
1588             invlist = sv_2mortal(invlist_clone(ary[3], NULL));
1589         }
1590
1591         /* Get the code points valid only under UTF-8 locales */
1592         if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1593             && ary[2] && ary[2] != &PL_sv_undef)
1594         {
1595             only_utf8_locale_invlist = ary[2];
1596         }
1597     }
1598
1599     if (! invlist) {
1600         invlist = sv_2mortal(_new_invlist(0));
1601     }
1602
1603     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1604      * code points, and an inversion list for the others, but if there are code
1605      * points that should match only conditionally on the target string being
1606      * UTF-8, those are placed in the inversion list, and not the bitmap.
1607      * Since there are circumstances under which they could match, they are
1608      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1609      * to exclude them here, so that when we invert below, the end result
1610      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1611      * have to do this here before we add the unconditionally matched code
1612      * points */
1613     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1614         _invlist_intersection_complement_2nd(invlist,
1615                                              PL_UpperLatin1,
1616                                              &invlist);
1617     }
1618
1619     /* Add in the points from the bit map */
1620     if (OP(node) != ANYOFH) {
1621         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1622             if (ANYOF_BITMAP_TEST(node, i)) {
1623                 unsigned int start = i++;
1624
1625                 for (;    i < NUM_ANYOF_CODE_POINTS
1626                        && ANYOF_BITMAP_TEST(node, i); ++i)
1627                 {
1628                     /* empty */
1629                 }
1630                 invlist = _add_range_to_invlist(invlist, start, i-1);
1631                 new_node_has_latin1 = TRUE;
1632             }
1633         }
1634     }
1635
1636     /* If this can match all upper Latin1 code points, have to add them
1637      * as well.  But don't add them if inverting, as when that gets done below,
1638      * it would exclude all these characters, including the ones it shouldn't
1639      * that were added just above */
1640     if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
1641         && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1642     {
1643         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1644     }
1645
1646     /* Similarly for these */
1647     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1648         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1649     }
1650
1651     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1652         _invlist_invert(invlist);
1653     }
1654     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1655
1656         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1657          * locale.  We can skip this if there are no 0-255 at all. */
1658         _invlist_union(invlist, PL_Latin1, &invlist);
1659     }
1660
1661     /* Similarly add the UTF-8 locale possible matches.  These have to be
1662      * deferred until after the non-UTF-8 locale ones are taken care of just
1663      * above, or it leads to wrong results under ANYOF_INVERT */
1664     if (only_utf8_locale_invlist) {
1665         _invlist_union_maybe_complement_2nd(invlist,
1666                                             only_utf8_locale_invlist,
1667                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1668                                             &invlist);
1669     }
1670
1671     return invlist;
1672 }
1673
1674 /* These two functions currently do the exact same thing */
1675 #define ssc_init_zero           ssc_init
1676
1677 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1678 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1679
1680 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1681  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1682  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1683
1684 STATIC void
1685 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1686                 const regnode_charclass *and_with)
1687 {
1688     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1689      * another SSC or a regular ANYOF class.  Can create false positives. */
1690
1691     SV* anded_cp_list;
1692     U8  anded_flags;
1693
1694     PERL_ARGS_ASSERT_SSC_AND;
1695
1696     assert(is_ANYOF_SYNTHETIC(ssc));
1697
1698     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1699      * the code point inversion list and just the relevant flags */
1700     if (is_ANYOF_SYNTHETIC(and_with)) {
1701         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1702         anded_flags = ANYOF_FLAGS(and_with);
1703
1704         /* XXX This is a kludge around what appears to be deficiencies in the
1705          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1706          * there are paths through the optimizer where it doesn't get weeded
1707          * out when it should.  And if we don't make some extra provision for
1708          * it like the code just below, it doesn't get added when it should.
1709          * This solution is to add it only when AND'ing, which is here, and
1710          * only when what is being AND'ed is the pristine, original node
1711          * matching anything.  Thus it is like adding it to ssc_anything() but
1712          * only when the result is to be AND'ed.  Probably the same solution
1713          * could be adopted for the same problem we have with /l matching,
1714          * which is solved differently in S_ssc_init(), and that would lead to
1715          * fewer false positives than that solution has.  But if this solution
1716          * creates bugs, the consequences are only that a warning isn't raised
1717          * that should be; while the consequences for having /l bugs is
1718          * incorrect matches */
1719         if (ssc_is_anything((regnode_ssc *)and_with)) {
1720             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1721         }
1722     }
1723     else {
1724         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1725         if (OP(and_with) == ANYOFD) {
1726             anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1727         }
1728         else {
1729             anded_flags = ANYOF_FLAGS(and_with)
1730             &( ANYOF_COMMON_FLAGS
1731               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1732               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1733             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1734                 anded_flags &=
1735                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1736             }
1737         }
1738     }
1739
1740     ANYOF_FLAGS(ssc) &= anded_flags;
1741
1742     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1743      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1744      * 'and_with' may be inverted.  When not inverted, we have the situation of
1745      * computing:
1746      *  (C1 | P1) & (C2 | P2)
1747      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1748      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1749      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1750      *                    <=  ((C1 & C2) | P1 | P2)
1751      * Alternatively, the last few steps could be:
1752      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1753      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1754      *                    <=  (C1 | C2 | (P1 & P2))
1755      * We favor the second approach if either P1 or P2 is non-empty.  This is
1756      * because these components are a barrier to doing optimizations, as what
1757      * they match cannot be known until the moment of matching as they are
1758      * dependent on the current locale, 'AND"ing them likely will reduce or
1759      * eliminate them.
1760      * But we can do better if we know that C1,P1 are in their initial state (a
1761      * frequent occurrence), each matching everything:
1762      *  (<everything>) & (C2 | P2) =  C2 | P2
1763      * Similarly, if C2,P2 are in their initial state (again a frequent
1764      * occurrence), the result is a no-op
1765      *  (C1 | P1) & (<everything>) =  C1 | P1
1766      *
1767      * Inverted, we have
1768      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1769      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1770      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1771      * */
1772
1773     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1774         && ! is_ANYOF_SYNTHETIC(and_with))
1775     {
1776         unsigned int i;
1777
1778         ssc_intersection(ssc,
1779                          anded_cp_list,
1780                          FALSE /* Has already been inverted */
1781                          );
1782
1783         /* If either P1 or P2 is empty, the intersection will be also; can skip
1784          * the loop */
1785         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1786             ANYOF_POSIXL_ZERO(ssc);
1787         }
1788         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1789
1790             /* Note that the Posix class component P from 'and_with' actually
1791              * looks like:
1792              *      P = Pa | Pb | ... | Pn
1793              * where each component is one posix class, such as in [\w\s].
1794              * Thus
1795              *      ~P = ~(Pa | Pb | ... | Pn)
1796              *         = ~Pa & ~Pb & ... & ~Pn
1797              *        <= ~Pa | ~Pb | ... | ~Pn
1798              * The last is something we can easily calculate, but unfortunately
1799              * is likely to have many false positives.  We could do better
1800              * in some (but certainly not all) instances if two classes in
1801              * P have known relationships.  For example
1802              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1803              * So
1804              *      :lower: & :print: = :lower:
1805              * And similarly for classes that must be disjoint.  For example,
1806              * since \s and \w can have no elements in common based on rules in
1807              * the POSIX standard,
1808              *      \w & ^\S = nothing
1809              * Unfortunately, some vendor locales do not meet the Posix
1810              * standard, in particular almost everything by Microsoft.
1811              * The loop below just changes e.g., \w into \W and vice versa */
1812
1813             regnode_charclass_posixl temp;
1814             int add = 1;    /* To calculate the index of the complement */
1815
1816             Zero(&temp, 1, regnode_charclass_posixl);
1817             ANYOF_POSIXL_ZERO(&temp);
1818             for (i = 0; i < ANYOF_MAX; i++) {
1819                 assert(i % 2 != 0
1820                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1821                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1822
1823                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1824                     ANYOF_POSIXL_SET(&temp, i + add);
1825                 }
1826                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1827             }
1828             ANYOF_POSIXL_AND(&temp, ssc);
1829
1830         } /* else ssc already has no posixes */
1831     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1832          in its initial state */
1833     else if (! is_ANYOF_SYNTHETIC(and_with)
1834              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1835     {
1836         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1837          * copy it over 'ssc' */
1838         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1839             if (is_ANYOF_SYNTHETIC(and_with)) {
1840                 StructCopy(and_with, ssc, regnode_ssc);
1841             }
1842             else {
1843                 ssc->invlist = anded_cp_list;
1844                 ANYOF_POSIXL_ZERO(ssc);
1845                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1846                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1847                 }
1848             }
1849         }
1850         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1851                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1852         {
1853             /* One or the other of P1, P2 is non-empty. */
1854             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1855                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1856             }
1857             ssc_union(ssc, anded_cp_list, FALSE);
1858         }
1859         else { /* P1 = P2 = empty */
1860             ssc_intersection(ssc, anded_cp_list, FALSE);
1861         }
1862     }
1863 }
1864
1865 STATIC void
1866 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1867                const regnode_charclass *or_with)
1868 {
1869     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1870      * another SSC or a regular ANYOF class.  Can create false positives if
1871      * 'or_with' is to be inverted. */
1872
1873     SV* ored_cp_list;
1874     U8 ored_flags;
1875
1876     PERL_ARGS_ASSERT_SSC_OR;
1877
1878     assert(is_ANYOF_SYNTHETIC(ssc));
1879
1880     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1881      * the code point inversion list and just the relevant flags */
1882     if (is_ANYOF_SYNTHETIC(or_with)) {
1883         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1884         ored_flags = ANYOF_FLAGS(or_with);
1885     }
1886     else {
1887         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1888         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1889         if (OP(or_with) != ANYOFD) {
1890             ored_flags
1891             |= ANYOF_FLAGS(or_with)
1892              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1893                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1894             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1895                 ored_flags |=
1896                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1897             }
1898         }
1899     }
1900
1901     ANYOF_FLAGS(ssc) |= ored_flags;
1902
1903     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1904      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1905      * 'or_with' may be inverted.  When not inverted, we have the simple
1906      * situation of computing:
1907      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1908      * If P1|P2 yields a situation with both a class and its complement are
1909      * set, like having both \w and \W, this matches all code points, and we
1910      * can delete these from the P component of the ssc going forward.  XXX We
1911      * might be able to delete all the P components, but I (khw) am not certain
1912      * about this, and it is better to be safe.
1913      *
1914      * Inverted, we have
1915      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1916      *                         <=  (C1 | P1) | ~C2
1917      *                         <=  (C1 | ~C2) | P1
1918      * (which results in actually simpler code than the non-inverted case)
1919      * */
1920
1921     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1922         && ! is_ANYOF_SYNTHETIC(or_with))
1923     {
1924         /* We ignore P2, leaving P1 going forward */
1925     }   /* else  Not inverted */
1926     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1927         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1928         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1929             unsigned int i;
1930             for (i = 0; i < ANYOF_MAX; i += 2) {
1931                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1932                 {
1933                     ssc_match_all_cp(ssc);
1934                     ANYOF_POSIXL_CLEAR(ssc, i);
1935                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1936                 }
1937             }
1938         }
1939     }
1940
1941     ssc_union(ssc,
1942               ored_cp_list,
1943               FALSE /* Already has been inverted */
1944               );
1945 }
1946
1947 PERL_STATIC_INLINE void
1948 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1949 {
1950     PERL_ARGS_ASSERT_SSC_UNION;
1951
1952     assert(is_ANYOF_SYNTHETIC(ssc));
1953
1954     _invlist_union_maybe_complement_2nd(ssc->invlist,
1955                                         invlist,
1956                                         invert2nd,
1957                                         &ssc->invlist);
1958 }
1959
1960 PERL_STATIC_INLINE void
1961 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1962                          SV* const invlist,
1963                          const bool invert2nd)
1964 {
1965     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1966
1967     assert(is_ANYOF_SYNTHETIC(ssc));
1968
1969     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1970                                                invlist,
1971                                                invert2nd,
1972                                                &ssc->invlist);
1973 }
1974
1975 PERL_STATIC_INLINE void
1976 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1977 {
1978     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1979
1980     assert(is_ANYOF_SYNTHETIC(ssc));
1981
1982     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1983 }
1984
1985 PERL_STATIC_INLINE void
1986 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1987 {
1988     /* AND just the single code point 'cp' into the SSC 'ssc' */
1989
1990     SV* cp_list = _new_invlist(2);
1991
1992     PERL_ARGS_ASSERT_SSC_CP_AND;
1993
1994     assert(is_ANYOF_SYNTHETIC(ssc));
1995
1996     cp_list = add_cp_to_invlist(cp_list, cp);
1997     ssc_intersection(ssc, cp_list,
1998                      FALSE /* Not inverted */
1999                      );
2000     SvREFCNT_dec_NN(cp_list);
2001 }
2002
2003 PERL_STATIC_INLINE void
2004 S_ssc_clear_locale(regnode_ssc *ssc)
2005 {
2006     /* Set the SSC 'ssc' to not match any locale things */
2007     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2008
2009     assert(is_ANYOF_SYNTHETIC(ssc));
2010
2011     ANYOF_POSIXL_ZERO(ssc);
2012     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2013 }
2014
2015 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2016
2017 STATIC bool
2018 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2019 {
2020     /* The synthetic start class is used to hopefully quickly winnow down
2021      * places where a pattern could start a match in the target string.  If it
2022      * doesn't really narrow things down that much, there isn't much point to
2023      * having the overhead of using it.  This function uses some very crude
2024      * heuristics to decide if to use the ssc or not.
2025      *
2026      * It returns TRUE if 'ssc' rules out more than half what it considers to
2027      * be the "likely" possible matches, but of course it doesn't know what the
2028      * actual things being matched are going to be; these are only guesses
2029      *
2030      * For /l matches, it assumes that the only likely matches are going to be
2031      *      in the 0-255 range, uniformly distributed, so half of that is 127
2032      * For /a and /d matches, it assumes that the likely matches will be just
2033      *      the ASCII range, so half of that is 63
2034      * For /u and there isn't anything matching above the Latin1 range, it
2035      *      assumes that that is the only range likely to be matched, and uses
2036      *      half that as the cut-off: 127.  If anything matches above Latin1,
2037      *      it assumes that all of Unicode could match (uniformly), except for
2038      *      non-Unicode code points and things in the General Category "Other"
2039      *      (unassigned, private use, surrogates, controls and formats).  This
2040      *      is a much large number. */
2041
2042     U32 count = 0;      /* Running total of number of code points matched by
2043                            'ssc' */
2044     UV start, end;      /* Start and end points of current range in inversion
2045                            list */
2046     const U32 max_code_points = (LOC)
2047                                 ?  256
2048                                 : ((  ! UNI_SEMANTICS
2049                                     ||  invlist_highest(ssc->invlist) < 256)
2050                                   ? 128
2051                                   : NON_OTHER_COUNT);
2052     const U32 max_match = max_code_points / 2;
2053
2054     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2055
2056     invlist_iterinit(ssc->invlist);
2057     while (invlist_iternext(ssc->invlist, &start, &end)) {
2058         if (start >= max_code_points) {
2059             break;
2060         }
2061         end = MIN(end, max_code_points - 1);
2062         count += end - start + 1;
2063         if (count >= max_match) {
2064             invlist_iterfinish(ssc->invlist);
2065             return FALSE;
2066         }
2067     }
2068
2069     return TRUE;
2070 }
2071
2072
2073 STATIC void
2074 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2075 {
2076     /* The inversion list in the SSC is marked mortal; now we need a more
2077      * permanent copy, which is stored the same way that is done in a regular
2078      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2079      * map */
2080
2081     SV* invlist = invlist_clone(ssc->invlist, NULL);
2082
2083     PERL_ARGS_ASSERT_SSC_FINALIZE;
2084
2085     assert(is_ANYOF_SYNTHETIC(ssc));
2086
2087     /* The code in this file assumes that all but these flags aren't relevant
2088      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2089      * by the time we reach here */
2090     assert(! (ANYOF_FLAGS(ssc)
2091         & ~( ANYOF_COMMON_FLAGS
2092             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2093             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2094
2095     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2096
2097     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
2098                                 NULL, NULL, NULL, FALSE);
2099
2100     /* Make sure is clone-safe */
2101     ssc->invlist = NULL;
2102
2103     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2104         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2105         OP(ssc) = ANYOFPOSIXL;
2106     }
2107     else if (RExC_contains_locale) {
2108         OP(ssc) = ANYOFL;
2109     }
2110
2111     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2112 }
2113
2114 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2115 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2116 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2117 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
2118                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2119                                : 0 )
2120
2121
2122 #ifdef DEBUGGING
2123 /*
2124    dump_trie(trie,widecharmap,revcharmap)
2125    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2126    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2127
2128    These routines dump out a trie in a somewhat readable format.
2129    The _interim_ variants are used for debugging the interim
2130    tables that are used to generate the final compressed
2131    representation which is what dump_trie expects.
2132
2133    Part of the reason for their existence is to provide a form
2134    of documentation as to how the different representations function.
2135
2136 */
2137
2138 /*
2139   Dumps the final compressed table form of the trie to Perl_debug_log.
2140   Used for debugging make_trie().
2141 */
2142
2143 STATIC void
2144 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2145             AV *revcharmap, U32 depth)
2146 {
2147     U32 state;
2148     SV *sv=sv_newmortal();
2149     int colwidth= widecharmap ? 6 : 4;
2150     U16 word;
2151     GET_RE_DEBUG_FLAGS_DECL;
2152
2153     PERL_ARGS_ASSERT_DUMP_TRIE;
2154
2155     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2156         depth+1, "Match","Base","Ofs" );
2157
2158     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2159         SV ** const tmp = av_fetch( revcharmap, state, 0);
2160         if ( tmp ) {
2161             Perl_re_printf( aTHX_  "%*s",
2162                 colwidth,
2163                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2164                             PL_colors[0], PL_colors[1],
2165                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2166                             PERL_PV_ESCAPE_FIRSTCHAR
2167                 )
2168             );
2169         }
2170     }
2171     Perl_re_printf( aTHX_  "\n");
2172     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2173
2174     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2175         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2176     Perl_re_printf( aTHX_  "\n");
2177
2178     for( state = 1 ; state < trie->statecount ; state++ ) {
2179         const U32 base = trie->states[ state ].trans.base;
2180
2181         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2182
2183         if ( trie->states[ state ].wordnum ) {
2184             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2185         } else {
2186             Perl_re_printf( aTHX_  "%6s", "" );
2187         }
2188
2189         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2190
2191         if ( base ) {
2192             U32 ofs = 0;
2193
2194             while( ( base + ofs  < trie->uniquecharcount ) ||
2195                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2196                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2197                                                                     != state))
2198                     ofs++;
2199
2200             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2201
2202             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2203                 if ( ( base + ofs >= trie->uniquecharcount )
2204                         && ( base + ofs - trie->uniquecharcount
2205                                                         < trie->lasttrans )
2206                         && trie->trans[ base + ofs
2207                                     - trie->uniquecharcount ].check == state )
2208                 {
2209                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2210                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2211                    );
2212                 } else {
2213                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
2214                 }
2215             }
2216
2217             Perl_re_printf( aTHX_  "]");
2218
2219         }
2220         Perl_re_printf( aTHX_  "\n" );
2221     }
2222     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2223                                 depth);
2224     for (word=1; word <= trie->wordcount; word++) {
2225         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2226             (int)word, (int)(trie->wordinfo[word].prev),
2227             (int)(trie->wordinfo[word].len));
2228     }
2229     Perl_re_printf( aTHX_  "\n" );
2230 }
2231 /*
2232   Dumps a fully constructed but uncompressed trie in list form.
2233   List tries normally only are used for construction when the number of
2234   possible chars (trie->uniquecharcount) is very high.
2235   Used for debugging make_trie().
2236 */
2237 STATIC void
2238 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2239                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
2240                          U32 depth)
2241 {
2242     U32 state;
2243     SV *sv=sv_newmortal();
2244     int colwidth= widecharmap ? 6 : 4;
2245     GET_RE_DEBUG_FLAGS_DECL;
2246
2247     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2248
2249     /* print out the table precompression.  */
2250     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2251             depth+1 );
2252     Perl_re_indentf( aTHX_  "%s",
2253             depth+1, "------:-----+-----------------\n" );
2254
2255     for( state=1 ; state < next_alloc ; state ++ ) {
2256         U16 charid;
2257
2258         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2259             depth+1, (UV)state  );
2260         if ( ! trie->states[ state ].wordnum ) {
2261             Perl_re_printf( aTHX_  "%5s| ","");
2262         } else {
2263             Perl_re_printf( aTHX_  "W%4x| ",
2264                 trie->states[ state ].wordnum
2265             );
2266         }
2267         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2268             SV ** const tmp = av_fetch( revcharmap,
2269                                         TRIE_LIST_ITEM(state, charid).forid, 0);
2270             if ( tmp ) {
2271                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2272                     colwidth,
2273                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2274                               colwidth,
2275                               PL_colors[0], PL_colors[1],
2276                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2277                               | PERL_PV_ESCAPE_FIRSTCHAR
2278                     ) ,
2279                     TRIE_LIST_ITEM(state, charid).forid,
2280                     (UV)TRIE_LIST_ITEM(state, charid).newstate
2281                 );
2282                 if (!(charid % 10))
2283                     Perl_re_printf( aTHX_  "\n%*s| ",
2284                         (int)((depth * 2) + 14), "");
2285             }
2286         }
2287         Perl_re_printf( aTHX_  "\n");
2288     }
2289 }
2290
2291 /*
2292   Dumps a fully constructed but uncompressed trie in table form.
2293   This is the normal DFA style state transition table, with a few
2294   twists to facilitate compression later.
2295   Used for debugging make_trie().
2296 */
2297 STATIC void
2298 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2299                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
2300                           U32 depth)
2301 {
2302     U32 state;
2303     U16 charid;
2304     SV *sv=sv_newmortal();
2305     int colwidth= widecharmap ? 6 : 4;
2306     GET_RE_DEBUG_FLAGS_DECL;
2307
2308     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2309
2310     /*
2311        print out the table precompression so that we can do a visual check
2312        that they are identical.
2313      */
2314
2315     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2316
2317     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2318         SV ** const tmp = av_fetch( revcharmap, charid, 0);
2319         if ( tmp ) {
2320             Perl_re_printf( aTHX_  "%*s",
2321                 colwidth,
2322                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2323                             PL_colors[0], PL_colors[1],
2324                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2325                             PERL_PV_ESCAPE_FIRSTCHAR
2326                 )
2327             );
2328         }
2329     }
2330
2331     Perl_re_printf( aTHX_ "\n");
2332     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2333
2334     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2335         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2336     }
2337
2338     Perl_re_printf( aTHX_  "\n" );
2339
2340     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2341
2342         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2343             depth+1,
2344             (UV)TRIE_NODENUM( state ) );
2345
2346         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2347             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2348             if (v)
2349                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2350             else
2351                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2352         }
2353         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2354             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2355                                             (UV)trie->trans[ state ].check );
2356         } else {
2357             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2358                                             (UV)trie->trans[ state ].check,
2359             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2360         }
2361     }
2362 }
2363
2364 #endif
2365
2366
2367 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2368   startbranch: the first branch in the whole branch sequence
2369   first      : start branch of sequence of branch-exact nodes.
2370                May be the same as startbranch
2371   last       : Thing following the last branch.
2372                May be the same as tail.
2373   tail       : item following the branch sequence
2374   count      : words in the sequence
2375   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2376   depth      : indent depth
2377
2378 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2379
2380 A trie is an N'ary tree where the branches are determined by digital
2381 decomposition of the key. IE, at the root node you look up the 1st character and
2382 follow that branch repeat until you find the end of the branches. Nodes can be
2383 marked as "accepting" meaning they represent a complete word. Eg:
2384
2385   /he|she|his|hers/
2386
2387 would convert into the following structure. Numbers represent states, letters
2388 following numbers represent valid transitions on the letter from that state, if
2389 the number is in square brackets it represents an accepting state, otherwise it
2390 will be in parenthesis.
2391
2392       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2393       |    |
2394       |   (2)
2395       |    |
2396      (1)   +-i->(6)-+-s->[7]
2397       |
2398       +-s->(3)-+-h->(4)-+-e->[5]
2399
2400       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2401
2402 This shows that when matching against the string 'hers' we will begin at state 1
2403 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2404 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2405 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2406 single traverse. We store a mapping from accepting to state to which word was
2407 matched, and then when we have multiple possibilities we try to complete the
2408 rest of the regex in the order in which they occurred in the alternation.
2409
2410 The only prior NFA like behaviour that would be changed by the TRIE support is
2411 the silent ignoring of duplicate alternations which are of the form:
2412
2413  / (DUPE|DUPE) X? (?{ ... }) Y /x
2414
2415 Thus EVAL blocks following a trie may be called a different number of times with
2416 and without the optimisation. With the optimisations dupes will be silently
2417 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2418 the following demonstrates:
2419
2420  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2421
2422 which prints out 'word' three times, but
2423
2424  'words'=~/(word|word|word)(?{ print $1 })S/
2425
2426 which doesnt print it out at all. This is due to other optimisations kicking in.
2427
2428 Example of what happens on a structural level:
2429
2430 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2431
2432    1: CURLYM[1] {1,32767}(18)
2433    5:   BRANCH(8)
2434    6:     EXACT <ac>(16)
2435    8:   BRANCH(11)
2436    9:     EXACT <ad>(16)
2437   11:   BRANCH(14)
2438   12:     EXACT <ab>(16)
2439   16:   SUCCEED(0)
2440   17:   NOTHING(18)
2441   18: END(0)
2442
2443 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2444 and should turn into:
2445
2446    1: CURLYM[1] {1,32767}(18)
2447    5:   TRIE(16)
2448         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2449           <ac>
2450           <ad>
2451           <ab>
2452   16:   SUCCEED(0)
2453   17:   NOTHING(18)
2454   18: END(0)
2455
2456 Cases where tail != last would be like /(?foo|bar)baz/:
2457
2458    1: BRANCH(4)
2459    2:   EXACT <foo>(8)
2460    4: BRANCH(7)
2461    5:   EXACT <bar>(8)
2462    7: TAIL(8)
2463    8: EXACT <baz>(10)
2464   10: END(0)
2465
2466 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2467 and would end up looking like:
2468
2469     1: TRIE(8)
2470       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2471         <foo>
2472         <bar>
2473    7: TAIL(8)
2474    8: EXACT <baz>(10)
2475   10: END(0)
2476
2477     d = uvchr_to_utf8_flags(d, uv, 0);
2478
2479 is the recommended Unicode-aware way of saying
2480
2481     *(d++) = uv;
2482 */
2483
2484 #define TRIE_STORE_REVCHAR(val)                                            \
2485     STMT_START {                                                           \
2486         if (UTF) {                                                         \
2487             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2488             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2489             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2490             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2491             SvPOK_on(zlopp);                                               \
2492             SvUTF8_on(zlopp);                                              \
2493             av_push(revcharmap, zlopp);                                    \
2494         } else {                                                           \
2495             char ooooff = (char)val;                                           \
2496             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2497         }                                                                  \
2498         } STMT_END
2499
2500 /* This gets the next character from the input, folding it if not already
2501  * folded. */
2502 #define TRIE_READ_CHAR STMT_START {                                           \
2503     wordlen++;                                                                \
2504     if ( UTF ) {                                                              \
2505         /* if it is UTF then it is either already folded, or does not need    \
2506          * folding */                                                         \
2507         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2508     }                                                                         \
2509     else if (folder == PL_fold_latin1) {                                      \
2510         /* This folder implies Unicode rules, which in the range expressible  \
2511          *  by not UTF is the lower case, with the two exceptions, one of     \
2512          *  which should have been taken care of before calling this */       \
2513         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2514         uvc = toLOWER_L1(*uc);                                                \
2515         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2516         len = 1;                                                              \
2517     } else {                                                                  \
2518         /* raw data, will be folded later if needed */                        \
2519         uvc = (U32)*uc;                                                       \
2520         len = 1;                                                              \
2521     }                                                                         \
2522 } STMT_END
2523
2524
2525
2526 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2527     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2528         U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2529         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2530         TRIE_LIST_LEN( state ) = ging;                          \
2531     }                                                           \
2532     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2533     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2534     TRIE_LIST_CUR( state )++;                                   \
2535 } STMT_END
2536
2537 #define TRIE_LIST_NEW(state) STMT_START {                       \
2538     Newx( trie->states[ state ].trans.list,                     \
2539         4, reg_trie_trans_le );                                 \
2540      TRIE_LIST_CUR( state ) = 1;                                \
2541      TRIE_LIST_LEN( state ) = 4;                                \
2542 } STMT_END
2543
2544 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2545     U16 dupe= trie->states[ state ].wordnum;                    \
2546     regnode * const noper_next = regnext( noper );              \
2547                                                                 \
2548     DEBUG_r({                                                   \
2549         /* store the word for dumping */                        \
2550         SV* tmp;                                                \
2551         if (OP(noper) != NOTHING)                               \
2552             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2553         else                                                    \
2554             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2555         av_push( trie_words, tmp );                             \
2556     });                                                         \
2557                                                                 \
2558     curword++;                                                  \
2559     trie->wordinfo[curword].prev   = 0;                         \
2560     trie->wordinfo[curword].len    = wordlen;                   \
2561     trie->wordinfo[curword].accept = state;                     \
2562                                                                 \
2563     if ( noper_next < tail ) {                                  \
2564         if (!trie->jump)                                        \
2565             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2566                                                  sizeof(U16) ); \
2567         trie->jump[curword] = (U16)(noper_next - convert);      \
2568         if (!jumper)                                            \
2569             jumper = noper_next;                                \
2570         if (!nextbranch)                                        \
2571             nextbranch= regnext(cur);                           \
2572     }                                                           \
2573                                                                 \
2574     if ( dupe ) {                                               \
2575         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2576         /* chain, so that when the bits of chain are later    */\
2577         /* linked together, the dups appear in the chain      */\
2578         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2579         trie->wordinfo[dupe].prev = curword;                    \
2580     } else {                                                    \
2581         /* we haven't inserted this word yet.                */ \
2582         trie->states[ state ].wordnum = curword;                \
2583     }                                                           \
2584 } STMT_END
2585
2586
2587 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2588      ( ( base + charid >=  ucharcount                                   \
2589          && base + charid < ubound                                      \
2590          && state == trie->trans[ base - ucharcount + charid ].check    \
2591          && trie->trans[ base - ucharcount + charid ].next )            \
2592            ? trie->trans[ base - ucharcount + charid ].next             \
2593            : ( state==1 ? special : 0 )                                 \
2594       )
2595
2596 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2597 STMT_START {                                                \
2598     TRIE_BITMAP_SET(trie, uvc);                             \
2599     /* store the folded codepoint */                        \
2600     if ( folder )                                           \
2601         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2602                                                             \
2603     if ( !UTF ) {                                           \
2604         /* store first byte of utf8 representation of */    \
2605         /* variant codepoints */                            \
2606         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2607             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2608         }                                                   \
2609     }                                                       \
2610 } STMT_END
2611 #define MADE_TRIE       1
2612 #define MADE_JUMP_TRIE  2
2613 #define MADE_EXACT_TRIE 4
2614
2615 STATIC I32
2616 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2617                   regnode *first, regnode *last, regnode *tail,
2618                   U32 word_count, U32 flags, U32 depth)
2619 {
2620     /* first pass, loop through and scan words */
2621     reg_trie_data *trie;
2622     HV *widecharmap = NULL;
2623     AV *revcharmap = newAV();
2624     regnode *cur;
2625     STRLEN len = 0;
2626     UV uvc = 0;
2627     U16 curword = 0;
2628     U32 next_alloc = 0;
2629     regnode *jumper = NULL;
2630     regnode *nextbranch = NULL;
2631     regnode *convert = NULL;
2632     U32 *prev_states; /* temp array mapping each state to previous one */
2633     /* we just use folder as a flag in utf8 */
2634     const U8 * folder = NULL;
2635
2636     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2637      * which stands for one trie structure, one hash, optionally followed
2638      * by two arrays */
2639 #ifdef DEBUGGING
2640     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2641     AV *trie_words = NULL;
2642     /* along with revcharmap, this only used during construction but both are
2643      * useful during debugging so we store them in the struct when debugging.
2644      */
2645 #else
2646     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2647     STRLEN trie_charcount=0;
2648 #endif
2649     SV *re_trie_maxbuff;
2650     GET_RE_DEBUG_FLAGS_DECL;
2651
2652     PERL_ARGS_ASSERT_MAKE_TRIE;
2653 #ifndef DEBUGGING
2654     PERL_UNUSED_ARG(depth);
2655 #endif
2656
2657     switch (flags) {
2658         case EXACT: case EXACT_ONLY8: case EXACTL: break;
2659         case EXACTFAA:
2660         case EXACTFUP:
2661         case EXACTFU:
2662         case EXACTFLU8: folder = PL_fold_latin1; break;
2663         case EXACTF:  folder = PL_fold; break;
2664         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2665     }
2666
2667     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2668     trie->refcount = 1;
2669     trie->startstate = 1;
2670     trie->wordcount = word_count;
2671     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2672     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2673     if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
2674         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2675     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2676                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2677
2678     DEBUG_r({
2679         trie_words = newAV();
2680     });
2681
2682     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2683     assert(re_trie_maxbuff);
2684     if (!SvIOK(re_trie_maxbuff)) {
2685         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2686     }
2687     DEBUG_TRIE_COMPILE_r({
2688         Perl_re_indentf( aTHX_
2689           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2690           depth+1,
2691           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2692           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2693     });
2694
2695    /* Find the node we are going to overwrite */
2696     if ( first == startbranch && OP( last ) != BRANCH ) {
2697         /* whole branch chain */
2698         convert = first;
2699     } else {
2700         /* branch sub-chain */
2701         convert = NEXTOPER( first );
2702     }
2703
2704     /*  -- First loop and Setup --
2705
2706        We first traverse the branches and scan each word to determine if it
2707        contains widechars, and how many unique chars there are, this is
2708        important as we have to build a table with at least as many columns as we
2709        have unique chars.
2710
2711        We use an array of integers to represent the character codes 0..255
2712        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2713        the native representation of the character value as the key and IV's for
2714        the coded index.
2715
2716        *TODO* If we keep track of how many times each character is used we can
2717        remap the columns so that the table compression later on is more
2718        efficient in terms of memory by ensuring the most common value is in the
2719        middle and the least common are on the outside.  IMO this would be better
2720        than a most to least common mapping as theres a decent chance the most
2721        common letter will share a node with the least common, meaning the node
2722        will not be compressible. With a middle is most common approach the worst
2723        case is when we have the least common nodes twice.
2724
2725      */
2726
2727     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2728         regnode *noper = NEXTOPER( cur );
2729         const U8 *uc;
2730         const U8 *e;
2731         int foldlen = 0;
2732         U32 wordlen      = 0;         /* required init */
2733         STRLEN minchars = 0;
2734         STRLEN maxchars = 0;
2735         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2736                                                bitmap?*/
2737
2738         if (OP(noper) == NOTHING) {
2739             /* skip past a NOTHING at the start of an alternation
2740              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2741              */
2742             regnode *noper_next= regnext(noper);
2743             if (noper_next < tail)
2744                 noper= noper_next;
2745         }
2746
2747         if (    noper < tail
2748             && (    OP(noper) == flags
2749                 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
2750                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
2751                                          || OP(noper) == EXACTFUP))))
2752         {
2753             uc= (U8*)STRING(noper);
2754             e= uc + STR_LEN(noper);
2755         } else {
2756             trie->minlen= 0;
2757             continue;
2758         }
2759
2760
2761         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2762             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2763                                           regardless of encoding */
2764             if (OP( noper ) == EXACTFUP) {
2765                 /* false positives are ok, so just set this */
2766                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2767             }
2768         }
2769
2770         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2771                                            branch */
2772             TRIE_CHARCOUNT(trie)++;
2773             TRIE_READ_CHAR;
2774
2775             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2776              * is in effect.  Under /i, this character can match itself, or
2777              * anything that folds to it.  If not under /i, it can match just
2778              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2779              * all fold to k, and all are single characters.   But some folds
2780              * expand to more than one character, so for example LATIN SMALL
2781              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2782              * the string beginning at 'uc' is 'ffi', it could be matched by
2783              * three characters, or just by the one ligature character. (It
2784              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2785              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2786              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2787              * match.)  The trie needs to know the minimum and maximum number
2788              * of characters that could match so that it can use size alone to
2789              * quickly reject many match attempts.  The max is simple: it is
2790              * the number of folded characters in this branch (since a fold is
2791              * never shorter than what folds to it. */
2792
2793             maxchars++;
2794
2795             /* And the min is equal to the max if not under /i (indicated by
2796              * 'folder' being NULL), or there are no multi-character folds.  If
2797              * there is a multi-character fold, the min is incremented just
2798              * once, for the character that folds to the sequence.  Each
2799              * character in the sequence needs to be added to the list below of
2800              * characters in the trie, but we count only the first towards the
2801              * min number of characters needed.  This is done through the
2802              * variable 'foldlen', which is returned by the macros that look
2803              * for these sequences as the number of bytes the sequence
2804              * occupies.  Each time through the loop, we decrement 'foldlen' by
2805              * how many bytes the current char occupies.  Only when it reaches
2806              * 0 do we increment 'minchars' or look for another multi-character
2807              * sequence. */
2808             if (folder == NULL) {
2809                 minchars++;
2810             }
2811             else if (foldlen > 0) {
2812                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2813             }
2814             else {
2815                 minchars++;
2816
2817                 /* See if *uc is the beginning of a multi-character fold.  If
2818                  * so, we decrement the length remaining to look at, to account
2819                  * for the current character this iteration.  (We can use 'uc'
2820                  * instead of the fold returned by TRIE_READ_CHAR because for
2821                  * non-UTF, the latin1_safe macro is smart enough to account
2822                  * for all the unfolded characters, and because for UTF, the
2823                  * string will already have been folded earlier in the
2824                  * compilation process */
2825                 if (UTF) {
2826                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2827                         foldlen -= UTF8SKIP(uc);
2828                     }
2829                 }
2830                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2831                     foldlen--;
2832                 }
2833             }
2834
2835             /* The current character (and any potential folds) should be added
2836              * to the possible matching characters for this position in this
2837              * branch */
2838             if ( uvc < 256 ) {
2839                 if ( folder ) {
2840                     U8 folded= folder[ (U8) uvc ];
2841                     if ( !trie->charmap[ folded ] ) {
2842                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2843                         TRIE_STORE_REVCHAR( folded );
2844                     }
2845                 }
2846                 if ( !trie->charmap[ uvc ] ) {
2847                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2848                     TRIE_STORE_REVCHAR( uvc );
2849                 }
2850                 if ( set_bit ) {
2851                     /* store the codepoint in the bitmap, and its folded
2852                      * equivalent. */
2853                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2854                     set_bit = 0; /* We've done our bit :-) */
2855                 }
2856             } else {
2857
2858                 /* XXX We could come up with the list of code points that fold
2859                  * to this using PL_utf8_foldclosures, except not for
2860                  * multi-char folds, as there may be multiple combinations
2861                  * there that could work, which needs to wait until runtime to
2862                  * resolve (The comment about LIGATURE FFI above is such an
2863                  * example */
2864
2865                 SV** svpp;
2866                 if ( !widecharmap )
2867                     widecharmap = newHV();
2868
2869                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2870
2871                 if ( !svpp )
2872                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2873
2874                 if ( !SvTRUE( *svpp ) ) {
2875                     sv_setiv( *svpp, ++trie->uniquecharcount );
2876                     TRIE_STORE_REVCHAR(uvc);
2877                 }
2878             }
2879         } /* end loop through characters in this branch of the trie */
2880
2881         /* We take the min and max for this branch and combine to find the min
2882          * and max for all branches processed so far */
2883         if( cur == first ) {
2884             trie->minlen = minchars;
2885             trie->maxlen = maxchars;
2886         } else if (minchars < trie->minlen) {
2887             trie->minlen = minchars;
2888         } else if (maxchars > trie->maxlen) {
2889             trie->maxlen = maxchars;
2890         }
2891     } /* end first pass */
2892     DEBUG_TRIE_COMPILE_r(
2893         Perl_re_indentf( aTHX_
2894                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2895                 depth+1,
2896                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2897                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2898                 (int)trie->minlen, (int)trie->maxlen )
2899     );
2900
2901     /*
2902         We now know what we are dealing with in terms of unique chars and
2903         string sizes so we can calculate how much memory a naive
2904         representation using a flat table  will take. If it's over a reasonable
2905         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2906         conservative but potentially much slower representation using an array
2907         of lists.
2908
2909         At the end we convert both representations into the same compressed
2910         form that will be used in regexec.c for matching with. The latter
2911         is a form that cannot be used to construct with but has memory
2912         properties similar to the list form and access properties similar
2913         to the table form making it both suitable for fast searches and
2914         small enough that its feasable to store for the duration of a program.
2915
2916         See the comment in the code where the compressed table is produced
2917         inplace from the flat tabe representation for an explanation of how
2918         the compression works.
2919
2920     */
2921
2922
2923     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2924     prev_states[1] = 0;
2925
2926     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2927                                                     > SvIV(re_trie_maxbuff) )
2928     {
2929         /*
2930             Second Pass -- Array Of Lists Representation
2931
2932             Each state will be represented by a list of charid:state records
2933             (reg_trie_trans_le) the first such element holds the CUR and LEN
2934             points of the allocated array. (See defines above).
2935
2936             We build the initial structure using the lists, and then convert
2937             it into the compressed table form which allows faster lookups
2938             (but cant be modified once converted).
2939         */
2940
2941         STRLEN transcount = 1;
2942
2943         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
2944             depth+1));
2945
2946         trie->states = (reg_trie_state *)
2947             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2948                                   sizeof(reg_trie_state) );
2949         TRIE_LIST_NEW(1);
2950         next_alloc = 2;
2951
2952         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2953
2954             regnode *noper   = NEXTOPER( cur );
2955             U32 state        = 1;         /* required init */
2956             U16 charid       = 0;         /* sanity init */
2957             U32 wordlen      = 0;         /* required init */
2958
2959             if (OP(noper) == NOTHING) {
2960                 regnode *noper_next= regnext(noper);
2961                 if (noper_next < tail)
2962                     noper= noper_next;
2963             }
2964
2965             if (    noper < tail
2966                 && (    OP(noper) == flags
2967                     || (flags == EXACT && OP(noper) == EXACT_ONLY8)
2968                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
2969                                              || OP(noper) == EXACTFUP))))
2970             {
2971                 const U8 *uc= (U8*)STRING(noper);
2972                 const U8 *e= uc + STR_LEN(noper);
2973
2974                 for ( ; uc < e ; uc += len ) {
2975
2976                     TRIE_READ_CHAR;
2977
2978                     if ( uvc < 256 ) {
2979                         charid = trie->charmap[ uvc ];
2980                     } else {
2981                         SV** const svpp = hv_fetch( widecharmap,
2982                                                     (char*)&uvc,
2983                                                     sizeof( UV ),
2984                                                     0);
2985                         if ( !svpp ) {
2986                             charid = 0;
2987                         } else {
2988                             charid=(U16)SvIV( *svpp );
2989                         }
2990                     }
2991                     /* charid is now 0 if we dont know the char read, or
2992                      * nonzero if we do */
2993                     if ( charid ) {
2994
2995                         U16 check;
2996                         U32 newstate = 0;
2997
2998                         charid--;
2999                         if ( !trie->states[ state ].trans.list ) {
3000                             TRIE_LIST_NEW( state );
3001                         }
3002                         for ( check = 1;
3003                               check <= TRIE_LIST_USED( state );
3004                               check++ )
3005                         {
3006                             if ( TRIE_LIST_ITEM( state, check ).forid
3007                                                                     == charid )
3008                             {
3009                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3010                                 break;
3011                             }
3012                         }
3013                         if ( ! newstate ) {
3014                             newstate = next_alloc++;
3015                             prev_states[newstate] = state;
3016                             TRIE_LIST_PUSH( state, charid, newstate );
3017                             transcount++;
3018                         }
3019                         state = newstate;
3020                     } else {
3021                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3022                     }
3023                 }
3024             }
3025             TRIE_HANDLE_WORD(state);
3026
3027         } /* end second pass */
3028
3029         /* next alloc is the NEXT state to be allocated */
3030         trie->statecount = next_alloc;
3031         trie->states = (reg_trie_state *)
3032             PerlMemShared_realloc( trie->states,
3033                                    next_alloc
3034                                    * sizeof(reg_trie_state) );
3035
3036         /* and now dump it out before we compress it */
3037         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3038                                                          revcharmap, next_alloc,
3039                                                          depth+1)
3040         );
3041
3042         trie->trans = (reg_trie_trans *)
3043             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3044         {
3045             U32 state;
3046             U32 tp = 0;
3047             U32 zp = 0;
3048
3049
3050             for( state=1 ; state < next_alloc ; state ++ ) {
3051                 U32 base=0;
3052
3053                 /*
3054                 DEBUG_TRIE_COMPILE_MORE_r(
3055                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
3056                 );
3057                 */
3058
3059                 if (trie->states[state].trans.list) {
3060                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3061                     U16 maxid=minid;
3062                     U16 idx;
3063
3064                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3065                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3066                         if ( forid < minid ) {
3067                             minid=forid;
3068                         } else if ( forid > maxid ) {
3069                             maxid=forid;
3070                         }
3071                     }
3072                     if ( transcount < tp + maxid - minid + 1) {
3073                         transcount *= 2;
3074                         trie->trans = (reg_trie_trans *)
3075                             PerlMemShared_realloc( trie->trans,
3076                                                      transcount
3077                                                      * sizeof(reg_trie_trans) );
3078                         Zero( trie->trans + (transcount / 2),
3079                               transcount / 2,
3080                               reg_trie_trans );
3081                     }
3082                     base = trie->uniquecharcount + tp - minid;
3083                     if ( maxid == minid ) {
3084                         U32 set = 0;
3085                         for ( ; zp < tp ; zp++ ) {
3086                             if ( ! trie->trans[ zp ].next ) {
3087                                 base = trie->uniquecharcount + zp - minid;
3088                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3089                                                                    1).newstate;
3090                                 trie->trans[ zp ].check = state;
3091                                 set = 1;
3092                                 break;
3093                             }
3094                         }
3095                         if ( !set ) {
3096                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3097                                                                    1).newstate;
3098                             trie->trans[ tp ].check = state;
3099                             tp++;
3100                             zp = tp;
3101                         }
3102                     } else {
3103                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3104                             const U32 tid = base
3105                                            - trie->uniquecharcount
3106                                            + TRIE_LIST_ITEM( state, idx ).forid;
3107                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3108                                                                 idx ).newstate;
3109                             trie->trans[ tid ].check = state;
3110                         }
3111                         tp += ( maxid - minid + 1 );
3112                     }
3113                     Safefree(trie->states[ state ].trans.list);
3114                 }
3115                 /*
3116                 DEBUG_TRIE_COMPILE_MORE_r(
3117                     Perl_re_printf( aTHX_  " base: %d\n",base);
3118                 );
3119                 */
3120                 trie->states[ state ].trans.base=base;
3121             }
3122             trie->lasttrans = tp + 1;
3123         }
3124     } else {
3125         /*
3126            Second Pass -- Flat Table Representation.
3127
3128            we dont use the 0 slot of either trans[] or states[] so we add 1 to
3129            each.  We know that we will need Charcount+1 trans at most to store
3130            the data (one row per char at worst case) So we preallocate both
3131            structures assuming worst case.
3132
3133            We then construct the trie using only the .next slots of the entry
3134            structs.
3135
3136            We use the .check field of the first entry of the node temporarily
3137            to make compression both faster and easier by keeping track of how
3138            many non zero fields are in the node.
3139
3140            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3141            transition.
3142
3143            There are two terms at use here: state as a TRIE_NODEIDX() which is
3144            a number representing the first entry of the node, and state as a
3145            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3146            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3147            if there are 2 entrys per node. eg:
3148
3149              A B       A B
3150           1. 2 4    1. 3 7
3151           2. 0 3    3. 0 5
3152           3. 0 0    5. 0 0
3153           4. 0 0    7. 0 0
3154
3155            The table is internally in the right hand, idx form. However as we
3156            also have to deal with the states array which is indexed by nodenum
3157            we have to use TRIE_NODENUM() to convert.
3158
3159         */
3160         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3161             depth+1));
3162
3163         trie->trans = (reg_trie_trans *)
3164             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3165                                   * trie->uniquecharcount + 1,
3166                                   sizeof(reg_trie_trans) );
3167         trie->states = (reg_trie_state *)
3168             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3169                                   sizeof(reg_trie_state) );
3170         next_alloc = trie->uniquecharcount + 1;
3171
3172
3173         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3174
3175             regnode *noper   = NEXTOPER( cur );
3176
3177             U32 state        = 1;         /* required init */
3178
3179             U16 charid       = 0;         /* sanity init */
3180             U32 accept_state = 0;         /* sanity init */
3181
3182             U32 wordlen      = 0;         /* required init */
3183
3184             if (OP(noper) == NOTHING) {
3185                 regnode *noper_next= regnext(noper);
3186                 if (noper_next < tail)
3187                     noper= noper_next;
3188             }
3189
3190             if (    noper < tail
3191                 && (    OP(noper) == flags
3192                     || (flags == EXACT && OP(noper) == EXACT_ONLY8)
3193                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
3194                                              || OP(noper) == EXACTFUP))))
3195             {
3196                 const U8 *uc= (U8*)STRING(noper);
3197                 const U8 *e= uc + STR_LEN(noper);
3198
3199                 for ( ; uc < e ; uc += len ) {
3200
3201                     TRIE_READ_CHAR;
3202
3203                     if ( uvc < 256 ) {
3204                         charid = trie->charmap[ uvc ];
3205                     } else {
3206                         SV* const * const svpp = hv_fetch( widecharmap,
3207                                                            (char*)&uvc,
3208                                                            sizeof( UV ),
3209                                                            0);
3210                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3211                     }
3212                     if ( charid ) {
3213                         charid--;
3214                         if ( !trie->trans[ state + charid ].next ) {
3215                             trie->trans[ state + charid ].next = next_alloc;
3216                             trie->trans[ state ].check++;
3217                             prev_states[TRIE_NODENUM(next_alloc)]
3218                                     = TRIE_NODENUM(state);
3219                             next_alloc += trie->uniquecharcount;
3220                         }
3221                         state = trie->trans[ state + charid ].next;
3222                     } else {
3223                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3224                     }
3225                     /* charid is now 0 if we dont know the char read, or
3226                      * nonzero if we do */
3227                 }
3228             }
3229             accept_state = TRIE_NODENUM( state );
3230             TRIE_HANDLE_WORD(accept_state);
3231
3232         } /* end second pass */
3233
3234         /* and now dump it out before we compress it */
3235         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3236                                                           revcharmap,
3237                                                           next_alloc, depth+1));
3238
3239         {
3240         /*
3241            * Inplace compress the table.*
3242
3243            For sparse data sets the table constructed by the trie algorithm will
3244            be mostly 0/FAIL transitions or to put it another way mostly empty.
3245            (Note that leaf nodes will not contain any transitions.)
3246
3247            This algorithm compresses the tables by eliminating most such
3248            transitions, at the cost of a modest bit of extra work during lookup:
3249
3250            - Each states[] entry contains a .base field which indicates the
3251            index in the state[] array wheres its transition data is stored.
3252
3253            - If .base is 0 there are no valid transitions from that node.
3254
3255            - If .base is nonzero then charid is added to it to find an entry in
3256            the trans array.
3257
3258            -If trans[states[state].base+charid].check!=state then the
3259            transition is taken to be a 0/Fail transition. Thus if there are fail
3260            transitions at the front of the node then the .base offset will point
3261            somewhere inside the previous nodes data (or maybe even into a node
3262            even earlier), but the .check field determines if the transition is
3263            valid.
3264
3265            XXX - wrong maybe?
3266            The following process inplace converts the table to the compressed
3267            table: We first do not compress the root node 1,and mark all its
3268            .check pointers as 1 and set its .base pointer as 1 as well. This
3269            allows us to do a DFA construction from the compressed table later,
3270            and ensures that any .base pointers we calculate later are greater
3271            than 0.
3272
3273            - We set 'pos' to indicate the first entry of the second node.
3274
3275            - We then iterate over the columns of the node, finding the first and
3276            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3277            and set the .check pointers accordingly, and advance pos
3278            appropriately and repreat for the next node. Note that when we copy
3279            the next pointers we have to convert them from the original
3280            NODEIDX form to NODENUM form as the former is not valid post
3281            compression.
3282
3283            - If a node has no transitions used we mark its base as 0 and do not
3284            advance the pos pointer.
3285
3286            - If a node only has one transition we use a second pointer into the
3287            structure to fill in allocated fail transitions from other states.
3288            This pointer is independent of the main pointer and scans forward
3289            looking for null transitions that are allocated to a state. When it
3290            finds one it writes the single transition into the "hole".  If the
3291            pointer doesnt find one the single transition is appended as normal.
3292
3293            - Once compressed we can Renew/realloc the structures to release the
3294            excess space.
3295
3296            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3297            specifically Fig 3.47 and the associated pseudocode.
3298
3299            demq
3300         */
3301         const U32 laststate = TRIE_NODENUM( next_alloc );
3302         U32 state, charid;
3303         U32 pos = 0, zp=0;
3304         trie->statecount = laststate;
3305
3306         for ( state = 1 ; state < laststate ; state++ ) {
3307             U8 flag = 0;
3308             const U32 stateidx = TRIE_NODEIDX( state );
3309             const U32 o_used = trie->trans[ stateidx ].check;
3310             U32 used = trie->trans[ stateidx ].check;
3311             trie->trans[ stateidx ].check = 0;
3312
3313             for ( charid = 0;
3314                   used && charid < trie->uniquecharcount;
3315                   charid++ )
3316             {
3317                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3318                     if ( trie->trans[ stateidx + charid ].next ) {
3319                         if (o_used == 1) {
3320                             for ( ; zp < pos ; zp++ ) {
3321                                 if ( ! trie->trans[ zp ].next ) {
3322                                     break;
3323                                 }
3324                             }
3325                             trie->states[ state ].trans.base
3326                                                     = zp
3327                                                       + trie->uniquecharcount
3328                                                       - charid ;
3329                             trie->trans[ zp ].next
3330                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3331                                                              + charid ].next );
3332                             trie->trans[ zp ].check = state;
3333                             if ( ++zp > pos ) pos = zp;
3334                             break;
3335                         }
3336                         used--;
3337                     }
3338                     if ( !flag ) {
3339                         flag = 1;
3340                         trie->states[ state ].trans.base
3341                                        = pos + trie->uniquecharcount - charid ;
3342                     }
3343                     trie->trans[ pos ].next
3344                         = SAFE_TRIE_NODENUM(
3345                                        trie->trans[ stateidx + charid ].next );
3346                     trie->trans[ pos ].check = state;
3347                     pos++;
3348                 }
3349             }
3350         }
3351         trie->lasttrans = pos + 1;
3352         trie->states = (reg_trie_state *)
3353             PerlMemShared_realloc( trie->states, laststate
3354                                    * sizeof(reg_trie_state) );
3355         DEBUG_TRIE_COMPILE_MORE_r(
3356             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3357                 depth+1,
3358                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3359                        + 1 ),
3360                 (IV)next_alloc,
3361                 (IV)pos,
3362                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3363             );
3364
3365         } /* end table compress */
3366     }
3367     DEBUG_TRIE_COMPILE_MORE_r(
3368             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3369                 depth+1,
3370                 (UV)trie->statecount,
3371                 (UV)trie->lasttrans)
3372     );
3373     /* resize the trans array to remove unused space */
3374     trie->trans = (reg_trie_trans *)
3375         PerlMemShared_realloc( trie->trans, trie->lasttrans
3376                                * sizeof(reg_trie_trans) );
3377
3378     {   /* Modify the program and insert the new TRIE node */
3379         U8 nodetype =(U8)(flags & 0xFF);
3380         char *str=NULL;
3381
3382 #ifdef DEBUGGING
3383         regnode *optimize = NULL;
3384 #ifdef RE_TRACK_PATTERN_OFFSETS
3385
3386         U32 mjd_offset = 0;
3387         U32 mjd_nodelen = 0;
3388 #endif /* RE_TRACK_PATTERN_OFFSETS */
3389 #endif /* DEBUGGING */
3390         /*
3391            This means we convert either the first branch or the first Exact,
3392            depending on whether the thing following (in 'last') is a branch
3393            or not and whther first is the startbranch (ie is it a sub part of
3394            the alternation or is it the whole thing.)
3395            Assuming its a sub part we convert the EXACT otherwise we convert
3396            the whole branch sequence, including the first.
3397          */
3398         /* Find the node we are going to overwrite */
3399         if ( first != startbranch || OP( last ) == BRANCH ) {
3400             /* branch sub-chain */
3401             NEXT_OFF( first ) = (U16)(last - first);
3402 #ifdef RE_TRACK_PATTERN_OFFSETS
3403             DEBUG_r({
3404                 mjd_offset= Node_Offset((convert));
3405                 mjd_nodelen= Node_Length((convert));
3406             });
3407 #endif
3408             /* whole branch chain */
3409         }
3410 #ifdef RE_TRACK_PATTERN_OFFSETS
3411         else {
3412             DEBUG_r({
3413                 const  regnode *nop = NEXTOPER( convert );
3414                 mjd_offset= Node_Offset((nop));
3415                 mjd_nodelen= Node_Length((nop));
3416             });
3417         }
3418         DEBUG_OPTIMISE_r(
3419             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3420                 depth+1,
3421                 (UV)mjd_offset, (UV)mjd_nodelen)
3422         );
3423 #endif
3424         /* But first we check to see if there is a common prefix we can
3425            split out as an EXACT and put in front of the TRIE node.  */
3426         trie->startstate= 1;
3427         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3428             /* we want to find the first state that has more than
3429              * one transition, if that state is not the first state
3430              * then we have a common prefix which we can remove.
3431              */
3432             U32 state;
3433             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3434                 U32 ofs = 0;
3435                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3436                                        transition, -1 means none */
3437                 U32 count = 0;
3438                 const U32 base = trie->states[ state ].trans.base;
3439
3440                 /* does this state terminate an alternation? */
3441                 if ( trie->states[state].wordnum )
3442                         count = 1;
3443
3444                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3445                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3446                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3447                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3448                     {
3449                         if ( ++count > 1 ) {
3450                             /* we have more than one transition */
3451                             SV **tmp;
3452                             U8 *ch;
3453                             /* if this is the first state there is no common prefix
3454                              * to extract, so we can exit */
3455                             if ( state == 1 ) break;
3456                             tmp = av_fetch( revcharmap, ofs, 0);
3457                             ch = (U8*)SvPV_nolen_const( *tmp );
3458
3459                             /* if we are on count 2 then we need to initialize the
3460                              * bitmap, and store the previous char if there was one
3461                              * in it*/
3462                             if ( count == 2 ) {
3463                                 /* clear the bitmap */
3464                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3465                                 DEBUG_OPTIMISE_r(
3466                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3467                                         depth+1,
3468                                         (UV)state));
3469                                 if (first_ofs >= 0) {
3470                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3471                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3472
3473                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3474                                     DEBUG_OPTIMISE_r(
3475                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3476                                     );
3477                                 }
3478                             }
3479                             /* store the current firstchar in the bitmap */
3480                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3481                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3482                         }
3483                         first_ofs = ofs;
3484                     }
3485                 }
3486                 if ( count == 1 ) {
3487                     /* This state has only one transition, its transition is part
3488                      * of a common prefix - we need to concatenate the char it
3489                      * represents to what we have so far. */
3490                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3491                     STRLEN len;
3492                     char *ch = SvPV( *tmp, len );
3493                     DEBUG_OPTIMISE_r({
3494                         SV *sv=sv_newmortal();
3495                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3496                             depth+1,
3497                             (UV)state, (UV)first_ofs,
3498                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3499                                 PL_colors[0], PL_colors[1],
3500                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3501                                 PERL_PV_ESCAPE_FIRSTCHAR
3502                             )
3503                         );
3504                     });
3505                     if ( state==1 ) {
3506                         OP( convert ) = nodetype;
3507                         str=STRING(convert);
3508                         STR_LEN(convert)=0;
3509                     }
3510                     STR_LEN(convert) += len;
3511                     while (len--)
3512                         *str++ = *ch++;
3513                 } else {
3514 #ifdef DEBUGGING
3515                     if (state>1)
3516                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3517 #endif
3518                     break;
3519                 }
3520             }
3521             trie->prefixlen = (state-1);
3522             if (str) {
3523                 regnode *n = convert+NODE_SZ_STR(convert);
3524                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3525                 trie->startstate = state;
3526                 trie->minlen -= (state - 1);
3527                 trie->maxlen -= (state - 1);
3528 #ifdef DEBUGGING
3529                /* At least the UNICOS C compiler choked on this
3530                 * being argument to DEBUG_r(), so let's just have
3531                 * it right here. */
3532                if (
3533 #ifdef PERL_EXT_RE_BUILD
3534                    1
3535 #else
3536                    DEBUG_r_TEST
3537 #endif
3538                    ) {
3539                    regnode *fix = convert;
3540                    U32 word = trie->wordcount;
3541 #ifdef RE_TRACK_PATTERN_OFFSETS
3542                    mjd_nodelen++;
3543 #endif
3544                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3545                    while( ++fix < n ) {
3546                        Set_Node_Offset_Length(fix, 0, 0);
3547                    }
3548                    while (word--) {
3549                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3550                        if (tmp) {
3551                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3552                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3553                            else
3554                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3555                        }
3556                    }
3557                }
3558 #endif
3559                 if (trie->maxlen) {
3560                     convert = n;
3561                 } else {
3562                     NEXT_OFF(convert) = (U16)(tail - convert);
3563                     DEBUG_r(optimize= n);
3564                 }
3565             }
3566         }
3567         if (!jumper)
3568             jumper = last;
3569         if ( trie->maxlen ) {
3570             NEXT_OFF( convert ) = (U16)(tail - convert);
3571             ARG_SET( convert, data_slot );
3572             /* Store the offset to the first unabsorbed branch in
3573                jump[0], which is otherwise unused by the jump logic.
3574                We use this when dumping a trie and during optimisation. */
3575             if (trie->jump)
3576                 trie->jump[0] = (U16)(nextbranch - convert);
3577
3578             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3579              *   and there is a bitmap
3580              *   and the first "jump target" node we found leaves enough room
3581              * then convert the TRIE node into a TRIEC node, with the bitmap
3582              * embedded inline in the opcode - this is hypothetically faster.
3583              */
3584             if ( !trie->states[trie->startstate].wordnum
3585                  && trie->bitmap
3586                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3587             {
3588                 OP( convert ) = TRIEC;
3589                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3590                 PerlMemShared_free(trie->bitmap);
3591                 trie->bitmap= NULL;
3592             } else
3593                 OP( convert ) = TRIE;
3594
3595             /* store the type in the flags */
3596             convert->flags = nodetype;
3597             DEBUG_r({
3598             optimize = convert
3599                       + NODE_STEP_REGNODE
3600                       + regarglen[ OP( convert ) ];
3601             });
3602             /* XXX We really should free up the resource in trie now,
3603                    as we won't use them - (which resources?) dmq */
3604         }
3605         /* needed for dumping*/
3606         DEBUG_r(if (optimize) {
3607             regnode *opt = convert;
3608
3609             while ( ++opt < optimize) {
3610                 Set_Node_Offset_Length(opt, 0, 0);
3611             }
3612             /*
3613                 Try to clean up some of the debris left after the
3614                 optimisation.
3615              */
3616             while( optimize < jumper ) {
3617                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3618                 OP( optimize ) = OPTIMIZED;
3619                 Set_Node_Offset_Length(optimize, 0, 0);
3620                 optimize++;
3621             }
3622             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3623         });
3624     } /* end node insert */
3625
3626     /*  Finish populating the prev field of the wordinfo array.  Walk back
3627      *  from each accept state until we find another accept state, and if
3628      *  so, point the first word's .prev field at the second word. If the
3629      *  second already has a .prev field set, stop now. This will be the
3630      *  case either if we've already processed that word's accept state,
3631      *  or that state had multiple words, and the overspill words were
3632      *  already linked up earlier.
3633      */
3634     {
3635         U16 word;
3636         U32 state;
3637         U16 prev;
3638
3639         for (word=1; word <= trie->wordcount; word++) {
3640             prev = 0;
3641             if (trie->wordinfo[word].prev)
3642                 continue;
3643             state = trie->wordinfo[word].accept;
3644             while (state) {
3645                 state = prev_states[state];
3646                 if (!state)
3647                     break;
3648                 prev = trie->states[state].wordnum;
3649                 if (prev)
3650                     break;
3651             }
3652             trie->wordinfo[word].prev = prev;
3653         }
3654         Safefree(prev_states);
3655     }
3656
3657
3658     /* and now dump out the compressed format */
3659     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3660
3661     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3662 #ifdef DEBUGGING
3663     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3664     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3665 #else
3666     SvREFCNT_dec_NN(revcharmap);
3667 #endif
3668     return trie->jump
3669            ? MADE_JUMP_TRIE
3670            : trie->startstate>1
3671              ? MADE_EXACT_TRIE
3672              : MADE_TRIE;
3673 }
3674
3675 STATIC regnode *
3676 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3677 {
3678 /* The Trie is constructed and compressed now so we can build a fail array if
3679  * it's needed
3680
3681    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3682    3.32 in the
3683    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3684    Ullman 1985/88
3685    ISBN 0-201-10088-6
3686
3687    We find the fail state for each state in the trie, this state is the longest
3688    proper suffix of the current state's 'word' that is also a proper prefix of
3689    another word in our trie. State 1 represents the word '' and is thus the
3690    default fail state. This allows the DFA not to have to restart after its
3691    tried and failed a word at a given point, it simply continues as though it
3692    had been matching the other word in the first place.
3693    Consider
3694       'abcdgu'=~/abcdefg|cdgu/
3695    When we get to 'd' we are still matching the first word, we would encounter
3696    'g' which would fail, which would bring us to the state representing 'd' in
3697    the second word where we would try 'g' and succeed, proceeding to match
3698    'cdgu'.
3699  */
3700  /* add a fail transition */
3701     const U32 trie_offset = ARG(source);
3702     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3703     U32 *q;
3704     const U32 ucharcount = trie->uniquecharcount;
3705     const U32 numstates = trie->statecount;
3706     const U32 ubound = trie->lasttrans + ucharcount;
3707     U32 q_read = 0;
3708     U32 q_write = 0;
3709     U32 charid;
3710     U32 base = trie->states[ 1 ].trans.base;
3711     U32 *fail;
3712     reg_ac_data *aho;
3713     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3714     regnode *stclass;
3715     GET_RE_DEBUG_FLAGS_DECL;
3716
3717     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3718     PERL_UNUSED_CONTEXT;
3719 #ifndef DEBUGGING
3720     PERL_UNUSED_ARG(depth);
3721 #endif
3722
3723     if ( OP(source) == TRIE ) {
3724         struct regnode_1 *op = (struct regnode_1 *)
3725             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3726         StructCopy(source, op, struct regnode_1);
3727         stclass = (regnode *)op;
3728     } else {
3729         struct regnode_charclass *op = (struct regnode_charclass *)
3730             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3731         StructCopy(source, op, struct regnode_charclass);
3732         stclass = (regnode *)op;
3733     }
3734     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3735
3736     ARG_SET( stclass, data_slot );
3737     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3738     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3739     aho->trie=trie_offset;
3740     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3741     Copy( trie->states, aho->states, numstates, reg_trie_state );
3742     Newx( q, numstates, U32);
3743     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3744     aho->refcount = 1;
3745     fail = aho->fail;
3746     /* initialize fail[0..1] to be 1 so that we always have
3747        a valid final fail state */
3748     fail[ 0 ] = fail[ 1 ] = 1;
3749
3750     for ( charid = 0; charid < ucharcount ; charid++ ) {
3751         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3752         if ( newstate ) {
3753             q[ q_write ] = newstate;
3754             /* set to point at the root */
3755             fail[ q[ q_write++ ] ]=1;
3756         }
3757     }
3758     while ( q_read < q_write) {
3759         const U32 cur = q[ q_read++ % numstates ];
3760         base = trie->states[ cur ].trans.base;
3761
3762         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3763             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3764             if (ch_state) {
3765                 U32 fail_state = cur;
3766                 U32 fail_base;
3767                 do {
3768                     fail_state = fail[ fail_state ];
3769                     fail_base = aho->states[ fail_state ].trans.base;
3770                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3771
3772                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3773                 fail[ ch_state ] = fail_state;
3774                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3775                 {
3776                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3777                 }
3778                 q[ q_write++ % numstates] = ch_state;
3779             }
3780         }
3781     }
3782     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3783        when we fail in state 1, this allows us to use the
3784        charclass scan to find a valid start char. This is based on the principle
3785        that theres a good chance the string being searched contains lots of stuff
3786        that cant be a start char.
3787      */
3788     fail[ 0 ] = fail[ 1 ] = 0;
3789     DEBUG_TRIE_COMPILE_r({
3790         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3791                       depth, (UV)numstates
3792         );
3793         for( q_read=1; q_read<numstates; q_read++ ) {
3794             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3795         }
3796         Perl_re_printf( aTHX_  "\n");
3797     });
3798     Safefree(q);
3799     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3800     return stclass;
3801 }
3802
3803
3804 /* The below joins as many adjacent EXACTish nodes as possible into a single
3805  * one.  The regop may be changed if the node(s) contain certain sequences that
3806  * require special handling.  The joining is only done if:
3807  * 1) there is room in the current conglomerated node to entirely contain the
3808  *    next one.
3809  * 2) they are compatible node types
3810  *
3811  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3812  * these get optimized out
3813  *
3814  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3815  * as possible, even if that means splitting an existing node so that its first
3816  * part is moved to the preceeding node.  This would maximise the efficiency of
3817  * memEQ during matching.
3818  *
3819  * If a node is to match under /i (folded), the number of characters it matches
3820  * can be different than its character length if it contains a multi-character
3821  * fold.  *min_subtract is set to the total delta number of characters of the
3822  * input nodes.
3823  *
3824  * And *unfolded_multi_char is set to indicate whether or not the node contains
3825  * an unfolded multi-char fold.  This happens when it won't be known until
3826  * runtime whether the fold is valid or not; namely
3827  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3828  *      target string being matched against turns out to be UTF-8 is that fold
3829  *      valid; or
3830  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
3831  *      runtime.
3832  * (Multi-char folds whose components are all above the Latin1 range are not
3833  * run-time locale dependent, and have already been folded by the time this
3834  * function is called.)
3835  *
3836  * This is as good a place as any to discuss the design of handling these
3837  * multi-character fold sequences.  It's been wrong in Perl for a very long
3838  * time.  There are three code points in Unicode whose multi-character folds
3839  * were long ago discovered to mess things up.  The previous designs for
3840  * dealing with these involved assigning a special node for them.  This
3841  * approach doesn't always work, as evidenced by this example:
3842  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3843  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3844  * would match just the \xDF, it won't be able to handle the case where a
3845  * successful match would have to cross the node's boundary.  The new approach
3846  * that hopefully generally solves the problem generates an EXACTFUP node
3847  * that is "sss" in this case.
3848  *
3849  * It turns out that there are problems with all multi-character folds, and not
3850  * just these three.  Now the code is general, for all such cases.  The
3851  * approach taken is:
3852  * 1)   This routine examines each EXACTFish node that could contain multi-
3853  *      character folded sequences.  Since a single character can fold into
3854  *      such a sequence, the minimum match length for this node is less than
3855  *      the number of characters in the node.  This routine returns in
3856  *      *min_subtract how many characters to subtract from the the actual
3857  *      length of the string to get a real minimum match length; it is 0 if
3858  *      there are no multi-char foldeds.  This delta is used by the caller to
3859  *      adjust the min length of the match, and the delta between min and max,
3860  *      so that the optimizer doesn't reject these possibilities based on size
3861  *      constraints.
3862  *
3863  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
3864  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
3865  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
3866  *      EXACTFU nodes.  The node type of such nodes is then changed to
3867  *      EXACTFUP, indicating it is problematic, and needs careful handling.
3868  *      (The procedures in step 1) above are sufficient to handle this case in
3869  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
3870  *      the only case where there is a possible fold length change in non-UTF-8
3871  *      patterns.  By reserving a special node type for problematic cases, the
3872  *      far more common regular EXACTFU nodes can be processed faster.
3873  *      regexec.c takes advantage of this.
3874  *
3875  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
3876  *      problematic cases.   These all only occur when the pattern is not
3877  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
3878  *      length change, it handles the situation where the string cannot be
3879  *      entirely folded.  The strings in an EXACTFish node are folded as much
3880  *      as possible during compilation in regcomp.c.  This saves effort in
3881  *      regex matching.  By using an EXACTFUP node when it is not possible to
3882  *      fully fold at compile time, regexec.c can know that everything in an
3883  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
3884  *      case where folding in EXACTFU nodes can't be done at compile time is
3885  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
3886  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
3887  *      handle two very different cases.  Alternatively, there could have been
3888  *      a node type where there are length changes, one for unfolded, and one
3889  *      for both.  If yet another special case needed to be created, the number
3890  *      of required node types would have to go to 7.  khw figures that even
3891  *      though there are plenty of node types to spare, that the maintenance
3892  *      cost wasn't worth the small speedup of doing it that way, especially
3893  *      since he thinks the MICRO SIGN is rarely encountered in practice.
3894  *
3895  *      There are other cases where folding isn't done at compile time, but
3896  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
3897  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
3898  *      changes.  Some folds in EXACTF depend on if the runtime target string
3899  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
3900  *      when no fold in it depends on the UTF-8ness of the target string.)
3901  *
3902  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3903  *      validity of the fold won't be known until runtime, and so must remain
3904  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
3905  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3906  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3907  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3908  *      The reason this is a problem is that the optimizer part of regexec.c
3909  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3910  *      that a character in the pattern corresponds to at most a single
3911  *      character in the target string.  (And I do mean character, and not byte
3912  *      here, unlike other parts of the documentation that have never been
3913  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
3914  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
3915  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
3916  *      EXACTFL nodes, violate the assumption, and they are the only instances
3917  *      where it is violated.  I'm reluctant to try to change the assumption,
3918  *      as the code involved is impenetrable to me (khw), so instead the code
3919  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
3920  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
3921  *      boolean indicating whether or not the node contains such a fold.  When
3922  *      it is true, the caller sets a flag that later causes the optimizer in
3923  *      this file to not set values for the floating and fixed string lengths,
3924  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3925  *      assumption.  Thus, there is no optimization based on string lengths for
3926  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3927  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
3928  *      assumption is wrong only in these cases is that all other non-UTF-8
3929  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3930  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3931  *      EXACTF nodes because we don't know at compile time if it actually
3932  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3933  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3934  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
3935  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3936  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3937  *      string would require the pattern to be forced into UTF-8, the overhead
3938  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3939  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3940  *      locale.)
3941  *
3942  *      Similarly, the code that generates tries doesn't currently handle
3943  *      not-already-folded multi-char folds, and it looks like a pain to change
3944  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
3945  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
3946  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
3947  *      using /iaa matching will be doing so almost entirely with ASCII
3948  *      strings, so this should rarely be encountered in practice */
3949
3950 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3951     if (PL_regkind[OP(scan)] == EXACT) \
3952         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
3953
3954 STATIC U32
3955 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3956                    UV *min_subtract, bool *unfolded_multi_char,
3957                    U32 flags, regnode *val, U32 depth)
3958 {
3959     /* Merge several consecutive EXACTish nodes into one. */
3960
3961     regnode *n = regnext(scan);
3962     U32 stringok = 1;
3963     regnode *next = scan + NODE_SZ_STR(scan);
3964     U32 merged = 0;
3965     U32 stopnow = 0;
3966 #ifdef DEBUGGING
3967     regnode *stop = scan;
3968     GET_RE_DEBUG_FLAGS_DECL;
3969 #else
3970     PERL_UNUSED_ARG(depth);
3971 #endif
3972
3973     PERL_ARGS_ASSERT_JOIN_EXACT;
3974 #ifndef EXPERIMENTAL_INPLACESCAN
3975     PERL_UNUSED_ARG(flags);
3976     PERL_UNUSED_ARG(val);
3977 #endif
3978     DEBUG_PEEP("join", scan, depth, 0);
3979
3980     assert(PL_regkind[OP(scan)] == EXACT);
3981
3982     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3983      * EXACT ones that are mergeable to the current one. */
3984     while (    n
3985            && (    PL_regkind[OP(n)] == NOTHING
3986                || (stringok && PL_regkind[OP(n)] == EXACT))
3987            && NEXT_OFF(n)
3988            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3989     {
3990
3991         if (OP(n) == TAIL || n > next)
3992             stringok = 0;
3993         if (PL_regkind[OP(n)] == NOTHING) {
3994             DEBUG_PEEP("skip:", n, depth, 0);
3995             NEXT_OFF(scan) += NEXT_OFF(n);
3996             next = n + NODE_STEP_REGNODE;
3997 #ifdef DEBUGGING
3998             if (stringok)
3999                 stop = n;
4000 #endif
4001             n = regnext(n);
4002         }
4003         else if (stringok) {
4004             const unsigned int oldl = STR_LEN(scan);
4005             regnode * const nnext = regnext(n);
4006
4007             /* XXX I (khw) kind of doubt that this works on platforms (should
4008              * Perl ever run on one) where U8_MAX is above 255 because of lots
4009              * of other assumptions */
4010             /* Don't join if the sum can't fit into a single node */
4011             if (oldl + STR_LEN(n) > U8_MAX)
4012                 break;
4013
4014             /* Joining something that requires UTF-8 with something that
4015              * doesn't, means the result requires UTF-8. */
4016             if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) {
4017                 OP(scan) = EXACT_ONLY8;
4018             }
4019             else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) {
4020                 ;   /* join is compatible, no need to change OP */
4021             }
4022             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) {
4023                 OP(scan) = EXACTFU_ONLY8;
4024             }
4025             else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
4026                 ;   /* join is compatible, no need to change OP */
4027             }
4028             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4029                 ;   /* join is compatible, no need to change OP */
4030             }
4031             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4032
4033                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4034                   * which can join with EXACTFU ones.  We check for this case
4035                   * here.  These need to be resolved to either EXACTFU or
4036                   * EXACTF at joining time.  They have nothing in them that
4037                   * would forbid them from being the more desirable EXACTFU
4038                   * nodes except that they begin and/or end with a single [Ss].
4039                   * The reason this is problematic is because they could be
4040                   * joined in this loop with an adjacent node that ends and/or
4041                   * begins with [Ss] which would then form the sequence 'ss',
4042                   * which matches differently under /di than /ui, in which case
4043                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
4044                   * formed, the nodes get absorbed into any adjacent EXACTFU
4045                   * node.  And if the only adjacent node is EXACTF, they get
4046                   * absorbed into that, under the theory that a longer node is
4047                   * better than two shorter ones, even if one is EXACTFU.  Note
4048                   * that EXACTFU_ONLY8 is generated only for UTF-8 patterns,
4049                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
4050
4051                 if (STRING(n)[STR_LEN(n)-1] == 's') {
4052
4053                     /* Here the joined node would end with 's'.  If the node
4054                      * following the combination is an EXACTF one, it's better to
4055                      * join this trailing edge 's' node with that one, leaving the
4056                      * current one in 'scan' be the more desirable EXACTFU */
4057                     if (OP(nnext) == EXACTF) {
4058                         break;
4059                     }
4060
4061                     OP(scan) = EXACTFU_S_EDGE;
4062
4063                 }   /* Otherwise, the beginning 's' of the 2nd node just
4064                        becomes an interior 's' in 'scan' */
4065             }
4066             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4067                 ;   /* join is compatible, no need to change OP */
4068             }
4069             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4070
4071                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4072                  * nodes.  But the latter nodes can be also joined with EXACTFU
4073                  * ones, and that is a better outcome, so if the node following
4074                  * 'n' is EXACTFU, quit now so that those two can be joined
4075                  * later */
4076                 if (OP(nnext) == EXACTFU) {
4077                     break;
4078                 }
4079
4080                 /* The join is compatible, and the combined node will be
4081                  * EXACTF.  (These don't care if they begin or end with 's' */
4082             }
4083             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4084                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
4085                     && STRING(n)[0] == 's')
4086                 {
4087                     /* When combined, we have the sequence 'ss', which means we
4088                      * have to remain /di */
4089                     OP(scan) = EXACTF;
4090                 }
4091             }
4092             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4093                 if (STRING(n)[0] == 's') {
4094                     ;   /* Here the join is compatible and the combined node
4095                            starts with 's', no need to change OP */
4096                 }
4097                 else {  /* Now the trailing 's' is in the interior */
4098                     OP(scan) = EXACTFU;
4099                 }
4100             }
4101             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4102
4103                 /* The join is compatible, and the combined node will be
4104                  * EXACTF.  (These don't care if they begin or end with 's' */
4105                 OP(scan) = EXACTF;
4106             }
4107             else if (OP(scan) != OP(n)) {
4108
4109                 /* The only other compatible joinings are the same node type */
4110                 break;
4111             }
4112
4113             DEBUG_PEEP("merg", n, depth, 0);
4114             merged++;
4115
4116             NEXT_OFF(scan) += NEXT_OFF(n);
4117             STR_LEN(scan) += STR_LEN(n);
4118             next = n + NODE_SZ_STR(n);
4119             /* Now we can overwrite *n : */
4120             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4121 #ifdef DEBUGGING
4122             stop = next - 1;
4123 #endif
4124             n = nnext;
4125             if (stopnow) break;
4126         }
4127
4128 #ifdef EXPERIMENTAL_INPLACESCAN
4129         if (flags && !NEXT_OFF(n)) {
4130             DEBUG_PEEP("atch", val, depth, 0);
4131             if (reg_off_by_arg[OP(n)]) {
4132                 ARG_SET(n, val - n);
4133             }
4134             else {
4135                 NEXT_OFF(n) = val - n;
4136             }
4137             stopnow = 1;
4138         }
4139 #endif
4140     }
4141
4142     /* This temporary node can now be turned into EXACTFU, and must, as
4143      * regexec.c doesn't handle it */
4144     if (OP(scan) == EXACTFU_S_EDGE) {
4145         OP(scan) = EXACTFU;
4146     }
4147
4148     *min_subtract = 0;
4149     *unfolded_multi_char = FALSE;
4150
4151     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
4152      * can now analyze for sequences of problematic code points.  (Prior to
4153      * this final joining, sequences could have been split over boundaries, and
4154      * hence missed).  The sequences only happen in folding, hence for any
4155      * non-EXACT EXACTish node */
4156     if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
4157         U8* s0 = (U8*) STRING(scan);
4158         U8* s = s0;
4159         U8* s_end = s0 + STR_LEN(scan);
4160
4161         int total_count_delta = 0;  /* Total delta number of characters that
4162                                        multi-char folds expand to */
4163
4164         /* One pass is made over the node's string looking for all the
4165          * possibilities.  To avoid some tests in the loop, there are two main
4166          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4167          * non-UTF-8 */
4168         if (UTF) {
4169             U8* folded = NULL;
4170
4171             if (OP(scan) == EXACTFL) {
4172                 U8 *d;
4173
4174                 /* An EXACTFL node would already have been changed to another
4175                  * node type unless there is at least one character in it that
4176                  * is problematic; likely a character whose fold definition
4177                  * won't be known until runtime, and so has yet to be folded.
4178                  * For all but the UTF-8 locale, folds are 1-1 in length, but
4179                  * to handle the UTF-8 case, we need to create a temporary
4180                  * folded copy using UTF-8 locale rules in order to analyze it.
4181                  * This is because our macros that look to see if a sequence is
4182                  * a multi-char fold assume everything is folded (otherwise the
4183                  * tests in those macros would be too complicated and slow).
4184                  * Note that here, the non-problematic folds will have already
4185                  * been done, so we can just copy such characters.  We actually
4186                  * don't completely fold the EXACTFL string.  We skip the
4187                  * unfolded multi-char folds, as that would just create work
4188                  * below to figure out the size they already are */
4189
4190                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4191                 d = folded;
4192                 while (s < s_end) {
4193                     STRLEN s_len = UTF8SKIP(s);
4194                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4195                         Copy(s, d, s_len, U8);
4196                         d += s_len;
4197                     }
4198                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
4199                         *unfolded_multi_char = TRUE;
4200                         Copy(s, d, s_len, U8);
4201                         d += s_len;
4202                     }
4203                     else if (isASCII(*s)) {
4204                         *(d++) = toFOLD(*s);
4205                     }
4206                     else {
4207                         STRLEN len;
4208                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4209                         d += len;
4210                     }
4211                     s += s_len;
4212                 }
4213
4214                 /* Point the remainder of the routine to look at our temporary
4215                  * folded copy */
4216                 s = folded;
4217                 s_end = d;
4218             } /* End of creating folded copy of EXACTFL string */
4219
4220             /* Examine the string for a multi-character fold sequence.  UTF-8
4221              * patterns have all characters pre-folded by the time this code is
4222              * executed */
4223             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4224                                      length sequence we are looking for is 2 */
4225             {
4226                 int count = 0;  /* How many characters in a multi-char fold */
4227                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4228                 if (! len) {    /* Not a multi-char fold: get next char */
4229                     s += UTF8SKIP(s);
4230                     continue;
4231                 }
4232
4233                 { /* Here is a generic multi-char fold. */
4234                     U8* multi_end  = s + len;
4235
4236                     /* Count how many characters are in it.  In the case of
4237                      * /aa, no folds which contain ASCII code points are
4238                      * allowed, so check for those, and skip if found. */
4239                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4240                         count = utf8_length(s, multi_end);
4241                         s = multi_end;
4242                     }
4243                     else {
4244                         while (s < multi_end) {
4245                             if (isASCII(*s)) {
4246                                 s++;
4247                                 goto next_iteration;
4248                             }
4249                             else {
4250                                 s += UTF8SKIP(s);
4251                             }
4252                             count++;
4253                         }
4254                     }
4255                 }
4256
4257                 /* The delta is how long the sequence is minus 1 (1 is how long
4258                  * the character that folds to the sequence is) */
4259                 total_count_delta += count - 1;
4260               next_iteration: ;
4261             }
4262
4263             /* We created a temporary folded copy of the string in EXACTFL
4264              * nodes.  Therefore we need to be sure it doesn't go below zero,
4265              * as the real string could be shorter */
4266             if (OP(scan) == EXACTFL) {
4267                 int total_chars = utf8_length((U8*) STRING(scan),
4268                                            (U8*) STRING(scan) + STR_LEN(scan));
4269                 if (total_count_delta > total_chars) {
4270                     total_count_delta = total_chars;
4271                 }
4272             }
4273
4274             *min_subtract += total_count_delta;
4275             Safefree(folded);
4276         }
4277         else if (OP(scan) == EXACTFAA) {
4278
4279             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
4280              * fold to the ASCII range (and there are no existing ones in the
4281              * upper latin1 range).  But, as outlined in the comments preceding
4282              * this function, we need to flag any occurrences of the sharp s.
4283              * This character forbids trie formation (because of added
4284              * complexity) */
4285 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4286    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4287                                       || UNICODE_DOT_DOT_VERSION > 0)
4288             while (s < s_end) {
4289                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4290                     OP(scan) = EXACTFAA_NO_TRIE;
4291                     *unfolded_multi_char = TRUE;
4292                     break;
4293                 }
4294                 s++;
4295             }
4296         }
4297         else {
4298
4299             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
4300              * folds that are all Latin1.  As explained in the comments
4301              * preceding this function, we look also for the sharp s in EXACTF
4302              * and EXACTFL nodes; it can be in the final position.  Otherwise
4303              * we can stop looking 1 byte earlier because have to find at least
4304              * two characters for a multi-fold */
4305             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4306                               ? s_end
4307                               : s_end -1;
4308
4309             while (s < upper) {
4310                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4311                 if (! len) {    /* Not a multi-char fold. */
4312                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4313                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4314                     {
4315                         *unfolded_multi_char = TRUE;
4316                     }
4317                     s++;
4318                     continue;
4319                 }
4320
4321                 if (len == 2
4322                     && isALPHA_FOLD_EQ(*s, 's')
4323                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4324                 {
4325
4326                     /* EXACTF nodes need to know that the minimum length
4327                      * changed so that a sharp s in the string can match this
4328                      * ss in the pattern, but they remain EXACTF nodes, as they
4329                      * won't match this unless the target string is is UTF-8,
4330                      * which we don't know until runtime.  EXACTFL nodes can't
4331                      * transform into EXACTFU nodes */
4332                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4333                         OP(scan) = EXACTFUP;
4334                     }
4335                 }
4336
4337                 *min_subtract += len - 1;
4338                 s += len;
4339             }
4340 #endif
4341         }
4342
4343         if (     STR_LEN(scan) == 1
4344             &&   isALPHA_A(* STRING(scan))
4345             &&  (         OP(scan) == EXACTFAA
4346                  || (     OP(scan) == EXACTFU
4347                      && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
4348         {
4349             U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
4350
4351             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
4352              * with the mask set to the complement of the bit that differs
4353              * between upper and lower case, and the lowest code point of the
4354              * pair (which the '&' forces) */
4355             OP(scan) = ANYOFM;
4356             ARG_SET(scan, *STRING(scan) & mask);
4357             FLAGS(scan) = mask;
4358         }
4359     }
4360
4361 #ifdef DEBUGGING
4362     /* Allow dumping but overwriting the collection of skipped
4363      * ops and/or strings with fake optimized ops */
4364     n = scan + NODE_SZ_STR(scan);
4365     while (n <= stop) {
4366         OP(n) = OPTIMIZED;
4367         FLAGS(n) = 0;
4368         NEXT_OFF(n) = 0;
4369         n++;
4370     }
4371 #endif
4372     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4373     return stopnow;
4374 }
4375
4376 /* REx optimizer.  Converts nodes into quicker variants "in place".
4377    Finds fixed substrings.  */
4378
4379 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4380    to the position after last scanned or to NULL. */
4381
4382 #define INIT_AND_WITHP \
4383     assert(!and_withp); \
4384     Newx(and_withp, 1, regnode_ssc); \
4385     SAVEFREEPV(and_withp)
4386
4387
4388 static void
4389 S_unwind_scan_frames(pTHX_ const void *p)
4390 {
4391     scan_frame *f= (scan_frame *)p;
4392     do {
4393         scan_frame *n= f->next_frame;
4394         Safefree(f);
4395         f= n;
4396     } while (f);
4397 }
4398
4399 /* the return from this sub is the minimum length that could possibly match */
4400 STATIC SSize_t
4401 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4402                         SSize_t *minlenp, SSize_t *deltap,
4403                         regnode *last,
4404                         scan_data_t *data,
4405                         I32 stopparen,
4406                         U32 recursed_depth,
4407                         regnode_ssc *and_withp,
4408                         U32 flags, U32 depth)
4409                         /* scanp: Start here (read-write). */
4410                         /* deltap: Write maxlen-minlen here. */
4411                         /* last: Stop before this one. */
4412                         /* data: string data about the pattern */
4413                         /* stopparen: treat close N as END */
4414                         /* recursed: which subroutines have we recursed into */
4415                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4416 {
4417     /* There must be at least this number of characters to match */
4418     SSize_t min = 0;
4419     I32 pars = 0, code;
4420     regnode *scan = *scanp, *next;
4421     SSize_t delta = 0;
4422     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4423     int is_inf_internal = 0;            /* The studied chunk is infinite */
4424     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4425     scan_data_t data_fake;
4426     SV *re_trie_maxbuff = NULL;
4427     regnode *first_non_open = scan;
4428     SSize_t stopmin = SSize_t_MAX;
4429     scan_frame *frame = NULL;
4430     GET_RE_DEBUG_FLAGS_DECL;
4431
4432     PERL_ARGS_ASSERT_STUDY_CHUNK;
4433     RExC_study_started= 1;
4434
4435     Zero(&data_fake, 1, scan_data_t);
4436
4437     if ( depth == 0 ) {
4438         while (first_non_open && OP(first_non_open) == OPEN)
4439             first_non_open=regnext(first_non_open);
4440     }
4441
4442
4443   fake_study_recurse:
4444     DEBUG_r(
4445         RExC_study_chunk_recursed_count++;
4446     );
4447     DEBUG_OPTIMISE_MORE_r(
4448     {
4449         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4450             depth, (long)stopparen,
4451             (unsigned long)RExC_study_chunk_recursed_count,
4452             (unsigned long)depth, (unsigned long)recursed_depth,
4453             scan,
4454             last);
4455         if (recursed_depth) {
4456             U32 i;
4457             U32 j;
4458             for ( j = 0 ; j < recursed_depth ; j++ ) {
4459                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4460                     if (
4461                         PAREN_TEST(RExC_study_chunk_recursed +
4462                                    ( j * RExC_study_chunk_recursed_bytes), i )
4463                         && (
4464                             !j ||
4465                             !PAREN_TEST(RExC_study_chunk_recursed +
4466                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4467                         )
4468                     ) {
4469                         Perl_re_printf( aTHX_ " %d",(int)i);
4470                         break;
4471                     }
4472                 }
4473                 if ( j + 1 < recursed_depth ) {
4474                     Perl_re_printf( aTHX_  ",");
4475                 }
4476             }
4477         }
4478         Perl_re_printf( aTHX_ "\n");
4479     }
4480     );
4481     while ( scan && OP(scan) != END && scan < last ){
4482         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4483                                    node length to get a real minimum (because
4484                                    the folded version may be shorter) */
4485         bool unfolded_multi_char = FALSE;
4486         /* Peephole optimizer: */
4487         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4488         DEBUG_PEEP("Peep", scan, depth, flags);
4489
4490
4491         /* The reason we do this here is that we need to deal with things like
4492          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4493          * parsing code, as each (?:..) is handled by a different invocation of
4494          * reg() -- Yves
4495          */
4496         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4497
4498         /* Follow the next-chain of the current node and optimize
4499            away all the NOTHINGs from it.  */
4500         if (OP(scan) != CURLYX) {
4501             const int max = (reg_off_by_arg[OP(scan)]
4502                        ? I32_MAX
4503                        /* I32 may be smaller than U16 on CRAYs! */
4504                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4505             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4506             int noff;
4507             regnode *n = scan;
4508
4509             /* Skip NOTHING and LONGJMP. */
4510             while ((n = regnext(n))
4511                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4512                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4513                    && off + noff < max)
4514                 off += noff;
4515             if (reg_off_by_arg[OP(scan)])
4516                 ARG(scan) = off;
4517             else
4518                 NEXT_OFF(scan) = off;
4519         }
4520
4521         /* The principal pseudo-switch.  Cannot be a switch, since we
4522            look into several different things.  */
4523         if ( OP(scan) == DEFINEP ) {
4524             SSize_t minlen = 0;
4525             SSize_t deltanext = 0;
4526             SSize_t fake_last_close = 0;
4527             I32 f = SCF_IN_DEFINE;
4528
4529             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4530             scan = regnext(scan);
4531             assert( OP(scan) == IFTHEN );
4532             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4533
4534             data_fake.last_closep= &fake_last_close;
4535             minlen = *minlenp;
4536             next = regnext(scan);
4537             scan = NEXTOPER(NEXTOPER(scan));
4538             DEBUG_PEEP("scan", scan, depth, flags);
4539             DEBUG_PEEP("next", next, depth, flags);
4540
4541             /* we suppose the run is continuous, last=next...
4542              * NOTE we dont use the return here! */
4543             /* DEFINEP study_chunk() recursion */
4544             (void)study_chunk(pRExC_state, &scan, &minlen,
4545                               &deltanext, next, &data_fake, stopparen,
4546                               recursed_depth, NULL, f, depth+1);
4547
4548             scan = next;
4549         } else
4550         if (
4551             OP(scan) == BRANCH  ||
4552             OP(scan) == BRANCHJ ||
4553             OP(scan) == IFTHEN
4554         ) {
4555             next = regnext(scan);
4556             code = OP(scan);
4557
4558             /* The op(next)==code check below is to see if we
4559              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4560              * IFTHEN is special as it might not appear in pairs.
4561              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4562              * we dont handle it cleanly. */
4563             if (OP(next) == code || code == IFTHEN) {
4564                 /* NOTE - There is similar code to this block below for
4565                  * handling TRIE nodes on a re-study.  If you change stuff here
4566                  * check there too. */
4567                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4568                 regnode_ssc accum;
4569                 regnode * const startbranch=scan;
4570
4571                 if (flags & SCF_DO_SUBSTR) {
4572                     /* Cannot merge strings after this. */
4573                     scan_commit(pRExC_state, data, minlenp, is_inf);
4574                 }
4575
4576                 if (flags & SCF_DO_STCLASS)
4577                     ssc_init_zero(pRExC_state, &accum);
4578
4579                 while (OP(scan) == code) {
4580                     SSize_t deltanext, minnext, fake;
4581                     I32 f = 0;
4582                     regnode_ssc this_class;
4583
4584                     DEBUG_PEEP("Branch", scan, depth, flags);
4585
4586                     num++;
4587                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4588                     if (data) {
4589                         data_fake.whilem_c = data->whilem_c;
4590                         data_fake.last_closep = data->last_closep;
4591                     }
4592                     else
4593                         data_fake.last_closep = &fake;
4594
4595                     data_fake.pos_delta = delta;
4596                     next = regnext(scan);
4597
4598                     scan = NEXTOPER(scan); /* everything */
4599                     if (code != BRANCH)    /* everything but BRANCH */
4600                         scan = NEXTOPER(scan);
4601
4602                     if (flags & SCF_DO_STCLASS) {
4603                         ssc_init(pRExC_state, &this_class);
4604                         data_fake.start_class = &this_class;
4605                         f = SCF_DO_STCLASS_AND;
4606                     }
4607                     if (flags & SCF_WHILEM_VISITED_POS)
4608                         f |= SCF_WHILEM_VISITED_POS;
4609
4610                     /* we suppose the run is continuous, last=next...*/
4611                     /* recurse study_chunk() for each BRANCH in an alternation */
4612                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4613                                       &deltanext, next, &data_fake, stopparen,
4614                                       recursed_depth, NULL, f, depth+1);
4615
4616                     if (min1 > minnext)
4617                         min1 = minnext;
4618                     if (deltanext == SSize_t_MAX) {
4619                         is_inf = is_inf_internal = 1;
4620                         max1 = SSize_t_MAX;
4621                     } else if (max1 < minnext + deltanext)
4622                         max1 = minnext + deltanext;
4623                     scan = next;
4624                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4625                         pars++;
4626                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4627                         if ( stopmin > minnext)
4628                             stopmin = min + min1;
4629                         flags &= ~SCF_DO_SUBSTR;
4630                         if (data)
4631                             data->flags |= SCF_SEEN_ACCEPT;
4632                     }
4633                     if (data) {
4634                         if (data_fake.flags & SF_HAS_EVAL)
4635                             data->flags |= SF_HAS_EVAL;
4636                         data->whilem_c = data_fake.whilem_c;
4637                     }
4638                     if (flags & SCF_DO_STCLASS)
4639                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4640                 }
4641                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4642                     min1 = 0;
4643                 if (flags & SCF_DO_SUBSTR) {
4644                     data->pos_min += min1;
4645                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4646                         data->pos_delta = SSize_t_MAX;
4647                     else
4648                         data->pos_delta += max1 - min1;
4649                     if (max1 != min1 || is_inf)
4650                         data->cur_is_floating = 1;
4651                 }
4652                 min += min1;
4653                 if (delta == SSize_t_MAX
4654                  || SSize_t_MAX - delta - (max1 - min1) < 0)
4655                     delta = SSize_t_MAX;
4656                 else
4657                     delta += max1 - min1;
4658                 if (flags & SCF_DO_STCLASS_OR) {
4659                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4660                     if (min1) {
4661                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4662                         flags &= ~SCF_DO_STCLASS;
4663                     }
4664                 }
4665                 else if (flags & SCF_DO_STCLASS_AND) {
4666                     if (min1) {
4667                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4668                         flags &= ~SCF_DO_STCLASS;
4669                     }
4670                     else {
4671                         /* Switch to OR mode: cache the old value of
4672                          * data->start_class */
4673                         INIT_AND_WITHP;
4674                         StructCopy(data->start_class, and_withp, regnode_ssc);
4675                         flags &= ~SCF_DO_STCLASS_AND;
4676                         StructCopy(&accum, data->start_class, regnode_ssc);
4677                         flags |= SCF_DO_STCLASS_OR;
4678                     }
4679                 }
4680
4681                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4682                         OP( startbranch ) == BRANCH )
4683                 {
4684                 /* demq.
4685
4686                    Assuming this was/is a branch we are dealing with: 'scan'
4687                    now points at the item that follows the branch sequence,
4688                    whatever it is. We now start at the beginning of the
4689                    sequence and look for subsequences of
4690
4691                    BRANCH->EXACT=>x1
4692                    BRANCH->EXACT=>x2
4693                    tail
4694
4695                    which would be constructed from a pattern like
4696                    /A|LIST|OF|WORDS/
4697
4698                    If we can find such a subsequence we need to turn the first
4699                    element into a trie and then add the subsequent branch exact
4700                    strings to the trie.
4701
4702                    We have two cases
4703
4704                      1. patterns where the whole set of branches can be
4705                         converted.
4706
4707                      2. patterns where only a subset can be converted.
4708
4709                    In case 1 we can replace the whole set with a single regop
4710                    for the trie. In case 2 we need to keep the start and end
4711                    branches so
4712
4713                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4714                      becomes BRANCH TRIE; BRANCH X;
4715
4716                   There is an additional case, that being where there is a
4717                   common prefix, which gets split out into an EXACT like node
4718                   preceding the TRIE node.
4719
4720                   If x(1..n)==tail then we can do a simple trie, if not we make
4721                   a "jump" trie, such that when we match the appropriate word
4722                   we "jump" to the appropriate tail node. Essentially we turn
4723                   a nested if into a case structure of sorts.
4724
4725                 */
4726
4727                     int made=0;
4728                     if (!re_trie_maxbuff) {
4729                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4730                         if (!SvIOK(re_trie_maxbuff))
4731                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4732                     }
4733                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4734                         regnode *cur;
4735                         regnode *first = (regnode *)NULL;
4736                         regnode *last = (regnode *)NULL;
4737                         regnode *tail = scan;
4738                         U8 trietype = 0;
4739                         U32 count=0;
4740
4741                         /* var tail is used because there may be a TAIL
4742                            regop in the way. Ie, the exacts will point to the
4743                            thing following the TAIL, but the last branch will
4744                            point at the TAIL. So we advance tail. If we
4745                            have nested (?:) we may have to move through several
4746                            tails.
4747                          */
4748
4749                         while ( OP( tail ) == TAIL ) {
4750                             /* this is the TAIL generated by (?:) */
4751                             tail = regnext( tail );
4752                         }
4753
4754
4755                         DEBUG_TRIE_COMPILE_r({
4756                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4757                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4758                               depth+1,
4759                               "Looking for TRIE'able sequences. Tail node is ",
4760                               (UV) REGNODE_OFFSET(tail),
4761                               SvPV_nolen_const( RExC_mysv )
4762                             );
4763                         });
4764
4765                         /*
4766
4767                             Step through the branches
4768                                 cur represents each branch,
4769                                 noper is the first thing to be matched as part
4770                                       of that branch
4771                                 noper_next is the regnext() of that node.
4772
4773                             We normally handle a case like this
4774                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4775                             support building with NOJUMPTRIE, which restricts
4776                             the trie logic to structures like /FOO|BAR/.
4777
4778                             If noper is a trieable nodetype then the branch is
4779                             a possible optimization target. If we are building
4780                             under NOJUMPTRIE then we require that noper_next is
4781                             the same as scan (our current position in the regex
4782                             program).
4783
4784                             Once we have two or more consecutive such branches
4785                             we can create a trie of the EXACT's contents and
4786                             stitch it in place into the program.
4787
4788                             If the sequence represents all of the branches in
4789                             the alternation we replace the entire thing with a
4790                             single TRIE node.
4791
4792                             Otherwise when it is a subsequence we need to
4793                             stitch it in place and replace only the relevant
4794                             branches. This means the first branch has to remain
4795                             as it is used by the alternation logic, and its
4796                             next pointer, and needs to be repointed at the item
4797                             on the branch chain following the last branch we
4798                             have optimized away.
4799
4800                             This could be either a BRANCH, in which case the
4801                             subsequence is internal, or it could be the item
4802                             following the branch sequence in which case the
4803                             subsequence is at the end (which does not
4804                             necessarily mean the first node is the start of the
4805                             alternation).
4806
4807                             TRIE_TYPE(X) is a define which maps the optype to a
4808                             trietype.
4809
4810                                 optype          |  trietype
4811                                 ----------------+-----------
4812                                 NOTHING         | NOTHING
4813                                 EXACT           | EXACT
4814                                 EXACT_ONLY8     | EXACT
4815                                 EXACTFU         | EXACTFU
4816                                 EXACTFU_ONLY8   | EXACTFU
4817                                 EXACTFUP        | EXACTFU
4818                                 EXACTFAA        | EXACTFAA
4819                                 EXACTL          | EXACTL
4820                                 EXACTFLU8       | EXACTFLU8
4821
4822
4823                         */
4824 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4825                        ? NOTHING                                            \
4826                        : ( EXACT == (X) || EXACT_ONLY8 == (X) )             \
4827                          ? EXACT                                            \
4828                          : (     EXACTFU == (X)                             \
4829                               || EXACTFU_ONLY8 == (X)                       \
4830                               || EXACTFUP == (X) )                          \
4831                            ? EXACTFU                                        \
4832                            : ( EXACTFAA == (X) )                            \
4833                              ? EXACTFAA                                     \
4834                              : ( EXACTL == (X) )                            \
4835                                ? EXACTL                                     \
4836                                : ( EXACTFLU8 == (X) )                       \
4837                                  ? EXACTFLU8                                \
4838                                  : 0 )
4839
4840                         /* dont use tail as the end marker for this traverse */
4841                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4842                             regnode * const noper = NEXTOPER( cur );
4843                             U8 noper_type = OP( noper );
4844                             U8 noper_trietype = TRIE_TYPE( noper_type );
4845 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4846                             regnode * const noper_next = regnext( noper );
4847                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4848                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4849 #endif
4850
4851                             DEBUG_TRIE_COMPILE_r({
4852                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4853                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
4854                                    depth+1,
4855                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4856
4857                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4858                                 Perl_re_printf( aTHX_  " -> %d:%s",
4859                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4860
4861                                 if ( noper_next ) {
4862                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4863                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4864                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4865                                 }
4866                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4867                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4868                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4869                                 );
4870                             });
4871
4872                             /* Is noper a trieable nodetype that can be merged
4873                              * with the current trie (if there is one)? */
4874                             if ( noper_trietype
4875                                   &&
4876                                   (
4877                                         ( noper_trietype == NOTHING )
4878                                         || ( trietype == NOTHING )
4879                                         || ( trietype == noper_trietype )
4880                                   )
4881 #ifdef NOJUMPTRIE
4882                                   && noper_next >= tail
4883 #endif
4884                                   && count < U16_MAX)
4885                             {
4886                                 /* Handle mergable triable node Either we are
4887                                  * the first node in a new trieable sequence,
4888                                  * in which case we do some bookkeeping,
4889                                  * otherwise we update the end pointer. */
4890                                 if ( !first ) {
4891                                     first = cur;
4892                                     if ( noper_trietype == NOTHING ) {
4893 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4894                                         regnode * const noper_next = regnext( noper );
4895                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4896                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4897 #endif
4898
4899                                         if ( noper_next_trietype ) {
4900                                             trietype = noper_next_trietype;
4901                                         } else if (noper_next_type)  {
4902                                             /* a NOTHING regop is 1 regop wide.
4903                                              * We need at least two for a trie
4904                                              * so we can't merge this in */
4905                                             first = NULL;
4906                                         }
4907                                     } else {
4908                                         trietype = noper_trietype;
4909                                     }
4910                                 } else {
4911                                     if ( trietype == NOTHING )
4912                                         trietype = noper_trietype;
4913                                     last = cur;
4914                                 }
4915                                 if (first)
4916                                     count++;
4917                             } /* end handle mergable triable node */
4918                             else {
4919                                 /* handle unmergable node -
4920                                  * noper may either be a triable node which can
4921                                  * not be tried together with the current trie,
4922                                  * or a non triable node */
4923                                 if ( last ) {
4924                                     /* If last is set and trietype is not
4925                                      * NOTHING then we have found at least two
4926                                      * triable branch sequences in a row of a
4927                                      * similar trietype so we can turn them
4928                                      * into a trie. If/when we allow NOTHING to
4929                                      * start a trie sequence this condition
4930                                      * will be required, and it isn't expensive
4931                                      * so we leave it in for now. */
4932                                     if ( trietype && trietype != NOTHING )
4933                                         make_trie( pRExC_state,
4934                                                 startbranch, first, cur, tail,
4935                                                 count, trietype, depth+1 );
4936                                     last = NULL; /* note: we clear/update
4937                                                     first, trietype etc below,
4938                                                     so we dont do it here */
4939                                 }
4940                                 if ( noper_trietype
4941 #ifdef NOJUMPTRIE
4942                                      && noper_next >= tail
4943 #endif
4944                                 ){
4945                                     /* noper is triable, so we can start a new
4946                                      * trie sequence */
4947                                     count = 1;
4948                                     first = cur;
4949                                     trietype = noper_trietype;
4950                                 } else if (first) {
4951                                     /* if we already saw a first but the
4952                                      * current node is not triable then we have
4953                                      * to reset the first information. */
4954                                     count = 0;
4955                                     first = NULL;
4956                                     trietype = 0;
4957                                 }
4958                             } /* end handle unmergable node */
4959                         } /* loop over branches */
4960                         DEBUG_TRIE_COMPILE_r({
4961                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4962                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
4963                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
4964                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
4965                                REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4966                                PL_reg_name[trietype]
4967                             );
4968
4969                         });
4970                         if ( last && trietype ) {
4971                             if ( trietype != NOTHING ) {
4972                                 /* the last branch of the sequence was part of
4973                                  * a trie, so we have to construct it here
4974                                  * outside of the loop */
4975                                 made= make_trie( pRExC_state, startbranch,
4976                                                  first, scan, tail, count,
4977                                                  trietype, depth+1 );
4978 #ifdef TRIE_STUDY_OPT
4979                                 if ( ((made == MADE_EXACT_TRIE &&
4980                                      startbranch == first)
4981                                      || ( first_non_open == first )) &&
4982                                      depth==0 ) {
4983                                     flags |= SCF_TRIE_RESTUDY;
4984                                     if ( startbranch == first
4985                                          && scan >= tail )
4986                                     {
4987                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4988                                     }
4989                                 }
4990 #endif
4991                             } else {
4992                                 /* at this point we know whatever we have is a
4993                                  * NOTHING sequence/branch AND if 'startbranch'
4994                                  * is 'first' then we can turn the whole thing
4995                                  * into a NOTHING
4996                                  */
4997                                 if ( startbranch == first ) {
4998                                     regnode *opt;
4999                                     /* the entire thing is a NOTHING sequence,
5000                                      * something like this: (?:|) So we can
5001                                      * turn it into a plain NOTHING op. */
5002                                     DEBUG_TRIE_COMPILE_r({
5003                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5004                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5005                                           depth+1,
5006                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5007
5008                                     });
5009                                     OP(startbranch)= NOTHING;
5010                                     NEXT_OFF(startbranch)= tail - startbranch;
5011                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
5012                                         OP(opt)= OPTIMIZED;
5013                                 }
5014                             }
5015                         } /* end if ( last) */
5016                     } /* TRIE_MAXBUF is non zero */
5017
5018                 } /* do trie */
5019
5020             }
5021             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
5022                 scan = NEXTOPER(NEXTOPER(scan));
5023             } else                      /* single branch is optimized. */
5024                 scan = NEXTOPER(scan);
5025             continue;
5026         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5027             I32 paren = 0;
5028             regnode *start = NULL;
5029             regnode *end = NULL;
5030             U32 my_recursed_depth= recursed_depth;
5031
5032             if (OP(scan) != SUSPEND) { /* GOSUB */
5033                 /* Do setup, note this code has side effects beyond
5034                  * the rest of this block. Specifically setting
5035                  * RExC_recurse[] must happen at least once during
5036                  * study_chunk(). */
5037                 paren = ARG(scan);
5038                 RExC_recurse[ARG2L(scan)] = scan;
5039                 start = REGNODE_p(RExC_open_parens[paren]);
5040                 end   = REGNODE_p(RExC_close_parens[paren]);
5041
5042                 /* NOTE we MUST always execute the above code, even
5043                  * if we do nothing with a GOSUB */
5044                 if (
5045                     ( flags & SCF_IN_DEFINE )
5046                     ||
5047                     (
5048                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5049                         &&
5050                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5051                     )
5052                 ) {
5053                     /* no need to do anything here if we are in a define. */
5054                     /* or we are after some kind of infinite construct
5055                      * so we can skip recursing into this item.
5056                      * Since it is infinite we will not change the maxlen
5057                      * or delta, and if we miss something that might raise
5058                      * the minlen it will merely pessimise a little.
5059                      *
5060                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5061                      * might result in a minlen of 1 and not of 4,
5062                      * but this doesn't make us mismatch, just try a bit
5063                      * harder than we should.
5064                      * */
5065                     scan= regnext(scan);
5066                     continue;
5067                 }
5068
5069                 if (
5070                     !recursed_depth
5071                     ||
5072                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
5073                 ) {
5074                     /* it is quite possible that there are more efficient ways
5075                      * to do this. We maintain a bitmap per level of recursion
5076                      * of which patterns we have entered so we can detect if a
5077                      * pattern creates a possible infinite loop. When we
5078                      * recurse down a level we copy the previous levels bitmap
5079                      * down. When we are at recursion level 0 we zero the top
5080                      * level bitmap. It would be nice to implement a different
5081                      * more efficient way of doing this. In particular the top
5082                      * level bitmap may be unnecessary.
5083                      */
5084                     if (!recursed_depth) {
5085                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5086                     } else {
5087                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
5088                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
5089                              RExC_study_chunk_recursed_bytes, U8);
5090                     }
5091                     /* we havent recursed into this paren yet, so recurse into it */
5092                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5093                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
5094                     my_recursed_depth= recursed_depth + 1;
5095                 } else {
5096                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5097                     /* some form of infinite recursion, assume infinite length
5098                      * */
5099                     if (flags & SCF_DO_SUBSTR) {
5100                         scan_commit(pRExC_state, data, minlenp, is_inf);
5101                         data->cur_is_floating = 1;
5102                     }
5103                     is_inf = is_inf_internal = 1;
5104                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5105                         ssc_anything(data->start_class);
5106                     flags &= ~SCF_DO_STCLASS;
5107
5108                     start= NULL; /* reset start so we dont recurse later on. */
5109                 }
5110             } else {
5111                 paren = stopparen;
5112                 start = scan + 2;
5113                 end = regnext(scan);
5114             }
5115             if (start) {
5116                 scan_frame *newframe;
5117                 assert(end);
5118                 if (!RExC_frame_last) {
5119                     Newxz(newframe, 1, scan_frame);
5120                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5121                     RExC_frame_head= newframe;
5122                     RExC_frame_count++;
5123                 } else if (!RExC_frame_last->next_frame) {
5124                     Newxz(newframe, 1, scan_frame);
5125                     RExC_frame_last->next_frame= newframe;
5126                     newframe->prev_frame= RExC_frame_last;
5127                     RExC_frame_count++;
5128                 } else {
5129                     newframe= RExC_frame_last->next_frame;
5130                 }
5131                 RExC_frame_last= newframe;
5132
5133                 newframe->next_regnode = regnext(scan);
5134                 newframe->last_regnode = last;
5135                 newframe->stopparen = stopparen;
5136                 newframe->prev_recursed_depth = recursed_depth;
5137                 newframe->this_prev_frame= frame;
5138
5139                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5140                 DEBUG_PEEP("fnew", scan, depth, flags);
5141
5142                 frame = newframe;
5143                 scan =  start;
5144                 stopparen = paren;
5145                 last = end;
5146                 depth = depth + 1;
5147                 recursed_depth= my_recursed_depth;
5148
5149                 continue;
5150             }
5151         }
5152         else if (   OP(scan) == EXACT
5153                  || OP(scan) == EXACT_ONLY8
5154                  || OP(scan) == EXACTL)
5155         {
5156             SSize_t l = STR_LEN(scan);
5157             UV uc;
5158             assert(l);
5159             if (UTF) {
5160                 const U8 * const s = (U8*)STRING(scan);
5161                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
5162                 l = utf8_length(s, s + l);
5163             } else {
5164                 uc = *((U8*)STRING(scan));
5165             }
5166             min += l;
5167             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5168                 /* The code below prefers earlier match for fixed
5169                    offset, later match for variable offset.  */
5170                 if (data->last_end == -1) { /* Update the start info. */
5171                     data->last_start_min = data->pos_min;
5172                     data->last_start_max = is_inf
5173                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
5174                 }
5175                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
5176                 if (UTF)
5177                     SvUTF8_on(data->last_found);
5178                 {
5179                     SV * const sv = data->last_found;
5180                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5181                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5182                     if (mg && mg->mg_len >= 0)
5183                         mg->mg_len += utf8_length((U8*)STRING(scan),
5184                                               (U8*)STRING(scan)+STR_LEN(scan));
5185                 }
5186                 data->last_end = data->pos_min + l;
5187                 data->pos_min += l; /* As in the first entry. */
5188                 data->flags &= ~SF_BEFORE_EOL;
5189             }
5190
5191             /* ANDing the code point leaves at most it, and not in locale, and
5192              * can't match null string */
5193             if (flags & SCF_DO_STCLASS_AND) {
5194                 ssc_cp_and(data->start_class, uc);
5195                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5196                 ssc_clear_locale(data->start_class);
5197             }
5198             else if (flags & SCF_DO_STCLASS_OR) {
5199                 ssc_add_cp(data->start_class, uc);
5200                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5201
5202                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5203                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5204             }
5205             flags &= ~SCF_DO_STCLASS;
5206         }
5207         else if (PL_regkind[OP(scan)] == EXACT) {
5208             /* But OP != EXACT!, so is EXACTFish */
5209             SSize_t l = STR_LEN(scan);
5210             const U8 * s = (U8*)STRING(scan);
5211
5212             /* Search for fixed substrings supports EXACT only. */
5213             if (flags & SCF_DO_SUBSTR) {
5214                 assert(data);
5215                 scan_commit(pRExC_state, data, minlenp, is_inf);
5216             }
5217             if (UTF) {
5218                 l = utf8_length(s, s + l);
5219             }
5220             if (unfolded_multi_char) {
5221                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5222             }
5223             min += l - min_subtract;
5224             assert (min >= 0);
5225             delta += min_subtract;
5226             if (flags & SCF_DO_SUBSTR) {
5227                 data->pos_min += l - min_subtract;
5228                 if (data->pos_min < 0) {
5229                     data->pos_min = 0;
5230                 }
5231                 data->pos_delta += min_subtract;
5232                 if (min_subtract) {
5233                     data->cur_is_floating = 1; /* float */
5234                 }
5235             }
5236
5237             if (flags & SCF_DO_STCLASS) {
5238                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
5239
5240                 assert(EXACTF_invlist);
5241                 if (flags & SCF_DO_STCLASS_AND) {
5242                     if (OP(scan) != EXACTFL)
5243                         ssc_clear_locale(data->start_class);
5244                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5245                     ANYOF_POSIXL_ZERO(data->start_class);
5246                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5247                 }
5248                 else {  /* SCF_DO_STCLASS_OR */
5249                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
5250                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5251
5252                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5253                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5254                 }
5255                 flags &= ~SCF_DO_STCLASS;
5256                 SvREFCNT_dec(EXACTF_invlist);
5257             }
5258         }
5259         else if (REGNODE_VARIES(OP(scan))) {
5260             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5261             I32 fl = 0, f = flags;
5262             regnode * const oscan = scan;
5263             regnode_ssc this_class;
5264             regnode_ssc *oclass = NULL;
5265             I32 next_is_eval = 0;
5266
5267             switch (PL_regkind[OP(scan)]) {
5268             case WHILEM:                /* End of (?:...)* . */
5269                 scan = NEXTOPER(scan);
5270                 goto finish;
5271             case PLUS:
5272                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5273                     next = NEXTOPER(scan);
5274                     if (   OP(next) == EXACT
5275                         || OP(next) == EXACT_ONLY8
5276                         || OP(next) == EXACTL
5277                         || (flags & SCF_DO_STCLASS))
5278                     {
5279                         mincount = 1;
5280                         maxcount = REG_INFTY;
5281                         next = regnext(scan);
5282                         scan = NEXTOPER(scan);
5283                         goto do_curly;
5284                     }
5285                 }
5286                 if (flags & SCF_DO_SUBSTR)
5287                     data->pos_min++;
5288                 min++;
5289                 /* FALLTHROUGH */
5290             case STAR:
5291                 next = NEXTOPER(scan);
5292
5293                 /* This temporary node can now be turned into EXACTFU, and
5294                  * must, as regexec.c doesn't handle it */
5295                 if (OP(next) == EXACTFU_S_EDGE) {
5296                     OP(next) = EXACTFU;
5297                 }
5298
5299                 if (     STR_LEN(next) == 1
5300                     &&   isALPHA_A(* STRING(next))
5301                     && (         OP(next) == EXACTFAA
5302                         || (     OP(next) == EXACTFU
5303                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
5304                 {
5305                     /* These differ in just one bit */
5306                     U8 mask = ~ ('A' ^ 'a');
5307
5308                     assert(isALPHA_A(* STRING(next)));
5309
5310                     /* Then replace it by an ANYOFM node, with
5311                     * the mask set to the complement of the
5312                     * bit that differs between upper and lower
5313                     * case, and the lowest code point of the
5314                     * pair (which the '&' forces) */
5315                     OP(next) = ANYOFM;
5316                     ARG_SET(next, *STRING(next) & mask);
5317                     FLAGS(next) = mask;
5318                 }
5319
5320                 if (flags & SCF_DO_STCLASS) {
5321                     mincount = 0;
5322                     maxcount = REG_INFTY;
5323                     next = regnext(scan);
5324                     scan = NEXTOPER(scan);
5325                     goto do_curly;
5326                 }
5327                 if (flags & SCF_DO_SUBSTR) {
5328                     scan_commit(pRExC_state, data, minlenp, is_inf);
5329                     /* Cannot extend fixed substrings */
5330                     data->cur_is_floating = 1; /* float */
5331                 }
5332                 is_inf = is_inf_internal = 1;
5333                 scan = regnext(scan);
5334                 goto optimize_curly_tail;
5335             case CURLY:
5336                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5337                     && (scan->flags == stopparen))
5338                 {
5339                     mincount = 1;
5340                     maxcount = 1;
5341                 } else {
5342                     mincount = ARG1(scan);
5343                     maxcount = ARG2(scan);
5344                 }
5345                 next = regnext(scan);
5346                 if (OP(scan) == CURLYX) {
5347                     I32 lp = (data ? *(data->last_closep) : 0);
5348                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5349                 }
5350                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5351                 next_is_eval = (OP(scan) == EVAL);
5352               do_curly:
5353                 if (flags & SCF_DO_SUBSTR) {
5354                     if (mincount == 0)
5355                         scan_commit(pRExC_state, data, minlenp, is_inf);
5356                     /* Cannot extend fixed substrings */
5357                     pos_before = data->pos_min;
5358                 }
5359                 if (data) {
5360                     fl = data->flags;
5361                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5362                     if (is_inf)
5363                         data->flags |= SF_IS_INF;
5364                 }
5365                 if (flags & SCF_DO_STCLASS) {
5366                     ssc_init(pRExC_state, &this_class);
5367                     oclass = data->start_class;
5368                     data->start_class = &this_class;
5369                     f |= SCF_DO_STCLASS_AND;
5370                     f &= ~SCF_DO_STCLASS_OR;
5371                 }
5372                 /* Exclude from super-linear cache processing any {n,m}
5373                    regops for which the combination of input pos and regex
5374                    pos is not enough information to determine if a match
5375                    will be possible.
5376
5377                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
5378                    regex pos at the \s*, the prospects for a match depend not
5379                    only on the input position but also on how many (bar\s*)
5380                    repeats into the {4,8} we are. */
5381                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5382                     f &= ~SCF_WHILEM_VISITED_POS;
5383
5384                 /* This will finish on WHILEM, setting scan, or on NULL: */
5385                 /* recurse study_chunk() on loop bodies */
5386                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5387                                   last, data, stopparen, recursed_depth, NULL,
5388                                   (mincount == 0
5389                                    ? (f & ~SCF_DO_SUBSTR)
5390                                    : f)
5391                                   ,depth+1);
5392
5393                 if (flags & SCF_DO_STCLASS)
5394                     data->start_class = oclass;
5395                 if (mincount == 0 || minnext == 0) {
5396                     if (flags & SCF_DO_STCLASS_OR) {
5397                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5398                     }
5399                     else if (flags & SCF_DO_STCLASS_AND) {
5400                         /* Switch to OR mode: cache the old value of
5401                          * data->start_class */
5402                         INIT_AND_WITHP;
5403                         StructCopy(data->start_class, and_withp, regnode_ssc);
5404                         flags &= ~SCF_DO_STCLASS_AND;
5405                         StructCopy(&this_class, data->start_class, regnode_ssc);
5406                         flags |= SCF_DO_STCLASS_OR;
5407                         ANYOF_FLAGS(data->start_class)
5408                                                 |= SSC_MATCHES_EMPTY_STRING;
5409                     }
5410                 } else {                /* Non-zero len */
5411                     if (flags & SCF_DO_STCLASS_OR) {
5412                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5413                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5414                     }
5415                     else if (flags & SCF_DO_STCLASS_AND)
5416                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5417                     flags &= ~SCF_DO_STCLASS;
5418                 }
5419                 if (!scan)              /* It was not CURLYX, but CURLY. */
5420                     scan = next;
5421                 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5422                     /* ? quantifier ok, except for (?{ ... }) */
5423                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
5424                     && (minnext == 0) && (deltanext == 0)
5425                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5426                     && maxcount <= REG_INFTY/3) /* Complement check for big
5427                                                    count */
5428                 {
5429                     _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5430                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5431                             "Quantifier unexpected on zero-length expression "
5432                             "in regex m/%" UTF8f "/",
5433                              UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5434                                   RExC_precomp)));
5435                 }
5436
5437                 min += minnext * mincount;
5438                 is_inf_internal |= deltanext == SSize_t_MAX
5439                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5440                 is_inf |= is_inf_internal;
5441                 if (is_inf) {
5442                     delta = SSize_t_MAX;
5443                 } else {
5444                     delta += (minnext + deltanext) * maxcount
5445                              - minnext * mincount;
5446                 }
5447                 /* Try powerful optimization CURLYX => CURLYN. */
5448                 if (  OP(oscan) == CURLYX && data
5449                       && data->flags & SF_IN_PAR
5450                       && !(data->flags & SF_HAS_EVAL)
5451                       && !deltanext && minnext == 1 ) {
5452                     /* Try to optimize to CURLYN.  */
5453                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5454                     regnode * const nxt1 = nxt;
5455 #ifdef DEBUGGING
5456                     regnode *nxt2;
5457 #endif
5458
5459                     /* Skip open. */
5460                     nxt = regnext(nxt);
5461                     if (!REGNODE_SIMPLE(OP(nxt))
5462                         && !(PL_regkind[OP(nxt)] == EXACT
5463                              && STR_LEN(nxt) == 1))
5464                         goto nogo;
5465 #ifdef DEBUGGING
5466                     nxt2 = nxt;
5467 #endif
5468                     nxt = regnext(nxt);
5469                     if (OP(nxt) != CLOSE)
5470                         goto nogo;
5471                     if (RExC_open_parens) {
5472
5473                         /*open->CURLYM*/
5474                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5475
5476                         /*close->while*/
5477                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5478                     }
5479                     /* Now we know that nxt2 is the only contents: */
5480                     oscan->flags = (U8)ARG(nxt);
5481                     OP(oscan) = CURLYN;
5482                     OP(nxt1) = NOTHING; /* was OPEN. */
5483
5484 #ifdef DEBUGGING
5485                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5486                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5487                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5488                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
5489                     OP(nxt + 1) = OPTIMIZED; /* was count. */
5490                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5491 #endif
5492                 }
5493               nogo:
5494
5495                 /* Try optimization CURLYX => CURLYM. */
5496                 if (  OP(oscan) == CURLYX && data
5497                       && !(data->flags & SF_HAS_PAR)
5498                       && !(data->flags & SF_HAS_EVAL)
5499                       && !deltanext     /* atom is fixed width */
5500                       && minnext != 0   /* CURLYM can't handle zero width */
5501
5502                          /* Nor characters whose fold at run-time may be
5503                           * multi-character */
5504                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5505                 ) {
5506                     /* XXXX How to optimize if data == 0? */
5507                     /* Optimize to a simpler form.  */
5508                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5509                     regnode *nxt2;
5510
5511                     OP(oscan) = CURLYM;
5512                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5513                             && (OP(nxt2) != WHILEM))
5514                         nxt = nxt2;
5515                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5516                     /* Need to optimize away parenths. */
5517                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5518                         /* Set the parenth number.  */
5519                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5520
5521                         oscan->flags = (U8)ARG(nxt);
5522                         if (RExC_open_parens) {
5523                              /*open->CURLYM*/
5524                             RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5525
5526                             /*close->NOTHING*/
5527                             RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5528                                                          + 1;
5529                         }
5530                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
5531                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
5532
5533 #ifdef DEBUGGING
5534                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5535                         OP(nxt + 1) = OPTIMIZED; /* was count. */
5536                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5537                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5538 #endif
5539 #if 0
5540                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
5541                             regnode *nnxt = regnext(nxt1);
5542                             if (nnxt == nxt) {
5543                                 if (reg_off_by_arg[OP(nxt1)])
5544                                     ARG_SET(nxt1, nxt2 - nxt1);
5545                                 else if (nxt2 - nxt1 < U16_MAX)
5546                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
5547                                 else
5548                                     OP(nxt) = NOTHING;  /* Cannot beautify */
5549                             }
5550                             nxt1 = nnxt;
5551                         }
5552 #endif
5553                         /* Optimize again: */
5554                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5555                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5556                                     NULL, stopparen, recursed_depth, NULL, 0,
5557                                     depth+1);
5558                     }
5559                     else
5560                         oscan->flags = 0;
5561                 }
5562                 else if ((OP(oscan) == CURLYX)
5563                          && (flags & SCF_WHILEM_VISITED_POS)
5564                          /* See the comment on a similar expression above.
5565                             However, this time it's not a subexpression
5566                             we care about, but the expression itself. */
5567                          && (maxcount == REG_INFTY)
5568                          && data) {
5569                     /* This stays as CURLYX, we can put the count/of pair. */
5570                     /* Find WHILEM (as in regexec.c) */
5571                     regnode *nxt = oscan + NEXT_OFF(oscan);
5572
5573                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5574                         nxt += ARG(nxt);
5575                     nxt = PREVOPER(nxt);
5576                     if (nxt->flags & 0xf) {
5577                         /* we've already set whilem count on this node */
5578                     } else if (++data->whilem_c < 16) {
5579                         assert(data->whilem_c <= RExC_whilem_seen);
5580                         nxt->flags = (U8)(data->whilem_c
5581                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5582                     }
5583                 }
5584                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5585                     pars++;
5586                 if (flags & SCF_DO_SUBSTR) {
5587                     SV *last_str = NULL;
5588                     STRLEN last_chrs = 0;
5589                     int counted = mincount != 0;
5590
5591                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5592                                                                   string. */
5593                         SSize_t b = pos_before >= data->last_start_min
5594                             ? pos_before : data->last_start_min;
5595                         STRLEN l;
5596                         const char * const s = SvPV_const(data->last_found, l);
5597                         SSize_t old = b - data->last_start_min;
5598
5599                         if (UTF)
5600                             old = utf8_hop((U8*)s, old) - (U8*)s;
5601                         l -= old;
5602                         /* Get the added string: */
5603                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5604                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5605                                             (U8*)(s + old + l)) : l;
5606                         if (deltanext == 0 && pos_before == b) {
5607                             /* What was added is a constant string */
5608                             if (mincount > 1) {
5609
5610                                 SvGROW(last_str, (mincount * l) + 1);
5611                                 repeatcpy(SvPVX(last_str) + l,
5612                                           SvPVX_const(last_str), l,
5613                                           mincount - 1);
5614                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5615                                 /* Add additional parts. */
5616                                 SvCUR_set(data->last_found,
5617                                           SvCUR(data->last_found) - l);
5618                                 sv_catsv(data->last_found, last_str);
5619                                 {
5620                                     SV * sv = data->last_found;
5621                                     MAGIC *mg =
5622                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5623                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5624                                     if (mg && mg->mg_len >= 0)
5625                                         mg->mg_len += last_chrs * (mincount-1);
5626                                 }
5627                                 last_chrs *= mincount;
5628                                 data->last_end += l * (mincount - 1);
5629                             }
5630                         } else {
5631                             /* start offset must point into the last copy */
5632                             data->last_start_min += minnext * (mincount - 1);
5633                             data->last_start_max =
5634                               is_inf
5635                                ? SSize_t_MAX
5636                                : data->last_start_max +
5637                                  (maxcount - 1) * (minnext + data->pos_delta);
5638                         }
5639                     }
5640                     /* It is counted once already... */
5641                     data->pos_min += minnext * (mincount - counted);
5642 #if 0
5643 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5644                               " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5645                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5646     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5647     (UV)mincount);
5648 if (deltanext != SSize_t_MAX)
5649 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5650     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5651           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5652 #endif
5653                     if (deltanext == SSize_t_MAX
5654                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5655                         data->pos_delta = SSize_t_MAX;
5656                     else
5657                         data->pos_delta += - counted * deltanext +
5658                         (minnext + deltanext) * maxcount - minnext * mincount;
5659                     if (mincount != maxcount) {
5660                          /* Cannot extend fixed substrings found inside
5661                             the group.  */
5662                         scan_commit(pRExC_state, data, minlenp, is_inf);
5663                         if (mincount && last_str) {
5664                             SV * const sv = data->last_found;
5665                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5666                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5667
5668                             if (mg)
5669                                 mg->mg_len = -1;
5670                             sv_setsv(sv, last_str);
5671                             data->last_end = data->pos_min;
5672                             data->last_start_min = data->pos_min - last_chrs;
5673                             data->last_start_max = is_inf
5674                                 ? SSize_t_MAX
5675                                 : data->pos_min + data->pos_delta - last_chrs;
5676                         }
5677                         data->cur_is_floating = 1; /* float */
5678                     }
5679                     SvREFCNT_dec(last_str);
5680                 }
5681                 if (data && (fl & SF_HAS_EVAL))
5682                     data->flags |= SF_HAS_EVAL;
5683               optimize_curly_tail:
5684                 if (OP(oscan) != CURLYX) {
5685                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5686                            && NEXT_OFF(next))
5687                         NEXT_OFF(oscan) += NEXT_OFF(next);
5688                 }
5689                 continue;
5690
5691             default:
5692 #ifdef DEBUGGING
5693                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5694                                                                     OP(scan));
5695 #endif
5696             case REF:
5697             case CLUMP:
5698                 if (flags & SCF_DO_SUBSTR) {
5699                     /* Cannot expect anything... */
5700                     scan_commit(pRExC_state, data, minlenp, is_inf);
5701                     data->cur_is_floating = 1; /* float */
5702                 }
5703                 is_inf = is_inf_internal = 1;
5704                 if (flags & SCF_DO_STCLASS_OR) {
5705                     if (OP(scan) == CLUMP) {
5706                         /* Actually is any start char, but very few code points
5707                          * aren't start characters */
5708                         ssc_match_all_cp(data->start_class);
5709                     }
5710                     else {
5711                         ssc_anything(data->start_class);
5712                     }
5713                 }
5714                 flags &= ~SCF_DO_STCLASS;
5715                 break;
5716             }
5717         }
5718         else if (OP(scan) == LNBREAK) {
5719             if (flags & SCF_DO_STCLASS) {
5720                 if (flags & SCF_DO_STCLASS_AND) {
5721                     ssc_intersection(data->start_class,
5722                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5723                     ssc_clear_locale(data->start_class);
5724                     ANYOF_FLAGS(data->start_class)
5725                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5726                 }
5727                 else if (flags & SCF_DO_STCLASS_OR) {
5728                     ssc_union(data->start_class,
5729                               PL_XPosix_ptrs[_CC_VERTSPACE],
5730                               FALSE);
5731                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5732
5733                     /* See commit msg for
5734                      * 749e076fceedeb708a624933726e7989f2302f6a */
5735                     ANYOF_FLAGS(data->start_class)
5736                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5737                 }
5738                 flags &= ~SCF_DO_STCLASS;
5739             }
5740             min++;
5741             if (delta != SSize_t_MAX)
5742                 delta++;    /* Because of the 2 char string cr-lf */
5743             if (flags & SCF_DO_SUBSTR) {
5744                 /* Cannot expect anything... */
5745                 scan_commit(pRExC_state, data, minlenp, is_inf);
5746                 data->pos_min += 1;
5747                 if (data->pos_delta != SSize_t_MAX) {
5748                     data->pos_delta += 1;
5749                 }
5750                 data->cur_is_floating = 1; /* float */
5751             }
5752         }
5753         else if (REGNODE_SIMPLE(OP(scan))) {
5754
5755             if (flags & SCF_DO_SUBSTR) {
5756                 scan_commit(pRExC_state, data, minlenp, is_inf);
5757                 data->pos_min++;
5758             }
5759             min++;
5760             if (flags & SCF_DO_STCLASS) {
5761                 bool invert = 0;
5762                 SV* my_invlist = NULL;
5763                 U8 namedclass;
5764
5765                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5766                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5767
5768                 /* Some of the logic below assumes that switching
5769                    locale on will only add false positives. */
5770                 switch (OP(scan)) {
5771
5772                 default:
5773 #ifdef DEBUGGING
5774                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5775                                                                      OP(scan));
5776 #endif
5777                 case SANY:
5778                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5779                         ssc_match_all_cp(data->start_class);
5780                     break;
5781
5782                 case REG_ANY:
5783                     {
5784                         SV* REG_ANY_invlist = _new_invlist(2);
5785                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5786                                                             '\n');
5787                         if (flags & SCF_DO_STCLASS_OR) {
5788                             ssc_union(data->start_class,
5789                                       REG_ANY_invlist,
5790                                       TRUE /* TRUE => invert, hence all but \n
5791                                             */
5792                                       );
5793                         }
5794                         else if (flags & SCF_DO_STCLASS_AND) {
5795                             ssc_intersection(data->start_class,
5796                                              REG_ANY_invlist,
5797                                              TRUE  /* TRUE => invert */
5798                                              );
5799                             ssc_clear_locale(data->start_class);
5800                         }
5801                         SvREFCNT_dec_NN(REG_ANY_invlist);
5802                     }
5803                     break;
5804
5805                 case ANYOFD:
5806                 case ANYOFL:
5807                 case ANYOFPOSIXL:
5808                 case ANYOFH:
5809                 case ANYOF:
5810                     if (flags & SCF_DO_STCLASS_AND)
5811                         ssc_and(pRExC_state, data->start_class,
5812                                 (regnode_charclass *) scan);
5813                     else
5814                         ssc_or(pRExC_state, data->start_class,
5815                                                           (regnode_charclass *) scan);
5816                     break;
5817
5818                 case NANYOFM:
5819                 case ANYOFM:
5820                   {
5821                     SV* cp_list = get_ANYOFM_contents(scan);
5822
5823                     if (flags & SCF_DO_STCLASS_OR) {
5824                         ssc_union(data->start_class, cp_list, invert);
5825                     }
5826                     else if (flags & SCF_DO_STCLASS_AND) {
5827                         ssc_intersection(data->start_class, cp_list, invert);
5828                     }
5829
5830                     SvREFCNT_dec_NN(cp_list);
5831                     break;
5832                   }
5833
5834                 case NPOSIXL:
5835                     invert = 1;
5836                     /* FALLTHROUGH */
5837
5838                 case POSIXL:
5839                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5840                     if (flags & SCF_DO_STCLASS_AND) {
5841                         bool was_there = cBOOL(
5842                                           ANYOF_POSIXL_TEST(data->start_class,
5843                                                                  namedclass));
5844                         ANYOF_POSIXL_ZERO(data->start_class);
5845                         if (was_there) {    /* Do an AND */
5846                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5847                         }
5848                         /* No individual code points can now match */
5849                         data->start_class->invlist
5850                                                 = sv_2mortal(_new_invlist(0));
5851                     }
5852                     else {
5853                         int complement = namedclass + ((invert) ? -1 : 1);
5854
5855                         assert(flags & SCF_DO_STCLASS_OR);
5856
5857                         /* If the complement of this class was already there,
5858                          * the result is that they match all code points,
5859                          * (\d + \D == everything).  Remove the classes from
5860                          * future consideration.  Locale is not relevant in
5861                          * this case */
5862                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5863                             ssc_match_all_cp(data->start_class);
5864                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5865                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5866                         }
5867                         else {  /* The usual case; just add this class to the
5868                                    existing set */
5869                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5870                         }
5871                     }
5872                     break;
5873
5874                 case NPOSIXA:   /* For these, we always know the exact set of
5875                                    what's matched */
5876                     invert = 1;
5877                     /* FALLTHROUGH */
5878                 case POSIXA:
5879                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
5880                     goto join_posix_and_ascii;
5881
5882                 case NPOSIXD:
5883                 case NPOSIXU:
5884                     invert = 1;
5885                     /* FALLTHROUGH */
5886                 case POSIXD:
5887                 case POSIXU:
5888                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
5889
5890                     /* NPOSIXD matches all upper Latin1 code points unless the
5891                      * target string being matched is UTF-8, which is
5892                      * unknowable until match time.  Since we are going to
5893                      * invert, we want to get rid of all of them so that the
5894                      * inversion will match all */
5895                     if (OP(scan) == NPOSIXD) {
5896                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5897                                           &my_invlist);
5898                     }
5899
5900                   join_posix_and_ascii:
5901
5902                     if (flags & SCF_DO_STCLASS_AND) {
5903                         ssc_intersection(data->start_class, my_invlist, invert);
5904                         ssc_clear_locale(data->start_class);
5905                     }
5906                     else {
5907                         assert(flags & SCF_DO_STCLASS_OR);
5908                         ssc_union(data->start_class, my_invlist, invert);
5909                     }
5910                     SvREFCNT_dec(my_invlist);
5911                 }
5912                 if (flags & SCF_DO_STCLASS_OR)
5913                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5914                 flags &= ~SCF_DO_STCLASS;
5915             }
5916         }
5917         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5918             data->flags |= (OP(scan) == MEOL
5919                             ? SF_BEFORE_MEOL
5920                             : SF_BEFORE_SEOL);
5921             scan_commit(pRExC_state, data, minlenp, is_inf);
5922
5923         }
5924         else if (  PL_regkind[OP(scan)] == BRANCHJ
5925                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5926                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5927                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5928         {
5929             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5930                 || OP(scan) == UNLESSM )
5931             {
5932                 /* Negative Lookahead/lookbehind
5933                    In this case we can't do fixed string optimisation.
5934                 */
5935
5936                 SSize_t deltanext, minnext, fake = 0;
5937                 regnode *nscan;
5938                 regnode_ssc intrnl;
5939                 int f = 0;
5940
5941                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5942                 if (data) {
5943                     data_fake.whilem_c = data->whilem_c;
5944                     data_fake.last_closep = data->last_closep;
5945                 }
5946                 else
5947                     data_fake.last_closep = &fake;
5948                 data_fake.pos_delta = delta;
5949                 if ( flags & SCF_DO_STCLASS && !scan->flags
5950                      && OP(scan) == IFMATCH ) { /* Lookahead */
5951                     ssc_init(pRExC_state, &intrnl);
5952                     data_fake.start_class = &intrnl;
5953                     f |= SCF_DO_STCLASS_AND;
5954                 }
5955                 if (flags & SCF_WHILEM_VISITED_POS)
5956                     f |= SCF_WHILEM_VISITED_POS;
5957                 next = regnext(scan);
5958                 nscan = NEXTOPER(NEXTOPER(scan));
5959
5960                 /* recurse study_chunk() for lookahead body */
5961                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5962                                       last, &data_fake, stopparen,
5963                                       recursed_depth, NULL, f, depth+1);
5964                 if (scan->flags) {
5965                     if (deltanext) {
5966                         FAIL("Variable length lookbehind not implemented");
5967                     }
5968                     else if (minnext > (I32)U8_MAX) {
5969                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
5970                               (UV)U8_MAX);
5971                     }
5972                     scan->flags = (U8)minnext;
5973                 }
5974                 if (data) {
5975                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5976                         pars++;
5977                     if (data_fake.flags & SF_HAS_EVAL)
5978                         data->flags |= SF_HAS_EVAL;
5979                     data->whilem_c = data_fake.whilem_c;
5980                 }
5981                 if (f & SCF_DO_STCLASS_AND) {
5982                     if (flags & SCF_DO_STCLASS_OR) {
5983                         /* OR before, AND after: ideally we would recurse with
5984                          * data_fake to get the AND applied by study of the
5985                          * remainder of the pattern, and then derecurse;
5986                          * *** HACK *** for now just treat as "no information".
5987                          * See [perl #56690].
5988                          */
5989                         ssc_init(pRExC_state, data->start_class);
5990                     }  else {
5991                         /* AND before and after: combine and continue.  These
5992                          * assertions are zero-length, so can match an EMPTY
5993                          * string */
5994                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5995                         ANYOF_FLAGS(data->start_class)
5996                                                    |= SSC_MATCHES_EMPTY_STRING;
5997                     }
5998                 }
5999             }
6000 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6001             else {
6002                 /* Positive Lookahead/lookbehind
6003                    In this case we can do fixed string optimisation,
6004                    but we must be careful about it. Note in the case of
6005                    lookbehind the positions will be offset by the minimum
6006                    length of the pattern, something we won't know about
6007                    until after the recurse.
6008                 */
6009                 SSize_t deltanext, fake = 0;
6010                 regnode *nscan;
6011                 regnode_ssc intrnl;
6012                 int f = 0;
6013                 /* We use SAVEFREEPV so that when the full compile
6014                     is finished perl will clean up the allocated
6015                     minlens when it's all done. This way we don't
6016                     have to worry about freeing them when we know
6017                     they wont be used, which would be a pain.
6018                  */
6019                 SSize_t *minnextp;
6020                 Newx( minnextp, 1, SSize_t );
6021                 SAVEFREEPV(minnextp);
6022
6023                 if (data) {
6024                     StructCopy(data, &data_fake, scan_data_t);
6025                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6026                         f |= SCF_DO_SUBSTR;
6027                         if (scan->flags)
6028                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6029                         data_fake.last_found=newSVsv(data->last_found);
6030                     }
6031                 }
6032                 else
6033                     data_fake.last_closep = &fake;
6034                 data_fake.flags = 0;
6035                 data_fake.substrs[0].flags = 0;
6036                 data_fake.substrs[1].flags = 0;
6037                 data_fake.pos_delta = delta;
6038                 if (is_inf)
6039                     data_fake.flags |= SF_IS_INF;
6040                 if ( flags & SCF_DO_STCLASS && !scan->flags
6041                      && OP(scan) == IFMATCH ) { /* Lookahead */
6042                     ssc_init(pRExC_state, &intrnl);
6043                     data_fake.start_class = &intrnl;
6044                     f |= SCF_DO_STCLASS_AND;
6045                 }
6046                 if (flags & SCF_WHILEM_VISITED_POS)
6047                     f |= SCF_WHILEM_VISITED_POS;
6048                 next = regnext(scan);
6049                 nscan = NEXTOPER(NEXTOPER(scan));
6050
6051                 /* positive lookahead study_chunk() recursion */
6052                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6053                                         &deltanext, last, &data_fake,
6054                                         stopparen, recursed_depth, NULL,
6055                                         f, depth+1);
6056                 if (scan->flags) {
6057                     if (deltanext) {
6058                         FAIL("Variable length lookbehind not implemented");
6059                     }
6060                     else if (*minnextp > (I32)U8_MAX) {
6061                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6062                               (UV)U8_MAX);
6063                     }
6064                     scan->flags = (U8)*minnextp;
6065                 }
6066
6067                 *minnextp += min;
6068
6069                 if (f & SCF_DO_STCLASS_AND) {
6070                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6071                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6072                 }
6073                 if (data) {
6074                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6075                         pars++;
6076                     if (data_fake.flags & SF_HAS_EVAL)
6077                         data->flags |= SF_HAS_EVAL;
6078                     data->whilem_c = data_fake.whilem_c;
6079                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6080                         int i;
6081                         if (RExC_rx->minlen<*minnextp)
6082                             RExC_rx->minlen=*minnextp;
6083                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6084                         SvREFCNT_dec_NN(data_fake.last_found);
6085
6086                         for (i = 0; i < 2; i++) {
6087                             if (data_fake.substrs[i].minlenp != minlenp) {
6088                                 data->substrs[i].min_offset =
6089                                             data_fake.substrs[i].min_offset;
6090                                 data->substrs[i].max_offset =
6091                                             data_fake.substrs[i].max_offset;
6092                                 data->substrs[i].minlenp =
6093                                             data_fake.substrs[i].minlenp;
6094                                 data->substrs[i].lookbehind += scan->flags;
6095                             }
6096                         }
6097                     }
6098                 }
6099             }
6100 #endif
6101         }
6102
6103         else if (OP(scan) == OPEN) {
6104             if (stopparen != (I32)ARG(scan))
6105                 pars++;
6106         }
6107         else if (OP(scan) == CLOSE) {
6108             if (stopparen == (I32)ARG(scan)) {
6109                 break;
6110             }
6111             if ((I32)ARG(scan) == is_par) {
6112                 next = regnext(scan);
6113
6114                 if ( next && (OP(next) != WHILEM) && next < last)
6115                     is_par = 0;         /* Disable optimization */
6116             }
6117             if (data)
6118                 *(data->last_closep) = ARG(scan);
6119         }
6120         else if (OP(scan) == EVAL) {
6121                 if (data)
6122                     data->flags |= SF_HAS_EVAL;
6123         }
6124         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6125             if (flags & SCF_DO_SUBSTR) {
6126                 scan_commit(pRExC_state, data, minlenp, is_inf);
6127                 flags &= ~SCF_DO_SUBSTR;
6128             }
6129             if (data && OP(scan)==ACCEPT) {
6130                 data->flags |= SCF_SEEN_ACCEPT;
6131                 if (stopmin > min)
6132                     stopmin = min;
6133             }
6134         }
6135         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6136         {
6137                 if (flags & SCF_DO_SUBSTR) {
6138                     scan_commit(pRExC_state, data, minlenp, is_inf);
6139                     data->cur_is_floating = 1; /* float */
6140                 }
6141                 is_inf = is_inf_internal = 1;
6142                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6143                     ssc_anything(data->start_class);
6144                 flags &= ~SCF_DO_STCLASS;
6145         }
6146         else if (OP(scan) == GPOS) {
6147             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6148                 !(delta || is_inf || (data && data->pos_delta)))
6149             {
6150                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6151                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
6152                 if (RExC_rx->gofs < (STRLEN)min)
6153                     RExC_rx->gofs = min;
6154             } else {
6155                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6156                 RExC_rx->gofs = 0;
6157             }
6158         }
6159 #ifdef TRIE_STUDY_OPT
6160 #ifdef FULL_TRIE_STUDY
6161         else if (PL_regkind[OP(scan)] == TRIE) {
6162             /* NOTE - There is similar code to this block above for handling
6163                BRANCH nodes on the initial study.  If you change stuff here
6164                check there too. */
6165             regnode *trie_node= scan;
6166             regnode *tail= regnext(scan);
6167             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6168             SSize_t max1 = 0, min1 = SSize_t_MAX;
6169             regnode_ssc accum;
6170
6171             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6172                 /* Cannot merge strings after this. */
6173                 scan_commit(pRExC_state, data, minlenp, is_inf);
6174             }
6175             if (flags & SCF_DO_STCLASS)
6176                 ssc_init_zero(pRExC_state, &accum);
6177
6178             if (!trie->jump) {
6179                 min1= trie->minlen;
6180                 max1= trie->maxlen;
6181             } else {
6182                 const regnode *nextbranch= NULL;
6183                 U32 word;
6184
6185                 for ( word=1 ; word <= trie->wordcount ; word++)
6186                 {
6187                     SSize_t deltanext=0, minnext=0, f = 0, fake;
6188                     regnode_ssc this_class;
6189
6190                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6191                     if (data) {
6192                         data_fake.whilem_c = data->whilem_c;
6193                         data_fake.last_closep = data->last_closep;
6194                     }
6195                     else
6196                         data_fake.last_closep = &fake;
6197                     data_fake.pos_delta = delta;
6198                     if (flags & SCF_DO_STCLASS) {
6199                         ssc_init(pRExC_state, &this_class);
6200                         data_fake.start_class = &this_class;
6201                         f = SCF_DO_STCLASS_AND;
6202                     }
6203                     if (flags & SCF_WHILEM_VISITED_POS)
6204                         f |= SCF_WHILEM_VISITED_POS;
6205
6206                     if (trie->jump[word]) {
6207                         if (!nextbranch)
6208                             nextbranch = trie_node + trie->jump[0];
6209                         scan= trie_node + trie->jump[word];
6210                         /* We go from the jump point to the branch that follows
6211                            it. Note this means we need the vestigal unused
6212                            branches even though they arent otherwise used. */
6213                         /* optimise study_chunk() for TRIE */
6214                         minnext = study_chunk(pRExC_state, &scan, minlenp,
6215                             &deltanext, (regnode *)nextbranch, &data_fake,
6216                             stopparen, recursed_depth, NULL, f, depth+1);
6217                     }
6218                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6219                         nextbranch= regnext((regnode*)nextbranch);
6220
6221                     if (min1 > (SSize_t)(minnext + trie->minlen))
6222                         min1 = minnext + trie->minlen;
6223                     if (deltanext == SSize_t_MAX) {
6224                         is_inf = is_inf_internal = 1;
6225                         max1 = SSize_t_MAX;
6226                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6227                         max1 = minnext + deltanext + trie->maxlen;
6228
6229                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6230                         pars++;
6231                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
6232                         if ( stopmin > min + min1)
6233                             stopmin = min + min1;
6234                         flags &= ~SCF_DO_SUBSTR;
6235                         if (data)
6236                             data->flags |= SCF_SEEN_ACCEPT;
6237                     }
6238                     if (data) {
6239                         if (data_fake.flags & SF_HAS_EVAL)
6240                             data->flags |= SF_HAS_EVAL;
6241                         data->whilem_c = data_fake.whilem_c;
6242                     }
6243                     if (flags & SCF_DO_STCLASS)
6244                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6245                 }
6246             }
6247             if (flags & SCF_DO_SUBSTR) {
6248                 data->pos_min += min1;
6249                 data->pos_delta += max1 - min1;
6250                 if (max1 != min1 || is_inf)
6251                     data->cur_is_floating = 1; /* float */
6252             }
6253             min += min1;
6254             if (delta != SSize_t_MAX) {
6255                 if (SSize_t_MAX - (max1 - min1) >= delta)
6256                     delta += max1 - min1;
6257                 else
6258                     delta = SSize_t_MAX;
6259             }
6260             if (flags & SCF_DO_STCLASS_OR) {
6261                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6262                 if (min1) {
6263                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6264                     flags &= ~SCF_DO_STCLASS;
6265                 }
6266             }
6267             else if (flags & SCF_DO_STCLASS_AND) {
6268                 if (min1) {
6269                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6270                     flags &= ~SCF_DO_STCLASS;
6271                 }
6272                 else {
6273                     /* Switch to OR mode: cache the old value of
6274                      * data->start_class */
6275                     INIT_AND_WITHP;
6276                     StructCopy(data->start_class, and_withp, regnode_ssc);
6277                     flags &= ~SCF_DO_STCLASS_AND;
6278                     StructCopy(&accum, data->start_class, regnode_ssc);
6279                     flags |= SCF_DO_STCLASS_OR;
6280                 }
6281             }
6282             scan= tail;
6283             continue;
6284         }
6285 #else
6286         else if (PL_regkind[OP(scan)] == TRIE) {
6287             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6288             U8*bang=NULL;
6289
6290             min += trie->minlen;
6291             delta += (trie->maxlen - trie->minlen);
6292             flags &= ~SCF_DO_STCLASS; /* xxx */
6293             if (flags & SCF_DO_SUBSTR) {
6294                 /* Cannot expect anything... */
6295                 scan_commit(pRExC_state, data, minlenp, is_inf);
6296                 data->pos_min += trie->minlen;
6297                 data->pos_delta += (trie->maxlen - trie->minlen);
6298                 if (trie->maxlen != trie->minlen)
6299                     data->cur_is_floating = 1; /* float */
6300             }
6301             if (trie->jump) /* no more substrings -- for now /grr*/
6302                flags &= ~SCF_DO_SUBSTR;
6303         }
6304 #endif /* old or new */
6305 #endif /* TRIE_STUDY_OPT */
6306
6307         /* Else: zero-length, ignore. */
6308         scan = regnext(scan);
6309     }
6310
6311   finish:
6312     if (frame) {
6313         /* we need to unwind recursion. */
6314         depth = depth - 1;
6315
6316         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6317         DEBUG_PEEP("fend", scan, depth, flags);
6318
6319         /* restore previous context */
6320         last = frame->last_regnode;
6321         scan = frame->next_regnode;
6322         stopparen = frame->stopparen;
6323         recursed_depth = frame->prev_recursed_depth;
6324
6325         RExC_frame_last = frame->prev_frame;
6326         frame = frame->this_prev_frame;
6327         goto fake_study_recurse;
6328     }
6329
6330     assert(!frame);
6331     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6332
6333     *scanp = scan;
6334     *deltap = is_inf_internal ? SSize_t_MAX : delta;
6335
6336     if (flags & SCF_DO_SUBSTR && is_inf)
6337         data->pos_delta = SSize_t_MAX - data->pos_min;
6338     if (is_par > (I32)U8_MAX)
6339         is_par = 0;
6340     if (is_par && pars==1 && data) {
6341         data->flags |= SF_IN_PAR;
6342         data->flags &= ~SF_HAS_PAR;
6343     }
6344     else if (pars && data) {
6345         data->flags |= SF_HAS_PAR;
6346         data->flags &= ~SF_IN_PAR;
6347     }
6348     if (flags & SCF_DO_STCLASS_OR)
6349         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6350     if (flags & SCF_TRIE_RESTUDY)
6351         data->flags |=  SCF_TRIE_RESTUDY;
6352
6353     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6354
6355     {
6356         SSize_t final_minlen= min < stopmin ? min : stopmin;
6357
6358         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6359             if (final_minlen > SSize_t_MAX - delta)
6360                 RExC_maxlen = SSize_t_MAX;
6361             else if (RExC_maxlen < final_minlen + delta)
6362                 RExC_maxlen = final_minlen + delta;
6363         }
6364         return final_minlen;
6365     }
6366     NOT_REACHED; /* NOTREACHED */
6367 }
6368
6369 STATIC U32
6370 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6371 {
6372     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6373
6374     PERL_ARGS_ASSERT_ADD_DATA;
6375
6376     Renewc(RExC_rxi->data,
6377            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6378            char, struct reg_data);
6379     if(count)
6380         Renew(RExC_rxi->data->what, count + n, U8);
6381     else
6382         Newx(RExC_rxi->data->what, n, U8);
6383     RExC_rxi->data->count = count + n;
6384     Copy(s, RExC_rxi->data->what + count, n, U8);
6385     return count;
6386 }
6387
6388 /*XXX: todo make this not included in a non debugging perl, but appears to be
6389  * used anyway there, in 'use re' */
6390 #ifndef PERL_IN_XSUB_RE
6391 void
6392 Perl_reginitcolors(pTHX)
6393 {
6394     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6395     if (s) {
6396         char *t = savepv(s);
6397         int i = 0;
6398         PL_colors[0] = t;
6399         while (++i < 6) {
6400             t = strchr(t, '\t');
6401             if (t) {
6402                 *t = '\0';
6403                 PL_colors[i] = ++t;
6404             }
6405             else
6406                 PL_colors[i] = t = (char *)"";
6407         }
6408     } else {
6409         int i = 0;
6410         while (i < 6)
6411             PL_colors[i++] = (char *)"";
6412     }
6413     PL_colorset = 1;
6414 }
6415 #endif
6416
6417
6418 #ifdef TRIE_STUDY_OPT
6419 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6420     STMT_START {                                            \
6421         if (                                                \
6422               (data.flags & SCF_TRIE_RESTUDY)               \
6423               && ! restudied++                              \
6424         ) {                                                 \
6425             dOsomething;                                    \
6426             goto reStudy;                                   \
6427         }                                                   \
6428     } STMT_END
6429 #else
6430 #define CHECK_RESTUDY_GOTO_butfirst
6431 #endif
6432
6433 /*
6434  * pregcomp - compile a regular expression into internal code
6435  *
6436  * Decides which engine's compiler to call based on the hint currently in
6437  * scope
6438  */
6439
6440 #ifndef PERL_IN_XSUB_RE
6441
6442 /* return the currently in-scope regex engine (or the default if none)  */
6443
6444 regexp_engine const *
6445 Perl_current_re_engine(pTHX)
6446 {
6447     if (IN_PERL_COMPILETIME) {
6448         HV * const table = GvHV(PL_hintgv);
6449         SV **ptr;
6450
6451         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6452             return &PL_core_reg_engine;
6453         ptr = hv_fetchs(table, "regcomp", FALSE);
6454         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6455             return &PL_core_reg_engine;
6456         return INT2PTR(regexp_engine*, SvIV(*ptr));
6457     }
6458     else {
6459         SV *ptr;
6460         if (!PL_curcop->cop_hints_hash)
6461             return &PL_core_reg_engine;
6462         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6463         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6464             return &PL_core_reg_engine;
6465         return INT2PTR(regexp_engine*, SvIV(ptr));
6466     }
6467 }
6468
6469
6470 REGEXP *
6471 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6472 {
6473     regexp_engine const *eng = current_re_engine();
6474     GET_RE_DEBUG_FLAGS_DECL;
6475
6476     PERL_ARGS_ASSERT_PREGCOMP;
6477
6478     /* Dispatch a request to compile a regexp to correct regexp engine. */
6479     DEBUG_COMPILE_r({
6480         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6481                         PTR2UV(eng));
6482     });
6483     return CALLREGCOMP_ENG(eng, pattern, flags);
6484 }
6485 #endif
6486
6487 /* public(ish) entry point for the perl core's own regex compiling code.
6488  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6489  * pattern rather than a list of OPs, and uses the internal engine rather
6490  * than the current one */
6491
6492 REGEXP *
6493 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6494 {
6495     SV *pat = pattern; /* defeat constness! */
6496     PERL_ARGS_ASSERT_RE_COMPILE;
6497     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6498 #ifdef PERL_IN_XSUB_RE
6499                                 &my_reg_engine,
6500 #else
6501                                 &PL_core_reg_engine,
6502 #endif
6503                                 NULL, NULL, rx_flags, 0);
6504 }
6505
6506
6507 static void
6508 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6509 {
6510     int n;
6511
6512     if (--cbs->refcnt > 0)
6513         return;
6514     for (n = 0; n < cbs->count; n++) {
6515         REGEXP *rx = cbs->cb[n].src_regex;
6516         if (rx) {
6517             cbs->cb[n].src_regex = NULL;
6518             SvREFCNT_dec_NN(rx);
6519         }
6520     }
6521     Safefree(cbs->cb);
6522     Safefree(cbs);
6523 }
6524
6525
6526 static struct reg_code_blocks *
6527 S_alloc_code_blocks(pTHX_  int ncode)
6528 {
6529      struct reg_code_blocks *cbs;
6530     Newx(cbs, 1, struct reg_code_blocks);
6531     cbs->count = ncode;
6532     cbs->refcnt = 1;
6533     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6534     if (ncode)
6535         Newx(cbs->cb, ncode, struct reg_code_block);
6536     else
6537         cbs->cb = NULL;
6538     return cbs;
6539 }
6540
6541
6542 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6543  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6544  * point to the realloced string and length.
6545  *
6546  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6547  * stuff added */
6548
6549 static void
6550 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6551                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
6552 {
6553     U8 *const src = (U8*)*pat_p;
6554     U8 *dst, *d;
6555     int n=0;
6556     STRLEN s = 0;
6557     bool do_end = 0;
6558     GET_RE_DEBUG_FLAGS_DECL;
6559
6560     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6561         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6562
6563     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6564     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6565     d = dst;
6566
6567     while (s < *plen_p) {
6568         append_utf8_from_native_byte(src[s], &d);
6569
6570         if (n < num_code_blocks) {
6571             assert(pRExC_state->code_blocks);
6572             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6573                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6574                 assert(*(d - 1) == '(');
6575                 do_end = 1;
6576             }
6577             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6578                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6579                 assert(*(d - 1) == ')');
6580                 do_end = 0;
6581                 n++;
6582             }
6583         }
6584         s++;
6585     }
6586     *d = '\0';
6587     *plen_p = d - dst;
6588     *pat_p = (char*) dst;
6589     SAVEFREEPV(*pat_p);
6590     RExC_orig_utf8 = RExC_utf8 = 1;
6591 }
6592
6593
6594
6595 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6596  * while recording any code block indices, and handling overloading,
6597  * nested qr// objects etc.  If pat is null, it will allocate a new
6598  * string, or just return the first arg, if there's only one.
6599  *
6600  * Returns the malloced/updated pat.
6601  * patternp and pat_count is the array of SVs to be concatted;
6602  * oplist is the optional list of ops that generated the SVs;
6603  * recompile_p is a pointer to a boolean that will be set if
6604  *   the regex will need to be recompiled.
6605  * delim, if non-null is an SV that will be inserted between each element
6606  */
6607
6608 static SV*
6609 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6610                 SV *pat, SV ** const patternp, int pat_count,
6611                 OP *oplist, bool *recompile_p, SV *delim)
6612 {
6613     SV **svp;
6614     int n = 0;
6615     bool use_delim = FALSE;
6616     bool alloced = FALSE;
6617
6618     /* if we know we have at least two args, create an empty string,
6619      * then concatenate args to that. For no args, return an empty string */
6620     if (!pat && pat_count != 1) {
6621         pat = newSVpvs("");
6622         SAVEFREESV(pat);
6623         alloced = TRUE;
6624     }
6625
6626     for (svp = patternp; svp < patternp + pat_count; svp++) {
6627         SV *sv;
6628         SV *rx  = NULL;
6629         STRLEN orig_patlen = 0;
6630         bool code = 0;
6631         SV *msv = use_delim ? delim : *svp;
6632         if (!msv) msv = &PL_sv_undef;
6633
6634         /* if we've got a delimiter, we go round the loop twice for each
6635          * svp slot (except the last), using the delimiter the second
6636          * time round */
6637         if (use_delim) {
6638             svp--;
6639             use_delim = FALSE;
6640         }
6641         else if (delim)
6642             use_delim = TRUE;
6643
6644         if (SvTYPE(msv) == SVt_PVAV) {
6645             /* we've encountered an interpolated array within
6646              * the pattern, e.g. /...@a..../. Expand the list of elements,
6647              * then recursively append elements.
6648              * The code in this block is based on S_pushav() */
6649
6650             AV *const av = (AV*)msv;
6651             const SSize_t maxarg = AvFILL(av) + 1;
6652             SV **array;
6653
6654             if (oplist) {
6655                 assert(oplist->op_type == OP_PADAV
6656                     || oplist->op_type == OP_RV2AV);
6657                 oplist = OpSIBLING(oplist);
6658             }
6659
6660             if (SvRMAGICAL(av)) {
6661                 SSize_t i;
6662
6663                 Newx(array, maxarg, SV*);
6664                 SAVEFREEPV(array);
6665                 for (i=0; i < maxarg; i++) {
6666                     SV ** const svp = av_fetch(av, i, FALSE);
6667                     array[i] = svp ? *svp : &PL_sv_undef;
6668                 }
6669             }
6670             else
6671                 array = AvARRAY(av);
6672
6673             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6674                                 array, maxarg, NULL, recompile_p,
6675                                 /* $" */
6676                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6677
6678             continue;
6679         }
6680
6681
6682         /* we make the assumption here that each op in the list of
6683          * op_siblings maps to one SV pushed onto the stack,
6684          * except for code blocks, with have both an OP_NULL and
6685          * and OP_CONST.
6686          * This allows us to match up the list of SVs against the
6687          * list of OPs to find the next code block.
6688          *
6689          * Note that       PUSHMARK PADSV PADSV ..
6690          * is optimised to
6691          *                 PADRANGE PADSV  PADSV  ..
6692          * so the alignment still works. */
6693
6694         if (oplist) {
6695             if (oplist->op_type == OP_NULL
6696                 && (oplist->op_flags & OPf_SPECIAL))
6697             {
6698                 assert(n < pRExC_state->code_blocks->count);
6699                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6700                 pRExC_state->code_blocks->cb[n].block = oplist;
6701                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6702                 n++;
6703                 code = 1;
6704                 oplist = OpSIBLING(oplist); /* skip CONST */
6705                 assert(oplist);
6706             }
6707             oplist = OpSIBLING(oplist);;
6708         }
6709
6710         /* apply magic and QR overloading to arg */
6711
6712         SvGETMAGIC(msv);
6713         if (SvROK(msv) && SvAMAGIC(msv)) {
6714             SV *sv = AMG_CALLunary(msv, regexp_amg);
6715             if (sv) {
6716                 if (SvROK(sv))
6717                     sv = SvRV(sv);
6718                 if (SvTYPE(sv) != SVt_REGEXP)
6719                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6720                 msv = sv;
6721             }
6722         }
6723
6724         /* try concatenation overload ... */
6725         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6726                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6727         {
6728             sv_setsv(pat, sv);
6729             /* overloading involved: all bets are off over literal
6730              * code. Pretend we haven't seen it */
6731             if (n)
6732                 pRExC_state->code_blocks->count -= n;
6733             n = 0;
6734         }
6735         else  {
6736             /* ... or failing that, try "" overload */
6737             while (SvAMAGIC(msv)
6738                     && (sv = AMG_CALLunary(msv, string_amg))
6739                     && sv != msv
6740                     &&  !(   SvROK(msv)
6741                           && SvROK(sv)
6742                           && SvRV(msv) == SvRV(sv))
6743             ) {
6744                 msv = sv;
6745                 SvGETMAGIC(msv);
6746             }
6747             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6748                 msv = SvRV(msv);
6749
6750             if (pat) {
6751                 /* this is a partially unrolled
6752                  *     sv_catsv_nomg(pat, msv);
6753                  * that allows us to adjust code block indices if
6754                  * needed */
6755                 STRLEN dlen;
6756                 char *dst = SvPV_force_nomg(pat, dlen);
6757                 orig_patlen = dlen;
6758                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6759                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6760                     sv_setpvn(pat, dst, dlen);
6761                     SvUTF8_on(pat);
6762                 }
6763                 sv_catsv_nomg(pat, msv);
6764                 rx = msv;
6765             }
6766             else {
6767                 /* We have only one SV to process, but we need to verify
6768                  * it is properly null terminated or we will fail asserts
6769                  * later. In theory we probably shouldn't get such SV's,
6770                  * but if we do we should handle it gracefully. */
6771                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6772                     /* not a string, or a string with a trailing null */
6773                     pat = msv;
6774                 } else {
6775                     /* a string with no trailing null, we need to copy it
6776                      * so it has a trailing null */
6777                     pat = sv_2mortal(newSVsv(msv));
6778                 }
6779             }
6780
6781             if (code)
6782                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6783         }
6784
6785         /* extract any code blocks within any embedded qr//'s */
6786         if (rx && SvTYPE(rx) == SVt_REGEXP
6787             && RX_ENGINE((REGEXP*)rx)->op_comp)
6788         {
6789
6790             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6791             if (ri->code_blocks && ri->code_blocks->count) {
6792                 int i;
6793                 /* the presence of an embedded qr// with code means
6794                  * we should always recompile: the text of the
6795                  * qr// may not have changed, but it may be a
6796                  * different closure than last time */
6797                 *recompile_p = 1;
6798                 if (pRExC_state->code_blocks) {
6799                     int new_count = pRExC_state->code_blocks->count
6800                             + ri->code_blocks->count;
6801                     Renew(pRExC_state->code_blocks->cb,
6802                             new_count, struct reg_code_block);
6803                     pRExC_state->code_blocks->count = new_count;
6804                 }
6805                 else
6806                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6807                                                     ri->code_blocks->count);
6808
6809                 for (i=0; i < ri->code_blocks->count; i++) {
6810                     struct reg_code_block *src, *dst;
6811                     STRLEN offset =  orig_patlen
6812                         + ReANY((REGEXP *)rx)->pre_prefix;
6813                     assert(n < pRExC_state->code_blocks->count);
6814                     src = &ri->code_blocks->cb[i];
6815                     dst = &pRExC_state->code_blocks->cb[n];
6816                     dst->start      = src->start + offset;
6817                     dst->end        = src->end   + offset;
6818                     dst->block      = src->block;
6819                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6820                                             src->src_regex
6821                                                 ? src->src_regex
6822                                                 : (REGEXP*)rx);
6823                     n++;
6824                 }
6825             }
6826         }
6827     }
6828     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6829     if (alloced)
6830         SvSETMAGIC(pat);
6831
6832     return pat;
6833 }
6834
6835
6836
6837 /* see if there are any run-time code blocks in the pattern.
6838  * False positives are allowed */
6839
6840 static bool
6841 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6842                     char *pat, STRLEN plen)
6843 {
6844     int n = 0;
6845     STRLEN s;
6846
6847     PERL_UNUSED_CONTEXT;
6848
6849     for (s = 0; s < plen; s++) {
6850         if (   pRExC_state->code_blocks
6851             && n < pRExC_state->code_blocks->count
6852             && s == pRExC_state->code_blocks->cb[n].start)
6853         {
6854             s = pRExC_state->code_blocks->cb[n].end;
6855             n++;
6856             continue;
6857         }
6858         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6859          * positives here */
6860         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6861             (pat[s+2] == '{'
6862                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6863         )
6864             return 1;
6865     }
6866     return 0;
6867 }
6868
6869 /* Handle run-time code blocks. We will already have compiled any direct
6870  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6871  * copy of it, but with any literal code blocks blanked out and
6872  * appropriate chars escaped; then feed it into
6873  *
6874  *    eval "qr'modified_pattern'"
6875  *
6876  * For example,
6877  *
6878  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6879  *
6880  * becomes
6881  *
6882  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6883  *
6884  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6885  * and merge them with any code blocks of the original regexp.
6886  *
6887  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6888  * instead, just save the qr and return FALSE; this tells our caller that
6889  * the original pattern needs upgrading to utf8.
6890  */
6891
6892 static bool
6893 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6894     char *pat, STRLEN plen)
6895 {
6896     SV *qr;
6897
6898     GET_RE_DEBUG_FLAGS_DECL;
6899
6900     if (pRExC_state->runtime_code_qr) {
6901         /* this is the second time we've been called; this should
6902          * only happen if the main pattern got upgraded to utf8
6903          * during compilation; re-use the qr we compiled first time
6904          * round (which should be utf8 too)
6905          */
6906         qr = pRExC_state->runtime_code_qr;
6907         pRExC_state->runtime_code_qr = NULL;
6908         assert(RExC_utf8 && SvUTF8(qr));
6909     }
6910     else {
6911         int n = 0;
6912         STRLEN s;
6913         char *p, *newpat;
6914         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
6915         SV *sv, *qr_ref;
6916         dSP;
6917
6918         /* determine how many extra chars we need for ' and \ escaping */
6919         for (s = 0; s < plen; s++) {
6920             if (pat[s] == '\'' || pat[s] == '\\')
6921                 newlen++;
6922         }
6923
6924         Newx(newpat, newlen, char);
6925         p = newpat;
6926         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6927
6928         for (s = 0; s < plen; s++) {
6929             if (   pRExC_state->code_blocks
6930                 && n < pRExC_state->code_blocks->count
6931                 && s == pRExC_state->code_blocks->cb[n].start)
6932             {
6933                 /* blank out literal code block so that they aren't
6934                  * recompiled: eg change from/to:
6935                  *     /(?{xyz})/
6936                  *     /(?=====)/
6937                  * and
6938                  *     /(??{xyz})/
6939                  *     /(?======)/
6940                  * and
6941                  *     /(?(?{xyz}))/
6942                  *     /(?(?=====))/
6943                 */
6944                 assert(pat[s]   == '(');
6945                 assert(pat[s+1] == '?');
6946                 *p++ = '(';
6947                 *p++ = '?';
6948                 s += 2;
6949                 while (s < pRExC_state->code_blocks->cb[n].end) {
6950                     *p++ = '=';
6951                     s++;
6952                 }
6953                 *p++ = ')';
6954                 n++;
6955                 continue;
6956             }
6957             if (pat[s] == '\'' || pat[s] == '\\')
6958                 *p++ = '\\';
6959             *p++ = pat[s];
6960         }
6961         *p++ = '\'';
6962         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
6963             *p++ = 'x';
6964             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
6965                 *p++ = 'x';
6966             }
6967         }
6968         *p++ = '\0';
6969         DEBUG_COMPILE_r({
6970             Perl_re_printf( aTHX_
6971                 "%sre-parsing pattern for runtime code:%s %s\n",
6972                 PL_colors[4], PL_colors[5], newpat);
6973         });
6974
6975         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6976         Safefree(newpat);
6977
6978         ENTER;
6979         SAVETMPS;
6980         save_re_context();
6981         PUSHSTACKi(PERLSI_REQUIRE);
6982         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6983          * parsing qr''; normally only q'' does this. It also alters
6984          * hints handling */
6985         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6986         SvREFCNT_dec_NN(sv);
6987         SPAGAIN;
6988         qr_ref = POPs;
6989         PUTBACK;
6990         {
6991             SV * const errsv = ERRSV;
6992             if (SvTRUE_NN(errsv))
6993                 /* use croak_sv ? */
6994                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
6995         }
6996         assert(SvROK(qr_ref));
6997         qr = SvRV(qr_ref);
6998         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6999         /* the leaving below frees the tmp qr_ref.
7000          * Give qr a life of its own */
7001         SvREFCNT_inc(qr);
7002         POPSTACK;
7003         FREETMPS;
7004         LEAVE;
7005
7006     }
7007
7008     if (!RExC_utf8 && SvUTF8(qr)) {
7009         /* first time through; the pattern got upgraded; save the
7010          * qr for the next time through */
7011         assert(!pRExC_state->runtime_code_qr);
7012         pRExC_state->runtime_code_qr = qr;
7013         return 0;
7014     }
7015
7016
7017     /* extract any code blocks within the returned qr//  */
7018
7019
7020     /* merge the main (r1) and run-time (r2) code blocks into one */
7021     {
7022         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7023         struct reg_code_block *new_block, *dst;
7024         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7025         int i1 = 0, i2 = 0;
7026         int r1c, r2c;
7027
7028         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7029         {
7030             SvREFCNT_dec_NN(qr);
7031             return 1;
7032         }
7033
7034         if (!r1->code_blocks)
7035             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7036
7037         r1c = r1->code_blocks->count;
7038         r2c = r2->code_blocks->count;
7039
7040         Newx(new_block, r1c + r2c, struct reg_code_block);
7041
7042         dst = new_block;
7043
7044         while (i1 < r1c || i2 < r2c) {
7045             struct reg_code_block *src;
7046             bool is_qr = 0;
7047
7048             if (i1 == r1c) {
7049                 src = &r2->code_blocks->cb[i2++];
7050                 is_qr = 1;
7051             }
7052             else if (i2 == r2c)
7053                 src = &r1->code_blocks->cb[i1++];
7054             else if (  r1->code_blocks->cb[i1].start
7055                      < r2->code_blocks->cb[i2].start)
7056             {
7057                 src = &r1->code_blocks->cb[i1++];
7058                 assert(src->end < r2->code_blocks->cb[i2].start);
7059             }
7060             else {
7061                 assert(  r1->code_blocks->cb[i1].start
7062                        > r2->code_blocks->cb[i2].start);
7063                 src = &r2->code_blocks->cb[i2++];
7064                 is_qr = 1;
7065                 assert(src->end < r1->code_blocks->cb[i1].start);
7066             }
7067
7068             assert(pat[src->start] == '(');
7069             assert(pat[src->end]   == ')');
7070             dst->start      = src->start;
7071             dst->end        = src->end;
7072             dst->block      = src->block;
7073             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7074                                     : src->src_regex;
7075             dst++;
7076         }
7077         r1->code_blocks->count += r2c;
7078         Safefree(r1->code_blocks->cb);
7079         r1->code_blocks->cb = new_block;
7080     }
7081
7082     SvREFCNT_dec_NN(qr);
7083     return 1;
7084 }
7085
7086
7087 STATIC bool
7088 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7089                       struct reg_substr_datum  *rsd,
7090                       struct scan_data_substrs *sub,
7091                       STRLEN longest_length)
7092 {
7093     /* This is the common code for setting up the floating and fixed length
7094      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
7095      * as to whether succeeded or not */
7096
7097     I32 t;
7098     SSize_t ml;
7099     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
7100     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7101
7102     if (! (longest_length
7103            || (eol /* Can't have SEOL and MULTI */
7104                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7105           )
7106             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7107         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7108     {
7109         return FALSE;
7110     }
7111
7112     /* copy the information about the longest from the reg_scan_data
7113         over to the program. */
7114     if (SvUTF8(sub->str)) {
7115         rsd->substr      = NULL;
7116         rsd->utf8_substr = sub->str;
7117     } else {
7118         rsd->substr      = sub->str;
7119         rsd->utf8_substr = NULL;
7120     }
7121     /* end_shift is how many chars that must be matched that
7122         follow this item. We calculate it ahead of time as once the
7123         lookbehind offset is added in we lose the ability to correctly
7124         calculate it.*/
7125     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7126     rsd->end_shift = ml - sub->min_offset
7127         - longest_length
7128             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7129              * intead? - DAPM
7130             + (SvTAIL(sub->str) != 0)
7131             */
7132         + sub->lookbehind;
7133
7134     t = (eol/* Can't have SEOL and MULTI */
7135          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7136     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7137
7138     return TRUE;
7139 }
7140
7141 STATIC void
7142 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7143 {
7144     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7145      * properly wrapped with the right modifiers */
7146
7147     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7148     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7149                                                 != REGEX_DEPENDS_CHARSET);
7150
7151     /* The caret is output if there are any defaults: if not all the STD
7152         * flags are set, or if no character set specifier is needed */
7153     bool has_default =
7154                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7155                 || ! has_charset);
7156     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7157                                                 == REG_RUN_ON_COMMENT_SEEN);
7158     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7159                         >> RXf_PMf_STD_PMMOD_SHIFT);
7160     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7161     char *p;
7162     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7163
7164     /* We output all the necessary flags; we never output a minus, as all
7165         * those are defaults, so are
7166         * covered by the caret */
7167     const STRLEN wraplen = pat_len + has_p + has_runon
7168         + has_default       /* If needs a caret */
7169         + PL_bitcount[reganch] /* 1 char for each set standard flag */
7170
7171             /* If needs a character set specifier */
7172         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7173         + (sizeof("(?:)") - 1);
7174
7175     PERL_ARGS_ASSERT_SET_REGEX_PV;
7176
7177     /* make sure PL_bitcount bounds not exceeded */
7178     assert(sizeof(STD_PAT_MODS) <= 8);
7179
7180     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7181     SvPOK_on(Rx);
7182     if (RExC_utf8)
7183         SvFLAGS(Rx) |= SVf_UTF8;
7184     *p++='('; *p++='?';
7185
7186     /* If a default, cover it using the caret */
7187     if (has_default) {
7188         *p++= DEFAULT_PAT_MOD;
7189     }
7190     if (has_charset) {
7191         STRLEN len;
7192         const char* name;
7193
7194         name = get_regex_charset_name(RExC_rx->extflags, &len);
7195         if strEQ(name, DEPENDS_PAT_MODS) {  /* /d under UTF-8 => /u */
7196             assert(RExC_utf8);
7197             name = UNICODE_PAT_MODS;
7198             len = sizeof(UNICODE_PAT_MODS) - 1;
7199         }
7200         Copy(name, p, len, char);
7201         p += len;
7202     }
7203     if (has_p)
7204         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7205     {
7206         char ch;
7207         while((ch = *fptr++)) {
7208             if(reganch & 1)
7209                 *p++ = ch;
7210             reganch >>= 1;
7211         }
7212     }
7213
7214     *p++ = ':';
7215     Copy(RExC_precomp, p, pat_len, char);
7216     assert ((RX_WRAPPED(Rx) - p) < 16);
7217     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7218     p += pat_len;
7219
7220     /* Adding a trailing \n causes this to compile properly:
7221             my $R = qr / A B C # D E/x; /($R)/
7222         Otherwise the parens are considered part of the comment */
7223     if (has_runon)
7224         *p++ = '\n';
7225     *p++ = ')';
7226     *p = 0;
7227     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7228 }
7229
7230 /*
7231  * Perl_re_op_compile - the perl internal RE engine's function to compile a
7232  * regular expression into internal code.
7233  * The pattern may be passed either as:
7234  *    a list of SVs (patternp plus pat_count)
7235  *    a list of OPs (expr)
7236  * If both are passed, the SV list is used, but the OP list indicates
7237  * which SVs are actually pre-compiled code blocks
7238  *
7239  * The SVs in the list have magic and qr overloading applied to them (and
7240  * the list may be modified in-place with replacement SVs in the latter
7241  * case).
7242  *
7243  * If the pattern hasn't changed from old_re, then old_re will be
7244  * returned.
7245  *
7246  * eng is the current engine. If that engine has an op_comp method, then
7247  * handle directly (i.e. we assume that op_comp was us); otherwise, just
7248  * do the initial concatenation of arguments and pass on to the external
7249  * engine.
7250  *
7251  * If is_bare_re is not null, set it to a boolean indicating whether the
7252  * arg list reduced (after overloading) to a single bare regex which has
7253  * been returned (i.e. /$qr/).
7254  *
7255  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7256  *
7257  * pm_flags contains the PMf_* flags, typically based on those from the
7258  * pm_flags field of the related PMOP. Currently we're only interested in
7259  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
7260  *
7261  * For many years this code had an initial sizing pass that calculated
7262  * (sometimes incorrectly, leading to security holes) the size needed for the
7263  * compiled pattern.  That was changed by commit
7264  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7265  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
7266  * references to this sizing pass.
7267  *
7268  * Now, an initial crude guess as to the size needed is made, based on the
7269  * length of the pattern.  Patches welcome to improve that guess.  That amount
7270  * of space is malloc'd and then immediately freed, and then clawed back node
7271  * by node.  This design is to minimze, to the extent possible, memory churn
7272  * when doing the the reallocs.
7273  *
7274  * A separate parentheses counting pass may be needed in some cases.
7275  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
7276  * of these cases.
7277  *
7278  * The existence of a sizing pass necessitated design decisions that are no
7279  * longer needed.  There are potential areas of simplification.
7280  *
7281  * Beware that the optimization-preparation code in here knows about some
7282  * of the structure of the compiled regexp.  [I'll say.]
7283  */
7284
7285 REGEXP *
7286 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7287                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
7288                      bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7289 {
7290     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
7291     STRLEN plen;
7292     char *exp;
7293     regnode *scan;
7294     I32 flags;
7295     SSize_t minlen = 0;
7296     U32 rx_flags;
7297     SV *pat;
7298     SV** new_patternp = patternp;
7299
7300     /* these are all flags - maybe they should be turned
7301      * into a single int with different bit masks */
7302     I32 sawlookahead = 0;
7303     I32 sawplus = 0;
7304     I32 sawopen = 0;
7305     I32 sawminmod = 0;
7306
7307     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7308     bool recompile = 0;
7309     bool runtime_code = 0;
7310     scan_data_t data;
7311     RExC_state_t RExC_state;
7312     RExC_state_t * const pRExC_state = &RExC_state;
7313 #ifdef TRIE_STUDY_OPT
7314     int restudied = 0;
7315     RExC_state_t copyRExC_state;
7316 #endif
7317     GET_RE_DEBUG_FLAGS_DECL;
7318
7319     PERL_ARGS_ASSERT_RE_OP_COMPILE;
7320
7321     DEBUG_r(if (!PL_colorset) reginitcolors());
7322
7323     /* Initialize these here instead of as-needed, as is quick and avoids
7324      * having to test them each time otherwise */
7325     if (! PL_InBitmap) {
7326 #ifdef DEBUGGING
7327         char * dump_len_string;
7328 #endif
7329
7330         /* This is calculated here, because the Perl program that generates the
7331          * static global ones doesn't currently have access to
7332          * NUM_ANYOF_CODE_POINTS */
7333         PL_InBitmap = _new_invlist(2);
7334         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
7335                                                     NUM_ANYOF_CODE_POINTS - 1);
7336 #ifdef DEBUGGING
7337         dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
7338         if (   ! dump_len_string
7339             || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
7340         {
7341             PL_dump_re_max_len = 60;    /* A reasonable default */
7342         }
7343 #endif
7344     }
7345
7346     pRExC_state->warn_text = NULL;
7347     pRExC_state->code_blocks = NULL;
7348
7349     if (is_bare_re)
7350         *is_bare_re = FALSE;
7351
7352     if (expr && (expr->op_type == OP_LIST ||
7353                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7354         /* allocate code_blocks if needed */
7355         OP *o;
7356         int ncode = 0;
7357
7358         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7359             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7360                 ncode++; /* count of DO blocks */
7361
7362         if (ncode)
7363             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7364     }
7365
7366     if (!pat_count) {
7367         /* compile-time pattern with just OP_CONSTs and DO blocks */
7368
7369         int n;
7370         OP *o;
7371
7372         /* find how many CONSTs there are */
7373         assert(expr);
7374         n = 0;
7375         if (expr->op_type == OP_CONST)
7376             n = 1;
7377         else
7378             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7379                 if (o->op_type == OP_CONST)
7380                     n++;
7381             }
7382
7383         /* fake up an SV array */
7384
7385         assert(!new_patternp);
7386         Newx(new_patternp, n, SV*);
7387         SAVEFREEPV(new_patternp);
7388         pat_count = n;
7389
7390         n = 0;
7391         if (expr->op_type == OP_CONST)
7392             new_patternp[n] = cSVOPx_sv(expr);
7393         else
7394             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7395                 if (o->op_type == OP_CONST)
7396                     new_patternp[n++] = cSVOPo_sv;
7397             }
7398
7399     }
7400
7401     DEBUG_PARSE_r(Perl_re_printf( aTHX_
7402         "Assembling pattern from %d elements%s\n", pat_count,
7403             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7404
7405     /* set expr to the first arg op */
7406
7407     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7408          && expr->op_type != OP_CONST)
7409     {
7410             expr = cLISTOPx(expr)->op_first;
7411             assert(   expr->op_type == OP_PUSHMARK
7412                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7413                    || expr->op_type == OP_PADRANGE);
7414             expr = OpSIBLING(expr);
7415     }
7416
7417     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7418                         expr, &recompile, NULL);
7419
7420     /* handle bare (possibly after overloading) regex: foo =~ $re */
7421     {
7422         SV *re = pat;
7423         if (SvROK(re))
7424             re = SvRV(re);
7425         if (SvTYPE(re) == SVt_REGEXP) {
7426             if (is_bare_re)
7427                 *is_bare_re = TRUE;
7428             SvREFCNT_inc(re);
7429             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7430                 "Precompiled pattern%s\n",
7431                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7432
7433             return (REGEXP*)re;
7434         }
7435     }
7436
7437     exp = SvPV_nomg(pat, plen);
7438
7439     if (!eng->op_comp) {
7440         if ((SvUTF8(pat) && IN_BYTES)
7441                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7442         {
7443             /* make a temporary copy; either to convert to bytes,
7444              * or to avoid repeating get-magic / overloaded stringify */
7445             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7446                                         (IN_BYTES ? 0 : SvUTF8(pat)));
7447         }
7448         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7449     }
7450
7451     /* ignore the utf8ness if the pattern is 0 length */
7452     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7453     RExC_uni_semantics = 0;
7454     RExC_contains_locale = 0;
7455     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7456     RExC_in_script_run = 0;
7457     RExC_study_started = 0;
7458     pRExC_state->runtime_code_qr = NULL;
7459     RExC_frame_head= NULL;
7460     RExC_frame_last= NULL;
7461     RExC_frame_count= 0;
7462     RExC_latest_warn_offset = 0;
7463     RExC_use_BRANCHJ = 0;
7464     RExC_total_parens = 0;
7465     RExC_open_parens = NULL;
7466     RExC_close_parens = NULL;
7467     RExC_paren_names = NULL;
7468     RExC_size = 0;
7469     RExC_seen_d_op = FALSE;
7470 #ifdef DEBUGGING
7471     RExC_paren_name_list = NULL;
7472 #endif
7473
7474     DEBUG_r({
7475         RExC_mysv1= sv_newmortal();
7476         RExC_mysv2= sv_newmortal();
7477     });
7478
7479     DEBUG_COMPILE_r({
7480             SV *dsv= sv_newmortal();
7481             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7482             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7483                           PL_colors[4], PL_colors[5], s);
7484         });
7485
7486     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7487      * to utf8 */
7488
7489     if ((pm_flags & PMf_USE_RE_EVAL)
7490                 /* this second condition covers the non-regex literal case,
7491                  * i.e.  $foo =~ '(?{})'. */
7492                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7493     )
7494         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7495
7496   redo_parse:
7497     /* return old regex if pattern hasn't changed */
7498     /* XXX: note in the below we have to check the flags as well as the
7499      * pattern.
7500      *
7501      * Things get a touch tricky as we have to compare the utf8 flag
7502      * independently from the compile flags.  */
7503
7504     if (   old_re
7505         && !recompile
7506         && !!RX_UTF8(old_re) == !!RExC_utf8
7507         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7508         && RX_PRECOMP(old_re)
7509         && RX_PRELEN(old_re) == plen
7510         && memEQ(RX_PRECOMP(old_re), exp, plen)
7511         && !runtime_code /* with runtime code, always recompile */ )
7512     {
7513         return old_re;
7514     }
7515
7516     /* Allocate the pattern's SV */
7517     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7518     RExC_rx = ReANY(Rx);
7519     if ( RExC_rx == NULL )
7520         FAIL("Regexp out of space");
7521
7522     rx_flags = orig_rx_flags;
7523
7524     if (   (UTF || RExC_uni_semantics)
7525         && initial_charset == REGEX_DEPENDS_CHARSET)
7526     {
7527
7528         /* Set to use unicode semantics if the pattern is in utf8 and has the
7529          * 'depends' charset specified, as it means unicode when utf8  */
7530         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7531         RExC_uni_semantics = 1;
7532     }
7533
7534     RExC_pm_flags = pm_flags;
7535
7536     if (runtime_code) {
7537         assert(TAINTING_get || !TAINT_get);
7538         if (TAINT_get)
7539             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7540
7541         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7542             /* whoops, we have a non-utf8 pattern, whilst run-time code
7543              * got compiled as utf8. Try again with a utf8 pattern */
7544             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7545                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7546             goto redo_parse;
7547         }
7548     }
7549     assert(!pRExC_state->runtime_code_qr);
7550
7551     RExC_sawback = 0;
7552
7553     RExC_seen = 0;
7554     RExC_maxlen = 0;
7555     RExC_in_lookbehind = 0;
7556     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7557 #ifdef EBCDIC
7558     RExC_recode_x_to_native = 0;
7559 #endif
7560     RExC_in_multi_char_class = 0;
7561
7562     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7563     RExC_precomp_end = RExC_end = exp + plen;
7564     RExC_nestroot = 0;
7565     RExC_whilem_seen = 0;
7566     RExC_end_op = NULL;
7567     RExC_recurse = NULL;
7568     RExC_study_chunk_recursed = NULL;
7569     RExC_study_chunk_recursed_bytes= 0;
7570     RExC_recurse_count = 0;
7571     pRExC_state->code_index = 0;
7572
7573     /* Initialize the string in the compiled pattern.  This is so that there is
7574      * something to output if necessary */
7575     set_regex_pv(pRExC_state, Rx);
7576
7577     DEBUG_PARSE_r({
7578         Perl_re_printf( aTHX_
7579             "Starting parse and generation\n");
7580         RExC_lastnum=0;
7581         RExC_lastparse=NULL;
7582     });
7583
7584     /* Allocate space and zero-initialize. Note, the two step process
7585        of zeroing when in debug mode, thus anything assigned has to
7586        happen after that */
7587     if (!  RExC_size) {
7588
7589         /* On the first pass of the parse, we guess how big this will be.  Then
7590          * we grow in one operation to that amount and then give it back.  As
7591          * we go along, we re-allocate what we need.
7592          *
7593          * XXX Currently the guess is essentially that the pattern will be an
7594          * EXACT node with one byte input, one byte output.  This is crude, and
7595          * better heuristics are welcome.
7596          *
7597          * On any subsequent passes, we guess what we actually computed in the
7598          * latest earlier pass.  Such a pass probably didn't complete so is
7599          * missing stuff.  We could improve those guesses by knowing where the
7600          * parse stopped, and use the length so far plus apply the above
7601          * assumption to what's left. */
7602         RExC_size = STR_SZ(RExC_end - RExC_start);
7603     }
7604
7605     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7606     if ( RExC_rxi == NULL )
7607         FAIL("Regexp out of space");
7608
7609     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7610     RXi_SET( RExC_rx, RExC_rxi );
7611
7612     /* We start from 0 (over from 0 in the case this is a reparse.  The first
7613      * node parsed will give back any excess memory we have allocated so far).
7614      * */
7615     RExC_size = 0;
7616
7617     /* non-zero initialization begins here */
7618     RExC_rx->engine= eng;
7619     RExC_rx->extflags = rx_flags;
7620     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7621
7622     if (pm_flags & PMf_IS_QR) {
7623         RExC_rxi->code_blocks = pRExC_state->code_blocks;
7624         if (RExC_rxi->code_blocks) {
7625             RExC_rxi->code_blocks->refcnt++;
7626         }
7627     }
7628
7629     RExC_rx->intflags = 0;
7630
7631     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7632     RExC_parse = exp;
7633
7634     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7635      * code makes sure the final byte is an uncounted NUL.  But should this
7636      * ever not be the case, lots of things could read beyond the end of the
7637      * buffer: loops like
7638      *      while(isFOO(*RExC_parse)) RExC_parse++;
7639      *      strchr(RExC_parse, "foo");
7640      * etc.  So it is worth noting. */
7641     assert(*RExC_end == '\0');
7642
7643     RExC_naughty = 0;
7644     RExC_npar = 1;
7645     RExC_emit_start = RExC_rxi->program;
7646     pRExC_state->code_index = 0;
7647
7648     *((char*) RExC_emit_start) = (char) REG_MAGIC;
7649     RExC_emit = 1;
7650
7651     /* Do the parse */
7652     if (reg(pRExC_state, 0, &flags, 1)) {
7653
7654         /* Success!, But if RExC_total_parens < 0, we need to redo the parse
7655          * knowing how many parens there actually are */
7656         if (RExC_total_parens < 0) {
7657             flags |= RESTART_PARSE;
7658         }
7659
7660         /* We have that number in RExC_npar */
7661         RExC_total_parens = RExC_npar;
7662     }
7663     else if (! MUST_RESTART(flags)) {
7664         ReREFCNT_dec(Rx);
7665         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7666     }
7667
7668     /* Here, we either have success, or we have to redo the parse for some reason */
7669     if (MUST_RESTART(flags)) {
7670
7671         /* It's possible to write a regexp in ascii that represents Unicode
7672         codepoints outside of the byte range, such as via \x{100}. If we
7673         detect such a sequence we have to convert the entire pattern to utf8
7674         and then recompile, as our sizing calculation will have been based
7675         on 1 byte == 1 character, but we will need to use utf8 to encode
7676         at least some part of the pattern, and therefore must convert the whole
7677         thing.
7678         -- dmq */
7679         if (flags & NEED_UTF8) {
7680
7681             /* We have stored the offset of the final warning output so far.
7682              * That must be adjusted.  Any variant characters between the start
7683              * of the pattern and this warning count for 2 bytes in the final,
7684              * so just add them again */
7685             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7686                 RExC_latest_warn_offset +=
7687                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
7688                                                 + RExC_latest_warn_offset);
7689             }
7690             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7691             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7692             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7693         }
7694         else {
7695             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7696         }
7697
7698         if (RExC_total_parens > 0) {
7699             /* Make enough room for all the known parens, and zero it */
7700             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7701             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7702             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
7703
7704             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7705             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7706         }
7707         else { /* Parse did not complete.  Reinitialize the parentheses
7708                   structures */
7709             RExC_total_parens = 0;
7710             if (RExC_open_parens) {
7711                 Safefree(RExC_open_parens);
7712                 RExC_open_parens = NULL;
7713             }
7714             if (RExC_close_parens) {
7715                 Safefree(RExC_close_parens);
7716                 RExC_close_parens = NULL;
7717             }
7718         }
7719
7720         /* Clean up what we did in this parse */
7721         SvREFCNT_dec_NN(RExC_rx_sv);
7722
7723         goto redo_parse;
7724     }
7725
7726     /* Here, we have successfully parsed and generated the pattern's program
7727      * for the regex engine.  We are ready to finish things up and look for
7728      * optimizations. */
7729
7730     /* Update the string to compile, with correct modifiers, etc */
7731     set_regex_pv(pRExC_state, Rx);
7732
7733     RExC_rx->nparens = RExC_total_parens - 1;
7734
7735     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7736     if (RExC_whilem_seen > 15)
7737         RExC_whilem_seen = 15;
7738
7739     DEBUG_PARSE_r({
7740         Perl_re_printf( aTHX_
7741             "Required size %" IVdf " nodes\n", (IV)RExC_size);
7742         RExC_lastnum=0;
7743         RExC_lastparse=NULL;
7744     });
7745
7746 #ifdef RE_TRACK_PATTERN_OFFSETS
7747     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7748                           "%s %" UVuf " bytes for offset annotations.\n",
7749                           RExC_offsets ? "Got" : "Couldn't get",
7750                           (UV)((RExC_offsets[0] * 2 + 1))));
7751     DEBUG_OFFSETS_r(if (RExC_offsets) {
7752         const STRLEN len = RExC_offsets[0];
7753         STRLEN i;
7754         GET_RE_DEBUG_FLAGS_DECL;
7755         Perl_re_printf( aTHX_
7756                       "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7757         for (i = 1; i <= len; i++) {
7758             if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7759                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7760                 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7761         }
7762         Perl_re_printf( aTHX_  "\n");
7763     });
7764
7765 #else
7766     SetProgLen(RExC_rxi,RExC_size);
7767 #endif
7768
7769     DEBUG_OPTIMISE_r(
7770         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7771     );
7772
7773     /* XXXX To minimize changes to RE engine we always allocate
7774        3-units-long substrs field. */
7775     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7776     if (RExC_recurse_count) {
7777         Newx(RExC_recurse, RExC_recurse_count, regnode *);
7778         SAVEFREEPV(RExC_recurse);
7779     }
7780
7781     if (RExC_seen & REG_RECURSE_SEEN) {
7782         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
7783          * So its 1 if there are no parens. */
7784         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
7785                                          ((RExC_total_parens & 0x07) != 0);
7786         Newx(RExC_study_chunk_recursed,
7787              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7788         SAVEFREEPV(RExC_study_chunk_recursed);
7789     }
7790
7791   reStudy:
7792     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7793     DEBUG_r(
7794         RExC_study_chunk_recursed_count= 0;
7795     );
7796     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
7797     if (RExC_study_chunk_recursed) {
7798         Zero(RExC_study_chunk_recursed,
7799              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7800     }
7801
7802
7803 #ifdef TRIE_STUDY_OPT
7804     if (!restudied) {
7805         StructCopy(&zero_scan_data, &data, scan_data_t);
7806         copyRExC_state = RExC_state;
7807     } else {
7808         U32 seen=RExC_seen;
7809         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7810
7811         RExC_state = copyRExC_state;
7812         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7813             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7814         else
7815             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7816         StructCopy(&zero_scan_data, &data, scan_data_t);
7817     }
7818 #else
7819     StructCopy(&zero_scan_data, &data, scan_data_t);
7820 #endif
7821
7822     /* Dig out information for optimizations. */
7823     RExC_rx->extflags = RExC_flags; /* was pm_op */
7824     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7825
7826     if (UTF)
7827         SvUTF8_on(Rx);  /* Unicode in it? */
7828     RExC_rxi->regstclass = NULL;
7829     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
7830         RExC_rx->intflags |= PREGf_NAUGHTY;
7831     scan = RExC_rxi->program + 1;               /* First BRANCH. */
7832
7833     /* testing for BRANCH here tells us whether there is "must appear"
7834        data in the pattern. If there is then we can use it for optimisations */
7835     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7836                                                   */
7837         SSize_t fake;
7838         STRLEN longest_length[2];
7839         regnode_ssc ch_class; /* pointed to by data */
7840         int stclass_flag;
7841         SSize_t last_close = 0; /* pointed to by data */
7842         regnode *first= scan;
7843         regnode *first_next= regnext(first);
7844         int i;
7845
7846         /*
7847          * Skip introductions and multiplicators >= 1
7848          * so that we can extract the 'meat' of the pattern that must
7849          * match in the large if() sequence following.
7850          * NOTE that EXACT is NOT covered here, as it is normally
7851          * picked up by the optimiser separately.
7852          *
7853          * This is unfortunate as the optimiser isnt handling lookahead
7854          * properly currently.
7855          *
7856          */
7857         while ((OP(first) == OPEN && (sawopen = 1)) ||
7858                /* An OR of *one* alternative - should not happen now. */
7859             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7860             /* for now we can't handle lookbehind IFMATCH*/
7861             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7862             (OP(first) == PLUS) ||
7863             (OP(first) == MINMOD) ||
7864                /* An {n,m} with n>0 */
7865             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7866             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7867         {
7868                 /*
7869                  * the only op that could be a regnode is PLUS, all the rest
7870                  * will be regnode_1 or regnode_2.
7871                  *
7872                  * (yves doesn't think this is true)
7873                  */
7874                 if (OP(first) == PLUS)
7875                     sawplus = 1;
7876                 else {
7877                     if (OP(first) == MINMOD)
7878                         sawminmod = 1;
7879                     first += regarglen[OP(first)];
7880                 }
7881                 first = NEXTOPER(first);
7882                 first_next= regnext(first);
7883         }
7884
7885         /* Starting-point info. */
7886       again:
7887         DEBUG_PEEP("first:", first, 0, 0);
7888         /* Ignore EXACT as we deal with it later. */
7889         if (PL_regkind[OP(first)] == EXACT) {
7890             if (   OP(first) == EXACT
7891                 || OP(first) == EXACT_ONLY8
7892                 || OP(first) == EXACTL)
7893             {
7894                 NOOP;   /* Empty, get anchored substr later. */
7895             }
7896             else
7897                 RExC_rxi->regstclass = first;
7898         }
7899 #ifdef TRIE_STCLASS
7900         else if (PL_regkind[OP(first)] == TRIE &&
7901                 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
7902         {
7903             /* this can happen only on restudy */
7904             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7905         }
7906 #endif
7907         else if (REGNODE_SIMPLE(OP(first)))
7908             RExC_rxi->regstclass = first;
7909         else if (PL_regkind[OP(first)] == BOUND ||
7910                  PL_regkind[OP(first)] == NBOUND)
7911             RExC_rxi->regstclass = first;
7912         else if (PL_regkind[OP(first)] == BOL) {
7913             RExC_rx->intflags |= (OP(first) == MBOL
7914                            ? PREGf_ANCH_MBOL
7915                            : PREGf_ANCH_SBOL);
7916             first = NEXTOPER(first);
7917             goto again;
7918         }
7919         else if (OP(first) == GPOS) {
7920             RExC_rx->intflags |= PREGf_ANCH_GPOS;
7921             first = NEXTOPER(first);
7922             goto again;
7923         }
7924         else if ((!sawopen || !RExC_sawback) &&
7925             !sawlookahead &&
7926             (OP(first) == STAR &&
7927             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7928             !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
7929         {
7930             /* turn .* into ^.* with an implied $*=1 */
7931             const int type =
7932                 (OP(NEXTOPER(first)) == REG_ANY)
7933                     ? PREGf_ANCH_MBOL
7934                     : PREGf_ANCH_SBOL;
7935             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
7936             first = NEXTOPER(first);
7937             goto again;
7938         }
7939         if (sawplus && !sawminmod && !sawlookahead
7940             && (!sawopen || !RExC_sawback)
7941             && !pRExC_state->code_blocks) /* May examine pos and $& */
7942             /* x+ must match at the 1st pos of run of x's */
7943             RExC_rx->intflags |= PREGf_SKIP;
7944
7945         /* Scan is after the zeroth branch, first is atomic matcher. */
7946 #ifdef TRIE_STUDY_OPT
7947         DEBUG_PARSE_r(
7948             if (!restudied)
7949                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
7950                               (IV)(first - scan + 1))
7951         );
7952 #else
7953         DEBUG_PARSE_r(
7954             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
7955                 (IV)(first - scan + 1))
7956         );
7957 #endif
7958
7959
7960         /*
7961         * If there's something expensive in the r.e., find the
7962         * longest literal string that must appear and make it the
7963         * regmust.  Resolve ties in favor of later strings, since
7964         * the regstart check works with the beginning of the r.e.
7965         * and avoiding duplication strengthens checking.  Not a
7966         * strong reason, but sufficient in the absence of others.
7967         * [Now we resolve ties in favor of the earlier string if
7968         * it happens that c_offset_min has been invalidated, since the
7969         * earlier string may buy us something the later one won't.]
7970         */
7971
7972         data.substrs[0].str = newSVpvs("");
7973         data.substrs[1].str = newSVpvs("");
7974         data.last_found = newSVpvs("");
7975         data.cur_is_floating = 0; /* initially any found substring is fixed */
7976         ENTER_with_name("study_chunk");
7977         SAVEFREESV(data.substrs[0].str);
7978         SAVEFREESV(data.substrs[1].str);
7979         SAVEFREESV(data.last_found);
7980         first = scan;
7981         if (!RExC_rxi->regstclass) {
7982             ssc_init(pRExC_state, &ch_class);
7983             data.start_class = &ch_class;
7984             stclass_flag = SCF_DO_STCLASS_AND;
7985         } else                          /* XXXX Check for BOUND? */
7986             stclass_flag = 0;
7987         data.last_closep = &last_close;
7988
7989         DEBUG_RExC_seen();
7990         /*
7991          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
7992          * (NO top level branches)
7993          */
7994         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7995                              scan + RExC_size, /* Up to end */
7996             &data, -1, 0, NULL,
7997             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7998                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7999             0);
8000
8001
8002         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8003
8004
8005         if ( RExC_total_parens == 1 && !data.cur_is_floating
8006              && data.last_start_min == 0 && data.last_end > 0
8007              && !RExC_seen_zerolen
8008              && !(RExC_seen & REG_VERBARG_SEEN)
8009              && !(RExC_seen & REG_GPOS_SEEN)
8010         ){
8011             RExC_rx->extflags |= RXf_CHECK_ALL;
8012         }
8013         scan_commit(pRExC_state, &data,&minlen, 0);
8014
8015
8016         /* XXX this is done in reverse order because that's the way the
8017          * code was before it was parameterised. Don't know whether it
8018          * actually needs doing in reverse order. DAPM */
8019         for (i = 1; i >= 0; i--) {
8020             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8021
8022             if (   !(   i
8023                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
8024                      &&    data.substrs[0].min_offset
8025                         == data.substrs[1].min_offset
8026                      &&    SvCUR(data.substrs[0].str)
8027                         == SvCUR(data.substrs[1].str)
8028                     )
8029                 && S_setup_longest (aTHX_ pRExC_state,
8030                                         &(RExC_rx->substrs->data[i]),
8031                                         &(data.substrs[i]),
8032                                         longest_length[i]))
8033             {
8034                 RExC_rx->substrs->data[i].min_offset =
8035                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
8036
8037                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8038                 /* Don't offset infinity */
8039                 if (data.substrs[i].max_offset < SSize_t_MAX)
8040                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8041                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8042             }
8043             else {
8044                 RExC_rx->substrs->data[i].substr      = NULL;
8045                 RExC_rx->substrs->data[i].utf8_substr = NULL;
8046                 longest_length[i] = 0;
8047             }
8048         }
8049
8050         LEAVE_with_name("study_chunk");
8051
8052         if (RExC_rxi->regstclass
8053             && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8054             RExC_rxi->regstclass = NULL;
8055
8056         if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8057               || RExC_rx->substrs->data[0].min_offset)
8058             && stclass_flag
8059             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8060             && is_ssc_worth_it(pRExC_state, data.start_class))
8061         {
8062             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8063
8064             ssc_finalize(pRExC_state, data.start_class);
8065
8066             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8067             StructCopy(data.start_class,
8068                        (regnode_ssc*)RExC_rxi->data->data[n],
8069                        regnode_ssc);
8070             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8071             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8072             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8073                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8074                       Perl_re_printf( aTHX_
8075                                     "synthetic stclass \"%s\".\n",
8076                                     SvPVX_const(sv));});
8077             data.start_class = NULL;
8078         }
8079
8080         /* A temporary algorithm prefers floated substr to fixed one of
8081          * same length to dig more info. */
8082         i = (longest_length[0] <= longest_length[1]);
8083         RExC_rx->substrs->check_ix = i;
8084         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
8085         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
8086         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
8087         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8088         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8089         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8090             RExC_rx->intflags |= PREGf_NOSCAN;
8091
8092         if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8093             RExC_rx->extflags |= RXf_USE_INTUIT;
8094             if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8095                 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8096         }
8097
8098         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8099         if ( (STRLEN)minlen < longest_length[1] )
8100             minlen= longest_length[1];
8101         if ( (STRLEN)minlen < longest_length[0] )
8102             minlen= longest_length[0];
8103         */
8104     }
8105     else {
8106         /* Several toplevels. Best we can is to set minlen. */
8107         SSize_t fake;
8108         regnode_ssc ch_class;
8109         SSize_t last_close = 0;
8110
8111         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
8112
8113         scan = RExC_rxi->program + 1;
8114         ssc_init(pRExC_state, &ch_class);
8115         data.start_class = &ch_class;
8116         data.last_closep = &last_close;
8117
8118         DEBUG_RExC_seen();
8119         /*
8120          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8121          * (patterns WITH top level branches)
8122          */
8123         minlen = study_chunk(pRExC_state,
8124             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8125             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8126                                                       ? SCF_TRIE_DOING_RESTUDY
8127                                                       : 0),
8128             0);
8129
8130         CHECK_RESTUDY_GOTO_butfirst(NOOP);
8131
8132         RExC_rx->check_substr = NULL;
8133         RExC_rx->check_utf8 = NULL;
8134         RExC_rx->substrs->data[0].substr      = NULL;
8135         RExC_rx->substrs->data[0].utf8_substr = NULL;
8136         RExC_rx->substrs->data[1].substr      = NULL;
8137         RExC_rx->substrs->data[1].utf8_substr = NULL;
8138
8139         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8140             && is_ssc_worth_it(pRExC_state, data.start_class))
8141         {
8142             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8143
8144             ssc_finalize(pRExC_state, data.start_class);
8145
8146             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8147             StructCopy(data.start_class,
8148                        (regnode_ssc*)RExC_rxi->data->data[n],
8149                        regnode_ssc);
8150             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8151             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8152             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8153                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8154                       Perl_re_printf( aTHX_
8155                                     "synthetic stclass \"%s\".\n",
8156                                     SvPVX_const(sv));});
8157             data.start_class = NULL;
8158         }
8159     }
8160
8161     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8162         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8163         RExC_rx->maxlen = REG_INFTY;
8164     }
8165     else {
8166         RExC_rx->maxlen = RExC_maxlen;
8167     }
8168
8169     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8170        the "real" pattern. */
8171     DEBUG_OPTIMISE_r({
8172         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8173                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8174     });
8175     RExC_rx->minlenret = minlen;
8176     if (RExC_rx->minlen < minlen)
8177         RExC_rx->minlen = minlen;
8178
8179     if (RExC_seen & REG_RECURSE_SEEN ) {
8180         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8181         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8182     }
8183     if (RExC_seen & REG_GPOS_SEEN)
8184         RExC_rx->intflags |= PREGf_GPOS_SEEN;
8185     if (RExC_seen & REG_LOOKBEHIND_SEEN)
8186         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8187                                                 lookbehind */
8188     if (pRExC_state->code_blocks)
8189         RExC_rx->extflags |= RXf_EVAL_SEEN;
8190     if (RExC_seen & REG_VERBARG_SEEN)
8191     {
8192         RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8193         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8194     }
8195     if (RExC_seen & REG_CUTGROUP_SEEN)
8196         RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8197     if (pm_flags & PMf_USE_RE_EVAL)
8198         RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8199     if (RExC_paren_names)
8200         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8201     else
8202         RXp_PAREN_NAMES(RExC_rx) = NULL;
8203
8204     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8205      * so it can be used in pp.c */
8206     if (RExC_rx->intflags & PREGf_ANCH)
8207         RExC_rx->extflags |= RXf_IS_ANCHORED;
8208
8209
8210     {
8211         /* this is used to identify "special" patterns that might result
8212          * in Perl NOT calling the regex engine and instead doing the match "itself",
8213          * particularly special cases in split//. By having the regex compiler
8214          * do this pattern matching at a regop level (instead of by inspecting the pattern)
8215          * we avoid weird issues with equivalent patterns resulting in different behavior,
8216          * AND we allow non Perl engines to get the same optimizations by the setting the
8217          * flags appropriately - Yves */
8218         regnode *first = RExC_rxi->program + 1;
8219         U8 fop = OP(first);
8220         regnode *next = regnext(first);
8221         U8 nop = OP(next);
8222
8223         if (PL_regkind[fop] == NOTHING && nop == END)
8224             RExC_rx->extflags |= RXf_NULL;
8225         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8226             /* when fop is SBOL first->flags will be true only when it was
8227              * produced by parsing /\A/, and not when parsing /^/. This is
8228              * very important for the split code as there we want to
8229              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8230              * See rt #122761 for more details. -- Yves */
8231             RExC_rx->extflags |= RXf_START_ONLY;
8232         else if (fop == PLUS
8233                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8234                  && nop == END)
8235             RExC_rx->extflags |= RXf_WHITE;
8236         else if ( RExC_rx->extflags & RXf_SPLIT
8237                   && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
8238                   && STR_LEN(first) == 1
8239                   && *(STRING(first)) == ' '
8240                   && nop == END )
8241             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8242
8243     }
8244
8245     if (RExC_contains_locale) {
8246         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8247     }
8248
8249 #ifdef DEBUGGING
8250     if (RExC_paren_names) {
8251         RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8252         RExC_rxi->data->data[RExC_rxi->name_list_idx]
8253                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
8254     } else
8255 #endif
8256     RExC_rxi->name_list_idx = 0;
8257
8258     while ( RExC_recurse_count > 0 ) {
8259         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8260         /*
8261          * This data structure is set up in study_chunk() and is used
8262          * to calculate the distance between a GOSUB regopcode and
8263          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8264          * it refers to.
8265          *
8266          * If for some reason someone writes code that optimises
8267          * away a GOSUB opcode then the assert should be changed to
8268          * an if(scan) to guard the ARG2L_SET() - Yves
8269          *
8270          */
8271         assert(scan && OP(scan) == GOSUB);
8272         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8273     }
8274
8275     Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8276     /* assume we don't need to swap parens around before we match */
8277     DEBUG_TEST_r({
8278         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8279             (unsigned long)RExC_study_chunk_recursed_count);
8280     });
8281     DEBUG_DUMP_r({
8282         DEBUG_RExC_seen();
8283         Perl_re_printf( aTHX_ "Final program:\n");
8284         regdump(RExC_rx);
8285     });
8286
8287     if (RExC_open_parens) {
8288         Safefree(RExC_open_parens);
8289         RExC_open_parens = NULL;
8290     }
8291     if (RExC_close_parens) {
8292         Safefree(RExC_close_parens);
8293         RExC_close_parens = NULL;
8294     }
8295
8296 #ifdef USE_ITHREADS
8297     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8298      * by setting the regexp SV to readonly-only instead. If the
8299      * pattern's been recompiled, the USEDness should remain. */
8300     if (old_re && SvREADONLY(old_re))
8301         SvREADONLY_on(Rx);
8302 #endif
8303     return Rx;
8304 }
8305
8306
8307 SV*
8308 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8309                     const U32 flags)
8310 {
8311     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8312
8313     PERL_UNUSED_ARG(value);
8314
8315     if (flags & RXapif_FETCH) {
8316         return reg_named_buff_fetch(rx, key, flags);
8317     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8318         Perl_croak_no_modify();
8319         return NULL;
8320     } else if (flags & RXapif_EXISTS) {
8321         return reg_named_buff_exists(rx, key, flags)
8322             ? &PL_sv_yes
8323             : &PL_sv_no;
8324     } else if (flags & RXapif_REGNAMES) {
8325         return reg_named_buff_all(rx, flags);
8326     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8327         return reg_named_buff_scalar(rx, flags);
8328     } else {
8329         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8330         return NULL;
8331     }
8332 }
8333
8334 SV*
8335 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8336                          const U32 flags)
8337 {
8338     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8339     PERL_UNUSED_ARG(lastkey);
8340
8341     if (flags & RXapif_FIRSTKEY)
8342         return reg_named_buff_firstkey(rx, flags);
8343     else if (flags & RXapif_NEXTKEY)
8344         return reg_named_buff_nextkey(rx, flags);
8345     else {
8346         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8347                                             (int)flags);
8348         return NULL;
8349     }
8350 }
8351
8352 SV*
8353 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8354                           const U32 flags)
8355 {
8356     SV *ret;
8357     struct regexp *const rx = ReANY(r);
8358
8359     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8360
8361     if (rx && RXp_PAREN_NAMES(rx)) {
8362         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8363         if (he_str) {
8364             IV i;
8365             SV* sv_dat=HeVAL(he_str);
8366             I32 *nums=(I32*)SvPVX(sv_dat);
8367             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8368             for ( i=0; i<SvIVX(sv_dat); i++ ) {
8369                 if ((I32)(rx->nparens) >= nums[i]
8370                     && rx->offs[nums[i]].start != -1
8371                     && rx->offs[nums[i]].end != -1)
8372                 {
8373                     ret = newSVpvs("");
8374                     CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8375                     if (!retarray)
8376                         return ret;
8377                 } else {
8378                     if (retarray)
8379                         ret = newSVsv(&PL_sv_undef);
8380                 }
8381                 if (retarray)
8382                     av_push(retarray, ret);
8383             }
8384             if (retarray)
8385                 return newRV_noinc(MUTABLE_SV(retarray));
8386         }
8387     }
8388     return NULL;
8389 }
8390
8391 bool
8392 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8393                            const U32 flags)
8394 {
8395     struct regexp *const rx = ReANY(r);
8396
8397     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8398
8399     if (rx && RXp_PAREN_NAMES(rx)) {
8400         if (flags & RXapif_ALL) {
8401             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8402         } else {
8403             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8404             if (sv) {
8405                 SvREFCNT_dec_NN(sv);
8406                 return TRUE;
8407             } else {
8408                 return FALSE;
8409             }
8410         }
8411     } else {
8412         return FALSE;
8413     }
8414 }
8415
8416 SV*
8417 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8418 {
8419     struct regexp *const rx = ReANY(r);
8420
8421     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8422
8423     if ( rx && RXp_PAREN_NAMES(rx) ) {
8424         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8425
8426         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8427     } else {
8428         return FALSE;
8429     }
8430 }
8431
8432 SV*
8433 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8434 {
8435     struct regexp *const rx = ReANY(r);
8436     GET_RE_DEBUG_FLAGS_DECL;
8437
8438     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8439
8440     if (rx && RXp_PAREN_NAMES(rx)) {
8441         HV *hv = RXp_PAREN_NAMES(rx);
8442         HE *temphe;
8443         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8444             IV i;
8445             IV parno = 0;
8446             SV* sv_dat = HeVAL(temphe);
8447             I32 *nums = (I32*)SvPVX(sv_dat);
8448             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8449                 if ((I32)(rx->lastparen) >= nums[i] &&
8450                     rx->offs[nums[i]].start != -1 &&
8451                     rx->offs[nums[i]].end != -1)
8452                 {
8453                     parno = nums[i];
8454                     break;
8455                 }
8456             }
8457             if (parno || flags & RXapif_ALL) {
8458                 return newSVhek(HeKEY_hek(temphe));
8459             }
8460         }
8461     }
8462     return NULL;
8463 }
8464
8465 SV*
8466 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8467 {
8468     SV *ret;
8469     AV *av;
8470     SSize_t length;
8471     struct regexp *const rx = ReANY(r);
8472
8473     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8474
8475     if (rx && RXp_PAREN_NAMES(rx)) {
8476         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8477             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8478         } else if (flags & RXapif_ONE) {
8479             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8480             av = MUTABLE_AV(SvRV(ret));
8481             length = av_tindex(av);
8482             SvREFCNT_dec_NN(ret);
8483             return newSViv(length + 1);
8484         } else {
8485             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8486                                                 (int)flags);
8487             return NULL;
8488         }
8489     }
8490     return &PL_sv_undef;
8491 }
8492
8493 SV*
8494 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8495 {
8496     struct regexp *const rx = ReANY(r);
8497     AV *av = newAV();
8498
8499     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8500
8501     if (rx && RXp_PAREN_NAMES(rx)) {
8502         HV *hv= RXp_PAREN_NAMES(rx);
8503         HE *temphe;
8504         (void)hv_iterinit(hv);
8505         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8506             IV i;
8507             IV parno = 0;
8508             SV* sv_dat = HeVAL(temphe);
8509             I32 *nums = (I32*)SvPVX(sv_dat);
8510             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8511                 if ((I32)(rx->lastparen) >= nums[i] &&
8512                     rx->offs[nums[i]].start != -1 &&
8513                     rx->offs[nums[i]].end != -1)
8514                 {
8515                     parno = nums[i];
8516                     break;
8517                 }
8518             }
8519             if (parno || flags & RXapif_ALL) {
8520                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8521             }
8522         }
8523     }
8524
8525     return newRV_noinc(MUTABLE_SV(av));
8526 }
8527
8528 void
8529 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8530                              SV * const sv)
8531 {
8532     struct regexp *const rx = ReANY(r);
8533     char *s = NULL;
8534     SSize_t i = 0;
8535     SSize_t s1, t1;
8536     I32 n = paren;
8537
8538     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8539
8540     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8541            || n == RX_BUFF_IDX_CARET_FULLMATCH
8542            || n == RX_BUFF_IDX_CARET_POSTMATCH
8543        )
8544     {
8545         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8546         if (!keepcopy) {
8547             /* on something like
8548              *    $r = qr/.../;
8549              *    /$qr/p;
8550              * the KEEPCOPY is set on the PMOP rather than the regex */
8551             if (PL_curpm && r == PM_GETRE(PL_curpm))
8552                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8553         }
8554         if (!keepcopy)
8555             goto ret_undef;
8556     }
8557
8558     if (!rx->subbeg)
8559         goto ret_undef;
8560
8561     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8562         /* no need to distinguish between them any more */
8563         n = RX_BUFF_IDX_FULLMATCH;
8564
8565     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8566         && rx->offs[0].start != -1)
8567     {
8568         /* $`, ${^PREMATCH} */
8569         i = rx->offs[0].start;
8570         s = rx->subbeg;
8571     }
8572     else
8573     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8574         && rx->offs[0].end != -1)
8575     {
8576         /* $', ${^POSTMATCH} */
8577         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8578         i = rx->sublen + rx->suboffset - rx->offs[0].end;
8579     }
8580     else
8581     if ( 0 <= n && n <= (I32)rx->nparens &&
8582         (s1 = rx->offs[n].start) != -1 &&
8583         (t1 = rx->offs[n].end) != -1)
8584     {
8585         /* $&, ${^MATCH},  $1 ... */
8586         i = t1 - s1;
8587         s = rx->subbeg + s1 - rx->suboffset;
8588     } else {
8589         goto ret_undef;
8590     }
8591
8592     assert(s >= rx->subbeg);
8593     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8594     if (i >= 0) {
8595 #ifdef NO_TAINT_SUPPORT
8596         sv_setpvn(sv, s, i);
8597 #else
8598         const int oldtainted = TAINT_get;
8599         TAINT_NOT;
8600         sv_setpvn(sv, s, i);
8601         TAINT_set(oldtainted);
8602 #endif
8603         if (RXp_MATCH_UTF8(rx))
8604             SvUTF8_on(sv);
8605         else
8606             SvUTF8_off(sv);
8607         if (TAINTING_get) {
8608             if (RXp_MATCH_TAINTED(rx)) {
8609                 if (SvTYPE(sv) >= SVt_PVMG) {
8610                     MAGIC* const mg = SvMAGIC(sv);
8611                     MAGIC* mgt;
8612                     TAINT;
8613                     SvMAGIC_set(sv, mg->mg_moremagic);
8614                     SvTAINT(sv);
8615                     if ((mgt = SvMAGIC(sv))) {
8616                         mg->mg_moremagic = mgt;
8617                         SvMAGIC_set(sv, mg);
8618                     }
8619                 } else {
8620                     TAINT;
8621                     SvTAINT(sv);
8622                 }
8623             } else
8624                 SvTAINTED_off(sv);
8625         }
8626     } else {
8627       ret_undef:
8628         sv_set_undef(sv);
8629         return;
8630     }
8631 }
8632
8633 void
8634 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8635                                                          SV const * const value)
8636 {
8637     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8638
8639     PERL_UNUSED_ARG(rx);
8640     PERL_UNUSED_ARG(paren);
8641     PERL_UNUSED_ARG(value);
8642
8643     if (!PL_localizing)
8644         Perl_croak_no_modify();
8645 }
8646
8647 I32
8648 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8649                               const I32 paren)
8650 {
8651     struct regexp *const rx = ReANY(r);
8652     I32 i;
8653     I32 s1, t1;
8654
8655     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8656
8657     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8658         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8659         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8660     )
8661     {
8662         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8663         if (!keepcopy) {
8664             /* on something like
8665              *    $r = qr/.../;
8666              *    /$qr/p;
8667              * the KEEPCOPY is set on the PMOP rather than the regex */
8668             if (PL_curpm && r == PM_GETRE(PL_curpm))
8669                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8670         }
8671         if (!keepcopy)
8672             goto warn_undef;
8673     }
8674
8675     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8676     switch (paren) {
8677       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8678       case RX_BUFF_IDX_PREMATCH:       /* $` */
8679         if (rx->offs[0].start != -1) {
8680                         i = rx->offs[0].start;
8681                         if (i > 0) {
8682                                 s1 = 0;
8683                                 t1 = i;
8684                                 goto getlen;
8685                         }
8686             }
8687         return 0;
8688
8689       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8690       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8691             if (rx->offs[0].end != -1) {
8692                         i = rx->sublen - rx->offs[0].end;
8693                         if (i > 0) {
8694                                 s1 = rx->offs[0].end;
8695                                 t1 = rx->sublen;
8696                                 goto getlen;
8697                         }
8698             }
8699         return 0;
8700
8701       default: /* $& / ${^MATCH}, $1, $2, ... */
8702             if (paren <= (I32)rx->nparens &&
8703             (s1 = rx->offs[paren].start) != -1 &&
8704             (t1 = rx->offs[paren].end) != -1)
8705             {
8706             i = t1 - s1;
8707             goto getlen;
8708         } else {
8709           warn_undef:
8710             if (ckWARN(WARN_UNINITIALIZED))
8711                 report_uninit((const SV *)sv);
8712             return 0;
8713         }
8714     }
8715   getlen:
8716     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8717         const char * const s = rx->subbeg - rx->suboffset + s1;
8718         const U8 *ep;
8719         STRLEN el;
8720
8721         i = t1 - s1;
8722         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8723                         i = el;
8724     }
8725     return i;
8726 }
8727
8728 SV*
8729 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8730 {
8731     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8732         PERL_UNUSED_ARG(rx);
8733         if (0)
8734             return NULL;
8735         else
8736             return newSVpvs("Regexp");
8737 }
8738
8739 /* Scans the name of a named buffer from the pattern.
8740  * If flags is REG_RSN_RETURN_NULL returns null.
8741  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8742  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8743  * to the parsed name as looked up in the RExC_paren_names hash.
8744  * If there is an error throws a vFAIL().. type exception.
8745  */
8746
8747 #define REG_RSN_RETURN_NULL    0
8748 #define REG_RSN_RETURN_NAME    1
8749 #define REG_RSN_RETURN_DATA    2
8750
8751 STATIC SV*
8752 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8753 {
8754     char *name_start = RExC_parse;
8755     SV* sv_name;
8756
8757     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8758
8759     assert (RExC_parse <= RExC_end);
8760     if (RExC_parse == RExC_end) NOOP;
8761     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8762          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8763           * using do...while */
8764         if (UTF)
8765             do {
8766                 RExC_parse += UTF8SKIP(RExC_parse);
8767             } while (   RExC_parse < RExC_end
8768                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8769         else
8770             do {
8771                 RExC_parse++;
8772             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8773     } else {
8774         RExC_parse++; /* so the <- from the vFAIL is after the offending
8775                          character */
8776         vFAIL("Group name must start with a non-digit word character");
8777     }
8778     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8779                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8780     if ( flags == REG_RSN_RETURN_NAME)
8781         return sv_name;
8782     else if (flags==REG_RSN_RETURN_DATA) {
8783         HE *he_str = NULL;
8784         SV *sv_dat = NULL;
8785         if ( ! sv_name )      /* should not happen*/
8786             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8787         if (RExC_paren_names)
8788             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8789         if ( he_str )
8790             sv_dat = HeVAL(he_str);
8791         if ( ! sv_dat ) {   /* Didn't find group */
8792
8793             /* It might be a forward reference; we can't fail until we
8794                 * know, by completing the parse to get all the groups, and
8795                 * then reparsing */
8796             if (RExC_total_parens > 0)  {
8797                 vFAIL("Reference to nonexistent named group");
8798             }
8799             else {
8800                 REQUIRE_PARENS_PASS;
8801             }
8802         }
8803         return sv_dat;
8804     }
8805
8806     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8807                      (unsigned long) flags);
8808 }
8809
8810 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8811     if (RExC_lastparse!=RExC_parse) {                           \
8812         Perl_re_printf( aTHX_  "%s",                            \
8813             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8814                 RExC_end - RExC_parse, 16,                      \
8815                 "", "",                                         \
8816                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8817                 PERL_PV_PRETTY_ELLIPSES   |                     \
8818                 PERL_PV_PRETTY_LTGT       |                     \
8819                 PERL_PV_ESCAPE_RE         |                     \
8820                 PERL_PV_PRETTY_EXACTSIZE                        \
8821             )                                                   \
8822         );                                                      \
8823     } else                                                      \
8824         Perl_re_printf( aTHX_ "%16s","");                       \
8825                                                                 \
8826     if (RExC_lastnum!=RExC_emit)                                \
8827        Perl_re_printf( aTHX_ "|%4d", RExC_emit);                \
8828     else                                                        \
8829        Perl_re_printf( aTHX_ "|%4s","");                        \
8830     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
8831         (int)((depth*2)), "",                                   \
8832         (funcname)                                              \
8833     );                                                          \
8834     RExC_lastnum=RExC_emit;                                     \
8835     RExC_lastparse=RExC_parse;                                  \
8836 })
8837
8838
8839
8840 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8841     DEBUG_PARSE_MSG((funcname));                            \
8842     Perl_re_printf( aTHX_ "%4s","\n");                                  \
8843 })
8844 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
8845     DEBUG_PARSE_MSG((funcname));                            \
8846     Perl_re_printf( aTHX_ fmt "\n",args);                               \
8847 })
8848
8849 /* This section of code defines the inversion list object and its methods.  The
8850  * interfaces are highly subject to change, so as much as possible is static to
8851  * this file.  An inversion list is here implemented as a malloc'd C UV array
8852  * as an SVt_INVLIST scalar.
8853  *
8854  * An inversion list for Unicode is an array of code points, sorted by ordinal
8855  * number.  Each element gives the code point that begins a range that extends
8856  * up-to but not including the code point given by the next element.  The final
8857  * element gives the first code point of a range that extends to the platform's
8858  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
8859  * ...) give ranges whose code points are all in the inversion list.  We say
8860  * that those ranges are in the set.  The odd-numbered elements give ranges
8861  * whose code points are not in the inversion list, and hence not in the set.
8862  * Thus, element [0] is the first code point in the list.  Element [1]
8863  * is the first code point beyond that not in the list; and element [2] is the
8864  * first code point beyond that that is in the list.  In other words, the first
8865  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8866  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
8867  * all code points in that range are not in the inversion list.  The third
8868  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8869  * list, and so forth.  Thus every element whose index is divisible by two
8870  * gives the beginning of a range that is in the list, and every element whose
8871  * index is not divisible by two gives the beginning of a range not in the
8872  * list.  If the final element's index is divisible by two, the inversion list
8873  * extends to the platform's infinity; otherwise the highest code point in the
8874  * inversion list is the contents of that element minus 1.
8875  *
8876  * A range that contains just a single code point N will look like
8877  *  invlist[i]   == N
8878  *  invlist[i+1] == N+1
8879  *
8880  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8881  * impossible to represent, so element [i+1] is omitted.  The single element
8882  * inversion list
8883  *  invlist[0] == UV_MAX
8884  * contains just UV_MAX, but is interpreted as matching to infinity.
8885  *
8886  * Taking the complement (inverting) an inversion list is quite simple, if the
8887  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8888  * This implementation reserves an element at the beginning of each inversion
8889  * list to always contain 0; there is an additional flag in the header which
8890  * indicates if the list begins at the 0, or is offset to begin at the next
8891  * element.  This means that the inversion list can be inverted without any
8892  * copying; just flip the flag.
8893  *
8894  * More about inversion lists can be found in "Unicode Demystified"
8895  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8896  *
8897  * The inversion list data structure is currently implemented as an SV pointing
8898  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8899  * array of UV whose memory management is automatically handled by the existing
8900  * facilities for SV's.
8901  *
8902  * Some of the methods should always be private to the implementation, and some
8903  * should eventually be made public */
8904
8905 /* The header definitions are in F<invlist_inline.h> */
8906
8907 #ifndef PERL_IN_XSUB_RE
8908
8909 PERL_STATIC_INLINE UV*
8910 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8911 {
8912     /* Returns a pointer to the first element in the inversion list's array.
8913      * This is called upon initialization of an inversion list.  Where the
8914      * array begins depends on whether the list has the code point U+0000 in it
8915      * or not.  The other parameter tells it whether the code that follows this
8916      * call is about to put a 0 in the inversion list or not.  The first
8917      * element is either the element reserved for 0, if TRUE, or the element
8918      * after it, if FALSE */
8919
8920     bool* offset = get_invlist_offset_addr(invlist);
8921     UV* zero_addr = (UV *) SvPVX(invlist);
8922
8923     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8924
8925     /* Must be empty */
8926     assert(! _invlist_len(invlist));
8927
8928     *zero_addr = 0;
8929
8930     /* 1^1 = 0; 1^0 = 1 */
8931     *offset = 1 ^ will_have_0;
8932     return zero_addr + *offset;
8933 }
8934
8935 PERL_STATIC_INLINE void
8936 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8937 {
8938     /* Sets the current number of elements stored in the inversion list.
8939      * Updates SvCUR correspondingly */
8940     PERL_UNUSED_CONTEXT;
8941     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8942
8943     assert(is_invlist(invlist));
8944
8945     SvCUR_set(invlist,
8946               (len == 0)
8947                ? 0
8948                : TO_INTERNAL_SIZE(len + offset));
8949     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8950 }
8951
8952 STATIC void
8953 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
8954 {
8955     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
8956      * steals the list from 'src', so 'src' is made to have a NULL list.  This
8957      * is similar to what SvSetMagicSV() would do, if it were implemented on
8958      * inversion lists, though this routine avoids a copy */
8959
8960     const UV src_len          = _invlist_len(src);
8961     const bool src_offset     = *get_invlist_offset_addr(src);
8962     const STRLEN src_byte_len = SvLEN(src);
8963     char * array              = SvPVX(src);
8964
8965     const int oldtainted = TAINT_get;
8966
8967     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
8968
8969     assert(is_invlist(src));
8970     assert(is_invlist(dest));
8971     assert(! invlist_is_iterating(src));
8972     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
8973
8974     /* Make sure it ends in the right place with a NUL, as our inversion list
8975      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
8976      * asserts it */
8977     array[src_byte_len - 1] = '\0';
8978
8979     TAINT_NOT;      /* Otherwise it breaks */
8980     sv_usepvn_flags(dest,
8981                     (char *) array,
8982                     src_byte_len - 1,
8983
8984                     /* This flag is documented to cause a copy to be avoided */
8985                     SV_HAS_TRAILING_NUL);
8986     TAINT_set(oldtainted);
8987     SvPV_set(src, 0);
8988     SvLEN_set(src, 0);
8989     SvCUR_set(src, 0);
8990
8991     /* Finish up copying over the other fields in an inversion list */
8992     *get_invlist_offset_addr(dest) = src_offset;
8993     invlist_set_len(dest, src_len, src_offset);
8994     *get_invlist_previous_index_addr(dest) = 0;
8995     invlist_iterfinish(dest);
8996 }
8997
8998 PERL_STATIC_INLINE IV*
8999 S_get_invlist_previous_index_addr(SV* invlist)
9000 {
9001     /* Return the address of the IV that is reserved to hold the cached index
9002      * */
9003     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9004
9005     assert(is_invlist(invlist));
9006
9007     return &(((XINVLIST*) SvANY(invlist))->prev_index);
9008 }
9009
9010 PERL_STATIC_INLINE IV
9011 S_invlist_previous_index(SV* const invlist)
9012 {
9013     /* Returns cached index of previous search */
9014
9015     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9016
9017     return *get_invlist_previous_index_addr(invlist);
9018 }
9019
9020 PERL_STATIC_INLINE void
9021 S_invlist_set_previous_index(SV* const invlist, const IV index)
9022 {
9023     /* Caches <index> for later retrieval */
9024
9025     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9026
9027     assert(index == 0 || index < (int) _invlist_len(invlist));
9028
9029     *get_invlist_previous_index_addr(invlist) = index;
9030 }
9031
9032 PERL_STATIC_INLINE void
9033 S_invlist_trim(SV* invlist)
9034 {
9035     /* Free the not currently-being-used space in an inversion list */
9036
9037     /* But don't free up the space needed for the 0 UV that is always at the
9038      * beginning of the list, nor the trailing NUL */
9039     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9040
9041     PERL_ARGS_ASSERT_INVLIST_TRIM;
9042
9043     assert(is_invlist(invlist));
9044
9045     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9046 }
9047
9048 PERL_STATIC_INLINE void
9049 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
9050 {
9051     PERL_ARGS_ASSERT_INVLIST_CLEAR;
9052
9053     assert(is_invlist(invlist));
9054
9055     invlist_set_len(invlist, 0, 0);
9056     invlist_trim(invlist);
9057 }
9058
9059 #endif /* ifndef PERL_IN_XSUB_RE */
9060
9061 PERL_STATIC_INLINE bool
9062 S_invlist_is_iterating(SV* const invlist)
9063 {
9064     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9065
9066     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9067 }
9068
9069 #ifndef PERL_IN_XSUB_RE
9070
9071 PERL_STATIC_INLINE UV
9072 S_invlist_max(SV* const invlist)
9073 {
9074     /* Returns the maximum number of elements storable in the inversion list's
9075      * array, without having to realloc() */
9076
9077     PERL_ARGS_ASSERT_INVLIST_MAX;
9078
9079     assert(is_invlist(invlist));
9080
9081     /* Assumes worst case, in which the 0 element is not counted in the
9082      * inversion list, so subtracts 1 for that */
9083     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
9084            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9085            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9086 }
9087
9088 STATIC void
9089 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9090 {
9091     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9092
9093     /* First 1 is in case the zero element isn't in the list; second 1 is for
9094      * trailing NUL */
9095     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9096     invlist_set_len(invlist, 0, 0);
9097
9098     /* Force iterinit() to be used to get iteration to work */
9099     invlist_iterfinish(invlist);
9100
9101     *get_invlist_previous_index_addr(invlist) = 0;
9102 }
9103
9104 SV*
9105 Perl__new_invlist(pTHX_ IV initial_size)
9106 {
9107
9108     /* Return a pointer to a newly constructed inversion list, with enough
9109      * space to store 'initial_size' elements.  If that number is negative, a
9110      * system default is used instead */
9111
9112     SV* new_list;
9113
9114     if (initial_size < 0) {
9115         initial_size = 10;
9116     }
9117
9118     /* Allocate the initial space */
9119     new_list = newSV_type(SVt_INVLIST);
9120
9121     initialize_invlist_guts(new_list, initial_size);
9122
9123     return new_list;
9124 }
9125
9126 SV*
9127 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9128 {
9129     /* Return a pointer to a newly constructed inversion list, initialized to
9130      * point to <list>, which has to be in the exact correct inversion list
9131      * form, including internal fields.  Thus this is a dangerous routine that
9132      * should not be used in the wrong hands.  The passed in 'list' contains
9133      * several header fields at the beginning that are not part of the
9134      * inversion list body proper */
9135
9136     const STRLEN length = (STRLEN) list[0];
9137     const UV version_id =          list[1];
9138     const bool offset   =    cBOOL(list[2]);
9139 #define HEADER_LENGTH 3
9140     /* If any of the above changes in any way, you must change HEADER_LENGTH
9141      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9142      *      perl -E 'say int(rand 2**31-1)'
9143      */
9144 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9145                                         data structure type, so that one being
9146                                         passed in can be validated to be an
9147                                         inversion list of the correct vintage.
9148                                        */
9149
9150     SV* invlist = newSV_type(SVt_INVLIST);
9151
9152     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9153
9154     if (version_id != INVLIST_VERSION_ID) {
9155         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9156     }
9157
9158     /* The generated array passed in includes header elements that aren't part
9159      * of the list proper, so start it just after them */
9160     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9161
9162     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
9163                                shouldn't touch it */
9164
9165     *(get_invlist_offset_addr(invlist)) = offset;
9166
9167     /* The 'length' passed to us is the physical number of elements in the
9168      * inversion list.  But if there is an offset the logical number is one
9169      * less than that */
9170     invlist_set_len(invlist, length  - offset, offset);
9171
9172     invlist_set_previous_index(invlist, 0);
9173
9174     /* Initialize the iteration pointer. */
9175     invlist_iterfinish(invlist);
9176
9177     SvREADONLY_on(invlist);
9178
9179     return invlist;
9180 }
9181
9182 STATIC void
9183 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
9184 {
9185     /* Grow the maximum size of an inversion list */
9186
9187     PERL_ARGS_ASSERT_INVLIST_EXTEND;
9188
9189     assert(is_invlist(invlist));
9190
9191     /* Add one to account for the zero element at the beginning which may not
9192      * be counted by the calling parameters */
9193     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
9194 }
9195
9196 STATIC void
9197 S__append_range_to_invlist(pTHX_ SV* const invlist,
9198                                  const UV start, const UV end)
9199 {
9200    /* Subject to change or removal.  Append the range from 'start' to 'end' at
9201     * the end of the inversion list.  The range must be above any existing
9202     * ones. */
9203
9204     UV* array;
9205     UV max = invlist_max(invlist);
9206     UV len = _invlist_len(invlist);
9207     bool offset;
9208
9209     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9210
9211     if (len == 0) { /* Empty lists must be initialized */
9212         offset = start != 0;
9213         array = _invlist_array_init(invlist, ! offset);
9214     }
9215     else {
9216         /* Here, the existing list is non-empty. The current max entry in the
9217          * list is generally the first value not in the set, except when the
9218          * set extends to the end of permissible values, in which case it is
9219          * the first entry in that final set, and so this call is an attempt to
9220          * append out-of-order */
9221
9222         UV final_element = len - 1;
9223         array = invlist_array(invlist);
9224         if (   array[final_element] > start
9225             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9226         {
9227             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9228                      array[final_element], start,
9229                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9230         }
9231
9232         /* Here, it is a legal append.  If the new range begins 1 above the end
9233          * of the range below it, it is extending the range below it, so the
9234          * new first value not in the set is one greater than the newly
9235          * extended range.  */
9236         offset = *get_invlist_offset_addr(invlist);
9237         if (array[final_element] == start) {
9238             if (end != UV_MAX) {
9239                 array[final_element] = end + 1;
9240             }
9241             else {
9242                 /* But if the end is the maximum representable on the machine,
9243                  * assume that infinity was actually what was meant.  Just let
9244                  * the range that this would extend to have no end */
9245                 invlist_set_len(invlist, len - 1, offset);
9246             }
9247             return;
9248         }
9249     }
9250
9251     /* Here the new range doesn't extend any existing set.  Add it */
9252
9253     len += 2;   /* Includes an element each for the start and end of range */
9254
9255     /* If wll overflow the existing space, extend, which may cause the array to
9256      * be moved */
9257     if (max < len) {
9258         invlist_extend(invlist, len);
9259
9260         /* Have to set len here to avoid assert failure in invlist_array() */
9261         invlist_set_len(invlist, len, offset);
9262
9263         array = invlist_array(invlist);
9264     }
9265     else {
9266         invlist_set_len(invlist, len, offset);
9267     }
9268
9269     /* The next item on the list starts the range, the one after that is
9270      * one past the new range.  */
9271     array[len - 2] = start;
9272     if (end != UV_MAX) {
9273         array[len - 1] = end + 1;
9274     }
9275     else {
9276         /* But if the end is the maximum representable on the machine, just let
9277          * the range have no end */
9278         invlist_set_len(invlist, len - 1, offset);
9279     }
9280 }
9281
9282 SSize_t
9283 Perl__invlist_search(SV* const invlist, const UV cp)
9284 {
9285     /* Searches the inversion list for the entry that contains the input code
9286      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
9287      * return value is the index into the list's array of the range that
9288      * contains <cp>, that is, 'i' such that
9289      *  array[i] <= cp < array[i+1]
9290      */
9291
9292     IV low = 0;
9293     IV mid;
9294     IV high = _invlist_len(invlist);
9295     const IV highest_element = high - 1;
9296     const UV* array;
9297
9298     PERL_ARGS_ASSERT__INVLIST_SEARCH;
9299
9300     /* If list is empty, return failure. */
9301     if (high == 0) {
9302         return -1;
9303     }
9304
9305     /* (We can't get the array unless we know the list is non-empty) */
9306     array = invlist_array(invlist);
9307
9308     mid = invlist_previous_index(invlist);
9309     assert(mid >=0);
9310     if (mid > highest_element) {
9311         mid = highest_element;
9312     }
9313
9314     /* <mid> contains the cache of the result of the previous call to this
9315      * function (0 the first time).  See if this call is for the same result,
9316      * or if it is for mid-1.  This is under the theory that calls to this
9317      * function will often be for related code points that are near each other.
9318      * And benchmarks show that caching gives better results.  We also test
9319      * here if the code point is within the bounds of the list.  These tests
9320      * replace others that would have had to be made anyway to make sure that
9321      * the array bounds were not exceeded, and these give us extra information
9322      * at the same time */
9323     if (cp >= array[mid]) {
9324         if (cp >= array[highest_element]) {
9325             return highest_element;
9326         }
9327
9328         /* Here, array[mid] <= cp < array[highest_element].  This means that
9329          * the final element is not the answer, so can exclude it; it also
9330          * means that <mid> is not the final element, so can refer to 'mid + 1'
9331          * safely */
9332         if (cp < array[mid + 1]) {
9333             return mid;
9334         }
9335         high--;
9336         low = mid + 1;
9337     }
9338     else { /* cp < aray[mid] */
9339         if (cp < array[0]) { /* Fail if outside the array */
9340             return -1;
9341         }
9342         high = mid;
9343         if (cp >= array[mid - 1]) {
9344             goto found_entry;
9345         }
9346     }
9347
9348     /* Binary search.  What we are looking for is <i> such that
9349      *  array[i] <= cp < array[i+1]
9350      * The loop below converges on the i+1.  Note that there may not be an
9351      * (i+1)th element in the array, and things work nonetheless */
9352     while (low < high) {
9353         mid = (low + high) / 2;
9354         assert(mid <= highest_element);
9355         if (array[mid] <= cp) { /* cp >= array[mid] */
9356             low = mid + 1;
9357
9358             /* We could do this extra test to exit the loop early.
9359             if (cp < array[low]) {
9360                 return mid;
9361             }
9362             */
9363         }
9364         else { /* cp < array[mid] */
9365             high = mid;
9366         }
9367     }
9368
9369   found_entry:
9370     high--;
9371     invlist_set_previous_index(invlist, high);
9372     return high;
9373 }
9374
9375 void
9376 Perl__invlist_populate_swatch(SV* const invlist,
9377                               const UV start, const UV end, U8* swatch)
9378 {
9379     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
9380      * but is used when the swash has an inversion list.  This makes this much
9381      * faster, as it uses a binary search instead of a linear one.  This is
9382      * intimately tied to that function, and perhaps should be in utf8.c,
9383      * except it is intimately tied to inversion lists as well.  It assumes
9384      * that <swatch> is all 0's on input */
9385
9386     UV current = start;
9387     const IV len = _invlist_len(invlist);
9388     IV i;
9389     const UV * array;
9390
9391     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
9392
9393     if (len == 0) { /* Empty inversion list */
9394         return;
9395     }
9396
9397     array = invlist_array(invlist);
9398
9399     /* Find which element it is */
9400     i = _invlist_search(invlist, start);
9401
9402     /* We populate from <start> to <end> */
9403     while (current < end) {
9404         UV upper;
9405
9406         /* The inversion list gives the results for every possible code point
9407          * after the first one in the list.  Only those ranges whose index is
9408          * even are ones that the inversion list matches.  For the odd ones,
9409          * and if the initial code point is not in the list, we have to skip
9410          * forward to the next element */
9411         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
9412             i++;
9413             if (i >= len) { /* Finished if beyond the end of the array */
9414                 return;
9415             }
9416             current = array[i];
9417             if (current >= end) {   /* Finished if beyond the end of what we
9418                                        are populating */
9419                 if (LIKELY(end < UV_MAX)) {
9420                     return;
9421                 }
9422
9423                 /* We get here when the upper bound is the maximum
9424                  * representable on the machine, and we are looking for just
9425                  * that code point.  Have to special case it */
9426                 i = len;
9427                 goto join_end_of_list;
9428             }
9429         }
9430         assert(current >= start);
9431
9432         /* The current range ends one below the next one, except don't go past
9433          * <end> */
9434         i++;
9435         upper = (i < len && array[i] < end) ? array[i] : end;
9436
9437         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
9438          * for each code point in it */
9439         for (; current < upper; current++) {
9440             const STRLEN offset = (STRLEN)(current - start);
9441             swatch[offset >> 3] |= 1 << (offset & 7);
9442         }
9443
9444       join_end_of_list:
9445
9446         /* Quit if at the end of the list */
9447         if (i >= len) {
9448
9449             /* But first, have to deal with the highest possible code point on
9450              * the platform.  The previous code assumes that <end> is one
9451              * beyond where we want to populate, but that is impossible at the
9452              * platform's infinity, so have to handle it specially */
9453             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
9454             {
9455                 const STRLEN offset = (STRLEN)(end - start);
9456                 swatch[offset >> 3] |= 1 << (offset & 7);
9457             }
9458             return;
9459         }
9460
9461         /* Advance to the next range, which will be for code points not in the
9462          * inversion list */
9463         current = array[i];
9464     }
9465
9466     return;
9467 }
9468
9469 void
9470 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9471                                          const bool complement_b, SV** output)
9472 {
9473     /* Take the union of two inversion lists and point '*output' to it.  On
9474      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9475      * even 'a' or 'b').  If to an inversion list, the contents of the original
9476      * list will be replaced by the union.  The first list, 'a', may be
9477      * NULL, in which case a copy of the second list is placed in '*output'.
9478      * If 'complement_b' is TRUE, the union is taken of the complement
9479      * (inversion) of 'b' instead of b itself.
9480      *
9481      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9482      * Richard Gillam, published by Addison-Wesley, and explained at some
9483      * length there.  The preface says to incorporate its examples into your
9484      * code at your own risk.
9485      *
9486      * The algorithm is like a merge sort. */
9487
9488     const UV* array_a;    /* a's array */
9489     const UV* array_b;
9490     UV len_a;       /* length of a's array */
9491     UV len_b;
9492
9493     SV* u;                      /* the resulting union */
9494     UV* array_u;
9495     UV len_u = 0;
9496
9497     UV i_a = 0;             /* current index into a's array */
9498     UV i_b = 0;
9499     UV i_u = 0;
9500
9501     /* running count, as explained in the algorithm source book; items are
9502      * stopped accumulating and are output when the count changes to/from 0.
9503      * The count is incremented when we start a range that's in an input's set,
9504      * and decremented when we start a range that's not in a set.  So this
9505      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9506      * and hence nothing goes into the union; 1, just one of the inputs is in
9507      * its set (and its current range gets added to the union); and 2 when both
9508      * inputs are in their sets.  */
9509     UV count = 0;
9510
9511     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9512     assert(a != b);
9513     assert(*output == NULL || is_invlist(*output));
9514
9515     len_b = _invlist_len(b);
9516     if (len_b == 0) {
9517
9518         /* Here, 'b' is empty, hence it's complement is all possible code
9519          * points.  So if the union includes the complement of 'b', it includes
9520          * everything, and we need not even look at 'a'.  It's easiest to
9521          * create a new inversion list that matches everything.  */
9522         if (complement_b) {
9523             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9524
9525             if (*output == NULL) { /* If the output didn't exist, just point it
9526                                       at the new list */
9527                 *output = everything;
9528             }
9529             else { /* Otherwise, replace its contents with the new list */
9530                 invlist_replace_list_destroys_src(*output, everything);
9531                 SvREFCNT_dec_NN(everything);
9532             }
9533
9534             return;
9535         }
9536
9537         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9538          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9539          * output will be empty */
9540
9541         if (a == NULL || _invlist_len(a) == 0) {
9542             if (*output == NULL) {
9543                 *output = _new_invlist(0);
9544             }
9545             else {
9546                 invlist_clear(*output);
9547             }
9548             return;
9549         }
9550
9551         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9552          * union.  We can just return a copy of 'a' if '*output' doesn't point
9553          * to an existing list */
9554         if (*output == NULL) {
9555             *output = invlist_clone(a, NULL);
9556             return;
9557         }
9558
9559         /* If the output is to overwrite 'a', we have a no-op, as it's
9560          * already in 'a' */
9561         if (*output == a) {
9562             return;
9563         }
9564
9565         /* Here, '*output' is to be overwritten by 'a' */
9566         u = invlist_clone(a, NULL);
9567         invlist_replace_list_destroys_src(*output, u);
9568         SvREFCNT_dec_NN(u);
9569
9570         return;
9571     }
9572
9573     /* Here 'b' is not empty.  See about 'a' */
9574
9575     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9576
9577         /* Here, 'a' is empty (and b is not).  That means the union will come
9578          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9579          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9580          * the clone */
9581
9582         SV ** dest = (*output == NULL) ? output : &u;
9583         *dest = invlist_clone(b, NULL);
9584         if (complement_b) {
9585             _invlist_invert(*dest);
9586         }
9587
9588         if (dest == &u) {
9589             invlist_replace_list_destroys_src(*output, u);
9590             SvREFCNT_dec_NN(u);
9591         }
9592
9593         return;
9594     }
9595
9596     /* Here both lists exist and are non-empty */
9597     array_a = invlist_array(a);
9598     array_b = invlist_array(b);
9599
9600     /* If are to take the union of 'a' with the complement of b, set it
9601      * up so are looking at b's complement. */
9602     if (complement_b) {
9603
9604         /* To complement, we invert: if the first element is 0, remove it.  To
9605          * do this, we just pretend the array starts one later */
9606         if (array_b[0] == 0) {
9607             array_b++;
9608             len_b--;
9609         }
9610         else {
9611
9612             /* But if the first element is not zero, we pretend the list starts
9613              * at the 0 that is always stored immediately before the array. */
9614             array_b--;
9615             len_b++;
9616         }
9617     }
9618
9619     /* Size the union for the worst case: that the sets are completely
9620      * disjoint */
9621     u = _new_invlist(len_a + len_b);
9622
9623     /* Will contain U+0000 if either component does */
9624     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9625                                       || (len_b > 0 && array_b[0] == 0));
9626
9627     /* Go through each input list item by item, stopping when have exhausted
9628      * one of them */
9629     while (i_a < len_a && i_b < len_b) {
9630         UV cp;      /* The element to potentially add to the union's array */
9631         bool cp_in_set;   /* is it in the the input list's set or not */
9632
9633         /* We need to take one or the other of the two inputs for the union.
9634          * Since we are merging two sorted lists, we take the smaller of the
9635          * next items.  In case of a tie, we take first the one that is in its
9636          * set.  If we first took the one not in its set, it would decrement
9637          * the count, possibly to 0 which would cause it to be output as ending
9638          * the range, and the next time through we would take the same number,
9639          * and output it again as beginning the next range.  By doing it the
9640          * opposite way, there is no possibility that the count will be
9641          * momentarily decremented to 0, and thus the two adjoining ranges will
9642          * be seamlessly merged.  (In a tie and both are in the set or both not
9643          * in the set, it doesn't matter which we take first.) */
9644         if (       array_a[i_a] < array_b[i_b]
9645             || (   array_a[i_a] == array_b[i_b]
9646                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9647         {
9648             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9649             cp = array_a[i_a++];
9650         }
9651         else {
9652             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9653             cp = array_b[i_b++];
9654         }
9655
9656         /* Here, have chosen which of the two inputs to look at.  Only output
9657          * if the running count changes to/from 0, which marks the
9658          * beginning/end of a range that's in the set */
9659         if (cp_in_set) {
9660             if (count == 0) {
9661                 array_u[i_u++] = cp;
9662             }
9663             count++;
9664         }
9665         else {
9666             count--;
9667             if (count == 0) {
9668                 array_u[i_u++] = cp;
9669             }
9670         }
9671     }
9672
9673
9674     /* The loop above increments the index into exactly one of the input lists
9675      * each iteration, and ends when either index gets to its list end.  That
9676      * means the other index is lower than its end, and so something is
9677      * remaining in that one.  We decrement 'count', as explained below, if
9678      * that list is in its set.  (i_a and i_b each currently index the element
9679      * beyond the one we care about.) */
9680     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9681         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9682     {
9683         count--;
9684     }
9685
9686     /* Above we decremented 'count' if the list that had unexamined elements in
9687      * it was in its set.  This has made it so that 'count' being non-zero
9688      * means there isn't anything left to output; and 'count' equal to 0 means
9689      * that what is left to output is precisely that which is left in the
9690      * non-exhausted input list.
9691      *
9692      * To see why, note first that the exhausted input obviously has nothing
9693      * left to add to the union.  If it was in its set at its end, that means
9694      * the set extends from here to the platform's infinity, and hence so does
9695      * the union and the non-exhausted set is irrelevant.  The exhausted set
9696      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9697      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9698      * 'count' remains at 1.  This is consistent with the decremented 'count'
9699      * != 0 meaning there's nothing left to add to the union.
9700      *
9701      * But if the exhausted input wasn't in its set, it contributed 0 to
9702      * 'count', and the rest of the union will be whatever the other input is.
9703      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9704      * otherwise it gets decremented to 0.  This is consistent with 'count'
9705      * == 0 meaning the remainder of the union is whatever is left in the
9706      * non-exhausted list. */
9707     if (count != 0) {
9708         len_u = i_u;
9709     }
9710     else {
9711         IV copy_count = len_a - i_a;
9712         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9713             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9714         }
9715         else { /* The non-exhausted input is b */
9716             copy_count = len_b - i_b;
9717             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9718         }
9719         len_u = i_u + copy_count;
9720     }
9721
9722     /* Set the result to the final length, which can change the pointer to
9723      * array_u, so re-find it.  (Note that it is unlikely that this will
9724      * change, as we are shrinking the space, not enlarging it) */
9725     if (len_u != _invlist_len(u)) {
9726         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9727         invlist_trim(u);
9728         array_u = invlist_array(u);
9729     }
9730
9731     if (*output == NULL) {  /* Simply return the new inversion list */
9732         *output = u;
9733     }
9734     else {
9735         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9736          * could instead free '*output', and then set it to 'u', but experience
9737          * has shown [perl #127392] that if the input is a mortal, we can get a
9738          * huge build-up of these during regex compilation before they get
9739          * freed. */
9740         invlist_replace_list_destroys_src(*output, u);
9741         SvREFCNT_dec_NN(u);
9742     }
9743
9744     return;
9745 }
9746
9747 void
9748 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9749                                                const bool complement_b, SV** i)
9750 {
9751     /* Take the intersection of two inversion lists and point '*i' to it.  On
9752      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9753      * even 'a' or 'b').  If to an inversion list, the contents of the original
9754      * list will be replaced by the intersection.  The first list, 'a', may be
9755      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9756      * TRUE, the result will be the intersection of 'a' and the complement (or
9757      * inversion) of 'b' instead of 'b' directly.
9758      *
9759      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9760      * Richard Gillam, published by Addison-Wesley, and explained at some
9761      * length there.  The preface says to incorporate its examples into your
9762      * code at your own risk.  In fact, it had bugs
9763      *
9764      * The algorithm is like a merge sort, and is essentially the same as the
9765      * union above
9766      */
9767
9768     const UV* array_a;          /* a's array */
9769     const UV* array_b;
9770     UV len_a;   /* length of a's array */
9771     UV len_b;
9772
9773     SV* r;                   /* the resulting intersection */
9774     UV* array_r;
9775     UV len_r = 0;
9776
9777     UV i_a = 0;             /* current index into a's array */
9778     UV i_b = 0;
9779     UV i_r = 0;
9780
9781     /* running count of how many of the two inputs are postitioned at ranges
9782      * that are in their sets.  As explained in the algorithm source book,
9783      * items are stopped accumulating and are output when the count changes
9784      * to/from 2.  The count is incremented when we start a range that's in an
9785      * input's set, and decremented when we start a range that's not in a set.
9786      * Only when it is 2 are we in the intersection. */
9787     UV count = 0;
9788
9789     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9790     assert(a != b);
9791     assert(*i == NULL || is_invlist(*i));
9792
9793     /* Special case if either one is empty */
9794     len_a = (a == NULL) ? 0 : _invlist_len(a);
9795     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9796         if (len_a != 0 && complement_b) {
9797
9798             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9799              * must be empty.  Here, also we are using 'b's complement, which
9800              * hence must be every possible code point.  Thus the intersection
9801              * is simply 'a'. */
9802
9803             if (*i == a) {  /* No-op */
9804                 return;
9805             }
9806
9807             if (*i == NULL) {
9808                 *i = invlist_clone(a, NULL);
9809                 return;
9810             }
9811
9812             r = invlist_clone(a, NULL);
9813             invlist_replace_list_destroys_src(*i, r);
9814             SvREFCNT_dec_NN(r);
9815             return;
9816         }
9817
9818         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9819          * intersection must be empty */
9820         if (*i == NULL) {
9821             *i = _new_invlist(0);
9822             return;
9823         }
9824
9825         invlist_clear(*i);
9826         return;
9827     }
9828
9829     /* Here both lists exist and are non-empty */
9830     array_a = invlist_array(a);
9831     array_b = invlist_array(b);
9832
9833     /* If are to take the intersection of 'a' with the complement of b, set it
9834      * up so are looking at b's complement. */
9835     if (complement_b) {
9836
9837         /* To complement, we invert: if the first element is 0, remove it.  To
9838          * do this, we just pretend the array starts one later */
9839         if (array_b[0] == 0) {
9840             array_b++;
9841             len_b--;
9842         }
9843         else {
9844
9845             /* But if the first element is not zero, we pretend the list starts
9846              * at the 0 that is always stored immediately before the array. */
9847             array_b--;
9848             len_b++;
9849         }
9850     }
9851
9852     /* Size the intersection for the worst case: that the intersection ends up
9853      * fragmenting everything to be completely disjoint */
9854     r= _new_invlist(len_a + len_b);
9855
9856     /* Will contain U+0000 iff both components do */
9857     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9858                                      && len_b > 0 && array_b[0] == 0);
9859
9860     /* Go through each list item by item, stopping when have exhausted one of
9861      * them */
9862     while (i_a < len_a && i_b < len_b) {
9863         UV cp;      /* The element to potentially add to the intersection's
9864                        array */
9865         bool cp_in_set; /* Is it in the input list's set or not */
9866
9867         /* We need to take one or the other of the two inputs for the
9868          * intersection.  Since we are merging two sorted lists, we take the
9869          * smaller of the next items.  In case of a tie, we take first the one
9870          * that is not in its set (a difference from the union algorithm).  If
9871          * we first took the one in its set, it would increment the count,
9872          * possibly to 2 which would cause it to be output as starting a range
9873          * in the intersection, and the next time through we would take that
9874          * same number, and output it again as ending the set.  By doing the
9875          * opposite of this, there is no possibility that the count will be
9876          * momentarily incremented to 2.  (In a tie and both are in the set or
9877          * both not in the set, it doesn't matter which we take first.) */
9878         if (       array_a[i_a] < array_b[i_b]
9879             || (   array_a[i_a] == array_b[i_b]
9880                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9881         {
9882             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9883             cp = array_a[i_a++];
9884         }
9885         else {
9886             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9887             cp= array_b[i_b++];
9888         }
9889
9890         /* Here, have chosen which of the two inputs to look at.  Only output
9891          * if the running count changes to/from 2, which marks the
9892          * beginning/end of a range that's in the intersection */
9893         if (cp_in_set) {
9894             count++;
9895             if (count == 2) {
9896                 array_r[i_r++] = cp;
9897             }
9898         }
9899         else {
9900             if (count == 2) {
9901                 array_r[i_r++] = cp;
9902             }
9903             count--;
9904         }
9905
9906     }
9907
9908     /* The loop above increments the index into exactly one of the input lists
9909      * each iteration, and ends when either index gets to its list end.  That
9910      * means the other index is lower than its end, and so something is
9911      * remaining in that one.  We increment 'count', as explained below, if the
9912      * exhausted list was in its set.  (i_a and i_b each currently index the
9913      * element beyond the one we care about.) */
9914     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9915         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9916     {
9917         count++;
9918     }
9919
9920     /* Above we incremented 'count' if the exhausted list was in its set.  This
9921      * has made it so that 'count' being below 2 means there is nothing left to
9922      * output; otheriwse what's left to add to the intersection is precisely
9923      * that which is left in the non-exhausted input list.
9924      *
9925      * To see why, note first that the exhausted input obviously has nothing
9926      * left to affect the intersection.  If it was in its set at its end, that
9927      * means the set extends from here to the platform's infinity, and hence
9928      * anything in the non-exhausted's list will be in the intersection, and
9929      * anything not in it won't be.  Hence, the rest of the intersection is
9930      * precisely what's in the non-exhausted list  The exhausted set also
9931      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
9932      * it means 'count' is now at least 2.  This is consistent with the
9933      * incremented 'count' being >= 2 means to add the non-exhausted list to
9934      * the intersection.
9935      *
9936      * But if the exhausted input wasn't in its set, it contributed 0 to
9937      * 'count', and the intersection can't include anything further; the
9938      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
9939      * incremented.  This is consistent with 'count' being < 2 meaning nothing
9940      * further to add to the intersection. */
9941     if (count < 2) { /* Nothing left to put in the intersection. */
9942         len_r = i_r;
9943     }
9944     else { /* copy the non-exhausted list, unchanged. */
9945         IV copy_count = len_a - i_a;
9946         if (copy_count > 0) {   /* a is the one with stuff left */
9947             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9948         }
9949         else {  /* b is the one with stuff left */
9950             copy_count = len_b - i_b;
9951             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9952         }
9953         len_r = i_r + copy_count;
9954     }
9955
9956     /* Set the result to the final length, which can change the pointer to
9957      * array_r, so re-find it.  (Note that it is unlikely that this will
9958      * change, as we are shrinking the space, not enlarging it) */
9959     if (len_r != _invlist_len(r)) {
9960         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9961         invlist_trim(r);
9962         array_r = invlist_array(r);
9963     }
9964
9965     if (*i == NULL) { /* Simply return the calculated intersection */
9966         *i = r;
9967     }
9968     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
9969               instead free '*i', and then set it to 'r', but experience has
9970               shown [perl #127392] that if the input is a mortal, we can get a
9971               huge build-up of these during regex compilation before they get
9972               freed. */
9973         if (len_r) {
9974             invlist_replace_list_destroys_src(*i, r);
9975         }
9976         else {
9977             invlist_clear(*i);
9978         }
9979         SvREFCNT_dec_NN(r);
9980     }
9981
9982     return;
9983 }
9984
9985 SV*
9986 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9987 {
9988     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9989      * set.  A pointer to the inversion list is returned.  This may actually be
9990      * a new list, in which case the passed in one has been destroyed.  The
9991      * passed-in inversion list can be NULL, in which case a new one is created
9992      * with just the one range in it.  The new list is not necessarily
9993      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
9994      * result of this function.  The gain would not be large, and in many
9995      * cases, this is called multiple times on a single inversion list, so
9996      * anything freed may almost immediately be needed again.
9997      *
9998      * This used to mostly call the 'union' routine, but that is much more
9999      * heavyweight than really needed for a single range addition */
10000
10001     UV* array;              /* The array implementing the inversion list */
10002     UV len;                 /* How many elements in 'array' */
10003     SSize_t i_s;            /* index into the invlist array where 'start'
10004                                should go */
10005     SSize_t i_e = 0;        /* And the index where 'end' should go */
10006     UV cur_highest;         /* The highest code point in the inversion list
10007                                upon entry to this function */
10008
10009     /* This range becomes the whole inversion list if none already existed */
10010     if (invlist == NULL) {
10011         invlist = _new_invlist(2);
10012         _append_range_to_invlist(invlist, start, end);
10013         return invlist;
10014     }
10015
10016     /* Likewise, if the inversion list is currently empty */
10017     len = _invlist_len(invlist);
10018     if (len == 0) {
10019         _append_range_to_invlist(invlist, start, end);
10020         return invlist;
10021     }
10022
10023     /* Starting here, we have to know the internals of the list */
10024     array = invlist_array(invlist);
10025
10026     /* If the new range ends higher than the current highest ... */
10027     cur_highest = invlist_highest(invlist);
10028     if (end > cur_highest) {
10029
10030         /* If the whole range is higher, we can just append it */
10031         if (start > cur_highest) {
10032             _append_range_to_invlist(invlist, start, end);
10033             return invlist;
10034         }
10035
10036         /* Otherwise, add the portion that is higher ... */
10037         _append_range_to_invlist(invlist, cur_highest + 1, end);
10038
10039         /* ... and continue on below to handle the rest.  As a result of the
10040          * above append, we know that the index of the end of the range is the
10041          * final even numbered one of the array.  Recall that the final element
10042          * always starts a range that extends to infinity.  If that range is in
10043          * the set (meaning the set goes from here to infinity), it will be an
10044          * even index, but if it isn't in the set, it's odd, and the final
10045          * range in the set is one less, which is even. */
10046         if (end == UV_MAX) {
10047             i_e = len;
10048         }
10049         else {
10050             i_e = len - 2;
10051         }
10052     }
10053
10054     /* We have dealt with appending, now see about prepending.  If the new
10055      * range starts lower than the current lowest ... */
10056     if (start < array[0]) {
10057
10058         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10059          * Let the union code handle it, rather than having to know the
10060          * trickiness in two code places.  */
10061         if (UNLIKELY(start == 0)) {
10062             SV* range_invlist;
10063
10064             range_invlist = _new_invlist(2);
10065             _append_range_to_invlist(range_invlist, start, end);
10066
10067             _invlist_union(invlist, range_invlist, &invlist);
10068
10069             SvREFCNT_dec_NN(range_invlist);
10070
10071             return invlist;
10072         }
10073
10074         /* If the whole new range comes before the first entry, and doesn't
10075          * extend it, we have to insert it as an additional range */
10076         if (end < array[0] - 1) {
10077             i_s = i_e = -1;
10078             goto splice_in_new_range;
10079         }
10080
10081         /* Here the new range adjoins the existing first range, extending it
10082          * downwards. */
10083         array[0] = start;
10084
10085         /* And continue on below to handle the rest.  We know that the index of
10086          * the beginning of the range is the first one of the array */
10087         i_s = 0;
10088     }
10089     else { /* Not prepending any part of the new range to the existing list.
10090             * Find where in the list it should go.  This finds i_s, such that:
10091             *     invlist[i_s] <= start < array[i_s+1]
10092             */
10093         i_s = _invlist_search(invlist, start);
10094     }
10095
10096     /* At this point, any extending before the beginning of the inversion list
10097      * and/or after the end has been done.  This has made it so that, in the
10098      * code below, each endpoint of the new range is either in a range that is
10099      * in the set, or is in a gap between two ranges that are.  This means we
10100      * don't have to worry about exceeding the array bounds.
10101      *
10102      * Find where in the list the new range ends (but we can skip this if we
10103      * have already determined what it is, or if it will be the same as i_s,
10104      * which we already have computed) */
10105     if (i_e == 0) {
10106         i_e = (start == end)
10107               ? i_s
10108               : _invlist_search(invlist, end);
10109     }
10110
10111     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
10112      * is a range that goes to infinity there is no element at invlist[i_e+1],
10113      * so only the first relation holds. */
10114
10115     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10116
10117         /* Here, the ranges on either side of the beginning of the new range
10118          * are in the set, and this range starts in the gap between them.
10119          *
10120          * The new range extends the range above it downwards if the new range
10121          * ends at or above that range's start */
10122         const bool extends_the_range_above = (   end == UV_MAX
10123                                               || end + 1 >= array[i_s+1]);
10124
10125         /* The new range extends the range below it upwards if it begins just
10126          * after where that range ends */
10127         if (start == array[i_s]) {
10128
10129             /* If the new range fills the entire gap between the other ranges,
10130              * they will get merged together.  Other ranges may also get
10131              * merged, depending on how many of them the new range spans.  In
10132              * the general case, we do the merge later, just once, after we
10133              * figure out how many to merge.  But in the case where the new
10134              * range exactly spans just this one gap (possibly extending into
10135              * the one above), we do the merge here, and an early exit.  This
10136              * is done here to avoid having to special case later. */
10137             if (i_e - i_s <= 1) {
10138
10139                 /* If i_e - i_s == 1, it means that the new range terminates
10140                  * within the range above, and hence 'extends_the_range_above'
10141                  * must be true.  (If the range above it extends to infinity,
10142                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10143                  * will be 0, so no harm done.) */
10144                 if (extends_the_range_above) {
10145                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10146                     invlist_set_len(invlist,
10147                                     len - 2,
10148                                     *(get_invlist_offset_addr(invlist)));
10149                     return invlist;
10150                 }
10151
10152                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
10153                  * to the same range, and below we are about to decrement i_s
10154                  * */
10155                 i_e--;
10156             }
10157
10158             /* Here, the new range is adjacent to the one below.  (It may also
10159              * span beyond the range above, but that will get resolved later.)
10160              * Extend the range below to include this one. */
10161             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10162             i_s--;
10163             start = array[i_s];
10164         }
10165         else if (extends_the_range_above) {
10166
10167             /* Here the new range only extends the range above it, but not the
10168              * one below.  It merges with the one above.  Again, we keep i_e
10169              * and i_s in sync if they point to the same range */
10170             if (i_e == i_s) {
10171                 i_e++;
10172             }
10173             i_s++;
10174             array[i_s] = start;
10175         }
10176     }
10177
10178     /* Here, we've dealt with the new range start extending any adjoining
10179      * existing ranges.
10180      *
10181      * If the new range extends to infinity, it is now the final one,
10182      * regardless of what was there before */
10183     if (UNLIKELY(end == UV_MAX)) {
10184         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10185         return invlist;
10186     }
10187
10188     /* If i_e started as == i_s, it has also been dealt with,
10189      * and been updated to the new i_s, which will fail the following if */
10190     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10191
10192         /* Here, the ranges on either side of the end of the new range are in
10193          * the set, and this range ends in the gap between them.
10194          *
10195          * If this range is adjacent to (hence extends) the range above it, it
10196          * becomes part of that range; likewise if it extends the range below,
10197          * it becomes part of that range */
10198         if (end + 1 == array[i_e+1]) {
10199             i_e++;
10200             array[i_e] = start;
10201         }
10202         else if (start <= array[i_e]) {
10203             array[i_e] = end + 1;
10204             i_e--;
10205         }
10206     }
10207
10208     if (i_s == i_e) {
10209
10210         /* If the range fits entirely in an existing range (as possibly already
10211          * extended above), it doesn't add anything new */
10212         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10213             return invlist;
10214         }
10215
10216         /* Here, no part of the range is in the list.  Must add it.  It will
10217          * occupy 2 more slots */
10218       splice_in_new_range:
10219
10220         invlist_extend(invlist, len + 2);
10221         array = invlist_array(invlist);
10222         /* Move the rest of the array down two slots. Don't include any
10223          * trailing NUL */
10224         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10225
10226         /* Do the actual splice */
10227         array[i_e+1] = start;
10228         array[i_e+2] = end + 1;
10229         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10230         return invlist;
10231     }
10232
10233     /* Here the new range crossed the boundaries of a pre-existing range.  The
10234      * code above has adjusted things so that both ends are in ranges that are
10235      * in the set.  This means everything in between must also be in the set.
10236      * Just squash things together */
10237     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10238     invlist_set_len(invlist,
10239                     len - i_e + i_s,
10240                     *(get_invlist_offset_addr(invlist)));
10241
10242     return invlist;
10243 }
10244
10245 SV*
10246 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10247                                  UV** other_elements_ptr)
10248 {
10249     /* Create and return an inversion list whose contents are to be populated
10250      * by the caller.  The caller gives the number of elements (in 'size') and
10251      * the very first element ('element0').  This function will set
10252      * '*other_elements_ptr' to an array of UVs, where the remaining elements
10253      * are to be placed.
10254      *
10255      * Obviously there is some trust involved that the caller will properly
10256      * fill in the other elements of the array.
10257      *
10258      * (The first element needs to be passed in, as the underlying code does
10259      * things differently depending on whether it is zero or non-zero) */
10260
10261     SV* invlist = _new_invlist(size);
10262     bool offset;
10263
10264     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10265
10266     invlist = add_cp_to_invlist(invlist, element0);
10267     offset = *get_invlist_offset_addr(invlist);
10268
10269     invlist_set_len(invlist, size, offset);
10270     *other_elements_ptr = invlist_array(invlist) + 1;
10271     return invlist;
10272 }
10273
10274 #endif
10275
10276 PERL_STATIC_INLINE SV*
10277 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
10278     return _add_range_to_invlist(invlist, cp, cp);
10279 }
10280
10281 #ifndef PERL_IN_XSUB_RE
10282 void
10283 Perl__invlist_invert(pTHX_ SV* const invlist)
10284 {
10285     /* Complement the input inversion list.  This adds a 0 if the list didn't
10286      * have a zero; removes it otherwise.  As described above, the data
10287      * structure is set up so that this is very efficient */
10288
10289     PERL_ARGS_ASSERT__INVLIST_INVERT;
10290
10291     assert(! invlist_is_iterating(invlist));
10292
10293     /* The inverse of matching nothing is matching everything */
10294     if (_invlist_len(invlist) == 0) {
10295         _append_range_to_invlist(invlist, 0, UV_MAX);
10296         return;
10297     }
10298
10299     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10300 }
10301
10302 SV*
10303 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10304 {
10305
10306     /* Return a new inversion list that is a copy of the input one, which is
10307      * unchanged.  The new list will not be mortal even if the old one was. */
10308
10309     const STRLEN nominal_length = _invlist_len(invlist);    /* Why not +1 XXX */
10310     const STRLEN physical_length = SvCUR(invlist);
10311     const bool offset = *(get_invlist_offset_addr(invlist));
10312
10313     PERL_ARGS_ASSERT_INVLIST_CLONE;
10314
10315     /* Need to allocate extra space to accommodate Perl's addition of a
10316      * trailing NUL to SvPV's, since it thinks they are always strings */
10317     if (new_invlist == NULL) {
10318         new_invlist = _new_invlist(nominal_length);
10319     }
10320     else {
10321         sv_upgrade(new_invlist, SVt_INVLIST);
10322         initialize_invlist_guts(new_invlist, nominal_length);
10323     }
10324
10325     *(get_invlist_offset_addr(new_invlist)) = offset;
10326     invlist_set_len(new_invlist, nominal_length, offset);
10327     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10328
10329     return new_invlist;
10330 }
10331
10332 #endif
10333
10334 PERL_STATIC_INLINE STRLEN*
10335 S_get_invlist_iter_addr(SV* invlist)
10336 {
10337     /* Return the address of the UV that contains the current iteration
10338      * position */
10339
10340     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
10341
10342     assert(is_invlist(invlist));
10343
10344     return &(((XINVLIST*) SvANY(invlist))->iterator);
10345 }
10346
10347 PERL_STATIC_INLINE void
10348 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
10349 {
10350     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
10351
10352     *get_invlist_iter_addr(invlist) = 0;
10353 }
10354
10355 PERL_STATIC_INLINE void
10356 S_invlist_iterfinish(SV* invlist)
10357 {
10358     /* Terminate iterator for invlist.  This is to catch development errors.
10359      * Any iteration that is interrupted before completed should call this
10360      * function.  Functions that add code points anywhere else but to the end
10361      * of an inversion list assert that they are not in the middle of an
10362      * iteration.  If they were, the addition would make the iteration
10363      * problematical: if the iteration hadn't reached the place where things
10364      * were being added, it would be ok */
10365
10366     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
10367
10368     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
10369 }
10370
10371 STATIC bool
10372 S_invlist_iternext(SV* invlist, UV* start, UV* end)
10373 {
10374     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
10375      * This call sets in <*start> and <*end>, the next range in <invlist>.
10376      * Returns <TRUE> if successful and the next call will return the next
10377      * range; <FALSE> if was already at the end of the list.  If the latter,
10378      * <*start> and <*end> are unchanged, and the next call to this function
10379      * will start over at the beginning of the list */
10380
10381     STRLEN* pos = get_invlist_iter_addr(invlist);
10382     UV len = _invlist_len(invlist);
10383     UV *array;
10384
10385     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
10386
10387     if (*pos >= len) {
10388         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
10389         return FALSE;
10390     }
10391
10392     array = invlist_array(invlist);
10393
10394     *start = array[(*pos)++];
10395
10396     if (*pos >= len) {
10397         *end = UV_MAX;
10398     }
10399     else {
10400         *end = array[(*pos)++] - 1;
10401     }
10402
10403     return TRUE;
10404 }
10405
10406 PERL_STATIC_INLINE UV
10407 S_invlist_highest(SV* const invlist)
10408 {
10409     /* Returns the highest code point that matches an inversion list.  This API
10410      * has an ambiguity, as it returns 0 under either the highest is actually
10411      * 0, or if the list is empty.  If this distinction matters to you, check
10412      * for emptiness before calling this function */
10413
10414     UV len = _invlist_len(invlist);
10415     UV *array;
10416
10417     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
10418
10419     if (len == 0) {
10420         return 0;
10421     }
10422
10423     array = invlist_array(invlist);
10424
10425     /* The last element in the array in the inversion list always starts a
10426      * range that goes to infinity.  That range may be for code points that are
10427      * matched in the inversion list, or it may be for ones that aren't
10428      * matched.  In the latter case, the highest code point in the set is one
10429      * less than the beginning of this range; otherwise it is the final element
10430      * of this range: infinity */
10431     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
10432            ? UV_MAX
10433            : array[len - 1] - 1;
10434 }
10435
10436 STATIC SV *
10437 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10438 {
10439     /* Get the contents of an inversion list into a string SV so that they can
10440      * be printed out.  If 'traditional_style' is TRUE, it uses the format
10441      * traditionally done for debug tracing; otherwise it uses a format
10442      * suitable for just copying to the output, with blanks between ranges and
10443      * a dash between range components */
10444
10445     UV start, end;
10446     SV* output;
10447     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10448     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10449
10450     if (traditional_style) {
10451         output = newSVpvs("\n");
10452     }
10453     else {
10454         output = newSVpvs("");
10455     }
10456
10457     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10458
10459     assert(! invlist_is_iterating(invlist));
10460
10461     invlist_iterinit(invlist);
10462     while (invlist_iternext(invlist, &start, &end)) {
10463         if (end == UV_MAX) {
10464             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10465                                           start, intra_range_delimiter,
10466                                                  inter_range_delimiter);
10467         }
10468         else if (end != start) {
10469             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10470                                           start,
10471                                                    intra_range_delimiter,
10472                                                   end, inter_range_delimiter);
10473         }
10474         else {
10475             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10476                                           start, inter_range_delimiter);
10477         }
10478     }
10479
10480     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10481         SvCUR_set(output, SvCUR(output) - 1);
10482     }
10483
10484     return output;
10485 }
10486
10487 #ifndef PERL_IN_XSUB_RE
10488 void
10489 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10490                          const char * const indent, SV* const invlist)
10491 {
10492     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10493      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10494      * the string 'indent'.  The output looks like this:
10495          [0] 0x000A .. 0x000D
10496          [2] 0x0085
10497          [4] 0x2028 .. 0x2029
10498          [6] 0x3104 .. INFTY
10499      * This means that the first range of code points matched by the list are
10500      * 0xA through 0xD; the second range contains only the single code point
10501      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10502      * are used to define each range (except if the final range extends to
10503      * infinity, only a single element is needed).  The array index of the
10504      * first element for the corresponding range is given in brackets. */
10505
10506     UV start, end;
10507     STRLEN count = 0;
10508
10509     PERL_ARGS_ASSERT__INVLIST_DUMP;
10510
10511     if (invlist_is_iterating(invlist)) {
10512         Perl_dump_indent(aTHX_ level, file,
10513              "%sCan't dump inversion list because is in middle of iterating\n",
10514              indent);
10515         return;
10516     }
10517
10518     invlist_iterinit(invlist);
10519     while (invlist_iternext(invlist, &start, &end)) {
10520         if (end == UV_MAX) {
10521             Perl_dump_indent(aTHX_ level, file,
10522                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10523                                    indent, (UV)count, start);
10524         }
10525         else if (end != start) {
10526             Perl_dump_indent(aTHX_ level, file,
10527                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10528                                 indent, (UV)count, start,         end);
10529         }
10530         else {
10531             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10532                                             indent, (UV)count, start);
10533         }
10534         count += 2;
10535     }
10536 }
10537
10538 #endif
10539
10540 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10541 bool
10542 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10543 {
10544     /* Return a boolean as to if the two passed in inversion lists are
10545      * identical.  The final argument, if TRUE, says to take the complement of
10546      * the second inversion list before doing the comparison */
10547
10548     const UV len_a = _invlist_len(a);
10549     UV len_b = _invlist_len(b);
10550
10551     const UV* array_a = NULL;
10552     const UV* array_b = NULL;
10553
10554     PERL_ARGS_ASSERT__INVLISTEQ;
10555
10556     /* This code avoids accessing the arrays unless it knows the length is
10557      * non-zero */
10558
10559     if (len_a == 0) {
10560         if (len_b == 0) {
10561             return ! complement_b;
10562         }
10563     }
10564     else {
10565         array_a = invlist_array(a);
10566     }
10567
10568     if (len_b != 0) {
10569         array_b = invlist_array(b);
10570     }
10571
10572     /* If are to compare 'a' with the complement of b, set it
10573      * up so are looking at b's complement. */
10574     if (complement_b) {
10575
10576         /* The complement of nothing is everything, so <a> would have to have
10577          * just one element, starting at zero (ending at infinity) */
10578         if (len_b == 0) {
10579             return (len_a == 1 && array_a[0] == 0);
10580         }
10581         if (array_b[0] == 0) {
10582
10583             /* Otherwise, to complement, we invert.  Here, the first element is
10584              * 0, just remove it.  To do this, we just pretend the array starts
10585              * one later */
10586
10587             array_b++;
10588             len_b--;
10589         }
10590         else {
10591
10592             /* But if the first element is not zero, we pretend the list starts
10593              * at the 0 that is always stored immediately before the array. */
10594             array_b--;
10595             len_b++;
10596         }
10597     }
10598
10599     return    len_a == len_b
10600            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10601
10602 }
10603 #endif
10604
10605 /*
10606  * As best we can, determine the characters that can match the start of
10607  * the given EXACTF-ish node.
10608  *
10609  * Returns the invlist as a new SV*; it is the caller's responsibility to
10610  * call SvREFCNT_dec() when done with it.
10611  */
10612 STATIC SV*
10613 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10614 {
10615     const U8 * s = (U8*)STRING(node);
10616     SSize_t bytelen = STR_LEN(node);
10617     UV uc;
10618     /* Start out big enough for 2 separate code points */
10619     SV* invlist = _new_invlist(4);
10620
10621     PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
10622
10623     if (! UTF) {
10624         uc = *s;
10625
10626         /* We punt and assume can match anything if the node begins
10627          * with a multi-character fold.  Things are complicated.  For
10628          * example, /ffi/i could match any of:
10629          *  "\N{LATIN SMALL LIGATURE FFI}"
10630          *  "\N{LATIN SMALL LIGATURE FF}I"
10631          *  "F\N{LATIN SMALL LIGATURE FI}"
10632          *  plus several other things; and making sure we have all the
10633          *  possibilities is hard. */
10634         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10635             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10636         }
10637         else {
10638             /* Any Latin1 range character can potentially match any
10639              * other depending on the locale */
10640             if (OP(node) == EXACTFL) {
10641                 _invlist_union(invlist, PL_Latin1, &invlist);
10642             }
10643             else {
10644                 /* But otherwise, it matches at least itself.  We can
10645                  * quickly tell if it has a distinct fold, and if so,
10646                  * it matches that as well */
10647                 invlist = add_cp_to_invlist(invlist, uc);
10648                 if (IS_IN_SOME_FOLD_L1(uc))
10649                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10650             }
10651
10652             /* Some characters match above-Latin1 ones under /i.  This
10653              * is true of EXACTFL ones when the locale is UTF-8 */
10654             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10655                 && (! isASCII(uc) || (OP(node) != EXACTFAA
10656                                     && OP(node) != EXACTFAA_NO_TRIE)))
10657             {
10658                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10659             }
10660         }
10661     }
10662     else {  /* Pattern is UTF-8 */
10663         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10664         const U8* e = s + bytelen;
10665         IV fc;
10666
10667         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10668
10669         /* The only code points that aren't folded in a UTF EXACTFish
10670          * node are are the problematic ones in EXACTFL nodes */
10671         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10672             /* We need to check for the possibility that this EXACTFL
10673              * node begins with a multi-char fold.  Therefore we fold
10674              * the first few characters of it so that we can make that
10675              * check */
10676             U8 *d = folded;
10677             int i;
10678
10679             fc = -1;
10680             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10681                 if (isASCII(*s)) {
10682                     *(d++) = (U8) toFOLD(*s);
10683                     if (fc < 0) {       /* Save the first fold */
10684                         fc = *(d-1);
10685                     }
10686                     s++;
10687                 }
10688                 else {
10689                     STRLEN len;
10690                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
10691                     if (fc < 0) {       /* Save the first fold */
10692                         fc = fold;
10693                     }
10694                     d += len;
10695                     s += UTF8SKIP(s);
10696                 }
10697             }
10698
10699             /* And set up so the code below that looks in this folded
10700              * buffer instead of the node's string */
10701             e = d;
10702             s = folded;
10703         }
10704
10705         /* When we reach here 's' points to the fold of the first
10706          * character(s) of the node; and 'e' points to far enough along
10707          * the folded string to be just past any possible multi-char
10708          * fold.
10709          *
10710          * Unlike the non-UTF-8 case, the macro for determining if a
10711          * string is a multi-char fold requires all the characters to
10712          * already be folded.  This is because of all the complications
10713          * if not.  Note that they are folded anyway, except in EXACTFL
10714          * nodes.  Like the non-UTF case above, we punt if the node
10715          * begins with a multi-char fold  */
10716
10717         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10718             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10719         }
10720         else {  /* Single char fold */
10721             unsigned int k;
10722             unsigned int first_fold;
10723             const unsigned int * remaining_folds;
10724             Size_t folds_count;
10725
10726             /* It matches itself */
10727             invlist = add_cp_to_invlist(invlist, fc);
10728
10729             /* ... plus all the things that fold to it, which are found in
10730              * PL_utf8_foldclosures */
10731             folds_count = _inverse_folds(fc, &first_fold,
10732                                                 &remaining_folds);
10733             for (k = 0; k < folds_count; k++) {
10734                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10735
10736                 /* /aa doesn't allow folds between ASCII and non- */
10737                 if (   (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10738                     && isASCII(c) != isASCII(fc))
10739                 {
10740                     continue;
10741                 }
10742
10743                 invlist = add_cp_to_invlist(invlist, c);
10744             }
10745         }
10746     }
10747
10748     return invlist;
10749 }
10750
10751 #undef HEADER_LENGTH
10752 #undef TO_INTERNAL_SIZE
10753 #undef FROM_INTERNAL_SIZE
10754 #undef INVLIST_VERSION_ID
10755
10756 /* End of inversion list object */
10757
10758 STATIC void
10759 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10760 {
10761     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10762      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10763      * should point to the first flag; it is updated on output to point to the
10764      * final ')' or ':'.  There needs to be at least one flag, or this will
10765      * abort */
10766
10767     /* for (?g), (?gc), and (?o) warnings; warning
10768        about (?c) will warn about (?g) -- japhy    */
10769
10770 #define WASTED_O  0x01
10771 #define WASTED_G  0x02
10772 #define WASTED_C  0x04
10773 #define WASTED_GC (WASTED_G|WASTED_C)
10774     I32 wastedflags = 0x00;
10775     U32 posflags = 0, negflags = 0;
10776     U32 *flagsp = &posflags;
10777     char has_charset_modifier = '\0';
10778     regex_charset cs;
10779     bool has_use_defaults = FALSE;
10780     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10781     int x_mod_count = 0;
10782
10783     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10784
10785     /* '^' as an initial flag sets certain defaults */
10786     if (UCHARAT(RExC_parse) == '^') {
10787         RExC_parse++;
10788         has_use_defaults = TRUE;
10789         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10790         cs = (RExC_uni_semantics)
10791              ? REGEX_UNICODE_CHARSET
10792              : REGEX_DEPENDS_CHARSET;
10793         set_regex_charset(&RExC_flags, cs);
10794     }
10795     else {
10796         cs = get_regex_charset(RExC_flags);
10797         if (   cs == REGEX_DEPENDS_CHARSET
10798             && RExC_uni_semantics)
10799         {
10800             cs = REGEX_UNICODE_CHARSET;
10801         }
10802     }
10803
10804     while (RExC_parse < RExC_end) {
10805         /* && strchr("iogcmsx", *RExC_parse) */
10806         /* (?g), (?gc) and (?o) are useless here
10807            and must be globally applied -- japhy */
10808         switch (*RExC_parse) {
10809
10810             /* Code for the imsxn flags */
10811             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10812
10813             case LOCALE_PAT_MOD:
10814                 if (has_charset_modifier) {
10815                     goto excess_modifier;
10816                 }
10817                 else if (flagsp == &negflags) {
10818                     goto neg_modifier;
10819                 }
10820                 cs = REGEX_LOCALE_CHARSET;
10821                 has_charset_modifier = LOCALE_PAT_MOD;
10822                 break;
10823             case UNICODE_PAT_MOD:
10824                 if (has_charset_modifier) {
10825                     goto excess_modifier;
10826                 }
10827                 else if (flagsp == &negflags) {
10828                     goto neg_modifier;
10829                 }
10830                 cs = REGEX_UNICODE_CHARSET;
10831                 has_charset_modifier = UNICODE_PAT_MOD;
10832                 break;
10833             case ASCII_RESTRICT_PAT_MOD:
10834                 if (flagsp == &negflags) {
10835                     goto neg_modifier;
10836                 }
10837                 if (has_charset_modifier) {
10838                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10839                         goto excess_modifier;
10840                     }
10841                     /* Doubled modifier implies more restricted */
10842                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10843                 }
10844                 else {
10845                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10846                 }
10847                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10848                 break;
10849             case DEPENDS_PAT_MOD:
10850                 if (has_use_defaults) {
10851                     goto fail_modifiers;
10852                 }
10853                 else if (flagsp == &negflags) {
10854                     goto neg_modifier;
10855                 }
10856                 else if (has_charset_modifier) {
10857                     goto excess_modifier;
10858                 }
10859
10860                 /* The dual charset means unicode semantics if the
10861                  * pattern (or target, not known until runtime) are
10862                  * utf8, or something in the pattern indicates unicode
10863                  * semantics */
10864                 cs = (RExC_uni_semantics)
10865                      ? REGEX_UNICODE_CHARSET
10866                      : REGEX_DEPENDS_CHARSET;
10867                 has_charset_modifier = DEPENDS_PAT_MOD;
10868                 break;
10869               excess_modifier:
10870                 RExC_parse++;
10871                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10872                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10873                 }
10874                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10875                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10876                                         *(RExC_parse - 1));
10877                 }
10878                 else {
10879                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10880                 }
10881                 NOT_REACHED; /*NOTREACHED*/
10882               neg_modifier:
10883                 RExC_parse++;
10884                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10885                                     *(RExC_parse - 1));
10886                 NOT_REACHED; /*NOTREACHED*/
10887             case ONCE_PAT_MOD: /* 'o' */
10888             case GLOBAL_PAT_MOD: /* 'g' */
10889                 if (ckWARN(WARN_REGEXP)) {
10890                     const I32 wflagbit = *RExC_parse == 'o'
10891                                          ? WASTED_O
10892                                          : WASTED_G;
10893                     if (! (wastedflags & wflagbit) ) {
10894                         wastedflags |= wflagbit;
10895                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10896                         vWARN5(
10897                             RExC_parse + 1,
10898                             "Useless (%s%c) - %suse /%c modifier",
10899                             flagsp == &negflags ? "?-" : "?",
10900                             *RExC_parse,
10901                             flagsp == &negflags ? "don't " : "",
10902                             *RExC_parse
10903                         );
10904                     }
10905                 }
10906                 break;
10907
10908             case CONTINUE_PAT_MOD: /* 'c' */
10909                 if (ckWARN(WARN_REGEXP)) {
10910                     if (! (wastedflags & WASTED_C) ) {
10911                         wastedflags |= WASTED_GC;
10912                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10913                         vWARN3(
10914                             RExC_parse + 1,
10915                             "Useless (%sc) - %suse /gc modifier",
10916                             flagsp == &negflags ? "?-" : "?",
10917                             flagsp == &negflags ? "don't " : ""
10918                         );
10919                     }
10920                 }
10921                 break;
10922             case KEEPCOPY_PAT_MOD: /* 'p' */
10923                 if (flagsp == &negflags) {
10924                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10925                 } else {
10926                     *flagsp |= RXf_PMf_KEEPCOPY;
10927                 }
10928                 break;
10929             case '-':
10930                 /* A flag is a default iff it is following a minus, so
10931                  * if there is a minus, it means will be trying to
10932                  * re-specify a default which is an error */
10933                 if (has_use_defaults || flagsp == &negflags) {
10934                     goto fail_modifiers;
10935                 }
10936                 flagsp = &negflags;
10937                 wastedflags = 0;  /* reset so (?g-c) warns twice */
10938                 x_mod_count = 0;
10939                 break;
10940             case ':':
10941             case ')':
10942
10943                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10944                     negflags |= RXf_PMf_EXTENDED_MORE;
10945                 }
10946                 RExC_flags |= posflags;
10947
10948                 if (negflags & RXf_PMf_EXTENDED) {
10949                     negflags |= RXf_PMf_EXTENDED_MORE;
10950                 }
10951                 RExC_flags &= ~negflags;
10952                 set_regex_charset(&RExC_flags, cs);
10953
10954                 return;
10955             default:
10956               fail_modifiers:
10957                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10958                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10959                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10960                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10961                 NOT_REACHED; /*NOTREACHED*/
10962         }
10963
10964         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10965     }
10966
10967     vFAIL("Sequence (?... not terminated");
10968 }
10969
10970 /*
10971  - reg - regular expression, i.e. main body or parenthesized thing
10972  *
10973  * Caller must absorb opening parenthesis.
10974  *
10975  * Combining parenthesis handling with the base level of regular expression
10976  * is a trifle forced, but the need to tie the tails of the branches to what
10977  * follows makes it hard to avoid.
10978  */
10979 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10980 #ifdef DEBUGGING
10981 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10982 #else
10983 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10984 #endif
10985
10986 PERL_STATIC_INLINE regnode_offset
10987 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10988                              I32 *flagp,
10989                              char * parse_start,
10990                              char ch
10991                       )
10992 {
10993     regnode_offset ret;
10994     char* name_start = RExC_parse;
10995     U32 num = 0;
10996     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
10997     GET_RE_DEBUG_FLAGS_DECL;
10998
10999     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11000
11001     if (RExC_parse == name_start || *RExC_parse != ch) {
11002         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11003         vFAIL2("Sequence %.3s... not terminated", parse_start);
11004     }
11005
11006     if (sv_dat) {
11007         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11008         RExC_rxi->data->data[num]=(void*)sv_dat;
11009         SvREFCNT_inc_simple_void_NN(sv_dat);
11010     }
11011     RExC_sawback = 1;
11012     ret = reganode(pRExC_state,
11013                    ((! FOLD)
11014                      ? NREF
11015                      : (ASCII_FOLD_RESTRICTED)
11016                        ? NREFFA
11017                        : (AT_LEAST_UNI_SEMANTICS)
11018                          ? NREFFU
11019                          : (LOC)
11020                            ? NREFFL
11021                            : NREFF),
11022                     num);
11023     *flagp |= HASWIDTH;
11024
11025     Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11026     Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11027
11028     nextchar(pRExC_state);
11029     return ret;
11030 }
11031
11032 /* On success, returns the offset at which any next node should be placed into
11033  * the regex engine program being compiled.
11034  *
11035  * Returns 0 otherwise, with *flagp set to indicate why:
11036  *  TRYAGAIN        at the end of (?) that only sets flags.
11037  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
11038  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11039  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
11040  *  happen.  */
11041 STATIC regnode_offset
11042 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11043     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11044      * 2 is like 1, but indicates that nextchar() has been called to advance
11045      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
11046      * this flag alerts us to the need to check for that */
11047 {
11048     regnode_offset ret = 0;    /* Will be the head of the group. */
11049     regnode_offset br;
11050     regnode_offset lastbr;
11051     regnode_offset ender = 0;
11052     I32 parno = 0;
11053     I32 flags;
11054     U32 oregflags = RExC_flags;
11055     bool have_branch = 0;
11056     bool is_open = 0;
11057     I32 freeze_paren = 0;
11058     I32 after_freeze = 0;
11059     I32 num; /* numeric backreferences */
11060
11061     char * parse_start = RExC_parse; /* MJD */
11062     char * const oregcomp_parse = RExC_parse;
11063
11064     GET_RE_DEBUG_FLAGS_DECL;
11065
11066     PERL_ARGS_ASSERT_REG;
11067     DEBUG_PARSE("reg ");
11068
11069     *flagp = 0;                         /* Tentatively. */
11070
11071     /* Having this true makes it feasible to have a lot fewer tests for the
11072      * parse pointer being in scope.  For example, we can write
11073      *      while(isFOO(*RExC_parse)) RExC_parse++;
11074      * instead of
11075      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11076      */
11077     assert(*RExC_end == '\0');
11078
11079     /* Make an OPEN node, if parenthesized. */
11080     if (paren) {
11081
11082         /* Under /x, space and comments can be gobbled up between the '(' and
11083          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
11084          * intervening space, as the sequence is a token, and a token should be
11085          * indivisible */
11086         bool has_intervening_patws = (paren == 2)
11087                                   && *(RExC_parse - 1) != '(';
11088
11089         if (RExC_parse >= RExC_end) {
11090             vFAIL("Unmatched (");
11091         }
11092
11093         if (paren == 'r') {     /* Atomic script run */
11094             paren = '>';
11095             goto parse_rest;
11096         }
11097         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11098             char *start_verb = RExC_parse + 1;
11099             STRLEN verb_len;
11100             char *start_arg = NULL;
11101             unsigned char op = 0;
11102             int arg_required = 0;
11103             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11104             bool has_upper = FALSE;
11105
11106             if (has_intervening_patws) {
11107                 RExC_parse++;   /* past the '*' */
11108
11109                 /* For strict backwards compatibility, don't change the message
11110                  * now that we also have lowercase operands */
11111                 if (isUPPER(*RExC_parse)) {
11112                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11113                 }
11114                 else {
11115                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11116                 }
11117             }
11118             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11119                 if ( *RExC_parse == ':' ) {
11120                     start_arg = RExC_parse + 1;
11121                     break;
11122                 }
11123                 else if (! UTF) {
11124                     if (isUPPER(*RExC_parse)) {
11125                         has_upper = TRUE;
11126                     }
11127                     RExC_parse++;
11128                 }
11129                 else {
11130                     RExC_parse += UTF8SKIP(RExC_parse);
11131                 }
11132             }
11133             verb_len = RExC_parse - start_verb;
11134             if ( start_arg ) {
11135                 if (RExC_parse >= RExC_end) {
11136                     goto unterminated_verb_pattern;
11137                 }
11138
11139                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11140                 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11141                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11142                 }
11143                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11144                   unterminated_verb_pattern:
11145                     if (has_upper) {
11146                         vFAIL("Unterminated verb pattern argument");
11147                     }
11148                     else {
11149                         vFAIL("Unterminated '(*...' argument");
11150                     }
11151                 }
11152             } else {
11153                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11154                     if (has_upper) {
11155                         vFAIL("Unterminated verb pattern");
11156                     }
11157                     else {
11158                         vFAIL("Unterminated '(*...' construct");
11159                     }
11160                 }
11161             }
11162
11163             /* Here, we know that RExC_parse < RExC_end */
11164
11165             switch ( *start_verb ) {
11166             case 'A':  /* (*ACCEPT) */
11167                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11168                     op = ACCEPT;
11169                     internal_argval = RExC_nestroot;
11170                 }
11171                 break;
11172             case 'C':  /* (*COMMIT) */
11173                 if ( memEQs(start_verb, verb_len,"COMMIT") )
11174                     op = COMMIT;
11175                 break;
11176             case 'F':  /* (*FAIL) */
11177                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11178                     op = OPFAIL;
11179                 }
11180                 break;
11181             case ':':  /* (*:NAME) */
11182             case 'M':  /* (*MARK:NAME) */
11183                 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11184                     op = MARKPOINT;
11185                     arg_required = 1;
11186                 }
11187                 break;
11188             case 'P':  /* (*PRUNE) */
11189                 if ( memEQs(start_verb, verb_len,"PRUNE") )
11190                     op = PRUNE;
11191                 break;
11192             case 'S':   /* (*SKIP) */
11193                 if ( memEQs(start_verb, verb_len,"SKIP") )
11194                     op = SKIP;
11195                 break;
11196             case 'T':  /* (*THEN) */
11197                 /* [19:06] <TimToady> :: is then */
11198                 if ( memEQs(start_verb, verb_len,"THEN") ) {
11199                     op = CUTGROUP;
11200                     RExC_seen |= REG_CUTGROUP_SEEN;
11201                 }
11202                 break;
11203             case 'a':
11204                 if (   memEQs(start_verb, verb_len, "asr")
11205                     || memEQs(start_verb, verb_len, "atomic_script_run"))
11206                 {
11207                     paren = 'r';        /* Mnemonic: recursed run */
11208                     goto script_run;
11209                 }
11210                 else if (memEQs(start_verb, verb_len, "atomic")) {
11211                     paren = 't';    /* AtOMIC */
11212                     goto alpha_assertions;
11213                 }
11214                 break;
11215             case 'p':
11216                 if (   memEQs(start_verb, verb_len, "plb")
11217                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
11218                 {
11219                     paren = 'b';
11220                     goto lookbehind_alpha_assertions;
11221                 }
11222                 else if (   memEQs(start_verb, verb_len, "pla")
11223                          || memEQs(start_verb, verb_len, "positive_lookahead"))
11224                 {
11225                     paren = 'a';
11226                     goto alpha_assertions;
11227                 }
11228                 break;
11229             case 'n':
11230                 if (   memEQs(start_verb, verb_len, "nlb")
11231                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
11232                 {
11233                     paren = 'B';
11234                     goto lookbehind_alpha_assertions;
11235                 }
11236                 else if (   memEQs(start_verb, verb_len, "nla")
11237                          || memEQs(start_verb, verb_len, "negative_lookahead"))
11238                 {
11239                     paren = 'A';
11240                     goto alpha_assertions;
11241                 }
11242                 break;
11243             case 's':
11244                 if (   memEQs(start_verb, verb_len, "sr")
11245                     || memEQs(start_verb, verb_len, "script_run"))
11246                 {
11247                     regnode_offset atomic;
11248
11249                     paren = 's';
11250
11251                    script_run:
11252
11253                     /* This indicates Unicode rules. */
11254                     REQUIRE_UNI_RULES(flagp, 0);
11255
11256                     if (! start_arg) {
11257                         goto no_colon;
11258                     }
11259
11260                     RExC_parse = start_arg;
11261
11262                     if (RExC_in_script_run) {
11263
11264                         /*  Nested script runs are treated as no-ops, because
11265                          *  if the nested one fails, the outer one must as
11266                          *  well.  It could fail sooner, and avoid (??{} with
11267                          *  side effects, but that is explicitly documented as
11268                          *  undefined behavior. */
11269
11270                         ret = 0;
11271
11272                         if (paren == 's') {
11273                             paren = ':';
11274                             goto parse_rest;
11275                         }
11276
11277                         /* But, the atomic part of a nested atomic script run
11278                          * isn't a no-op, but can be treated just like a '(?>'
11279                          * */
11280                         paren = '>';
11281                         goto parse_rest;
11282                     }
11283
11284                     /* By doing this here, we avoid extra warnings for nested
11285                      * script runs */
11286                     ckWARNexperimental(RExC_parse,
11287                         WARN_EXPERIMENTAL__SCRIPT_RUN,
11288                         "The script_run feature is experimental");
11289
11290                     if (paren == 's') {
11291                         /* Here, we're starting a new regular script run */
11292                         ret = reg_node(pRExC_state, SROPEN);
11293                         RExC_in_script_run = 1;
11294                         is_open = 1;
11295                         goto parse_rest;
11296                     }
11297
11298                     /* Here, we are starting an atomic script run.  This is
11299                      * handled by recursing to deal with the atomic portion
11300                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
11301
11302                     ret = reg_node(pRExC_state, SROPEN);
11303
11304                     RExC_in_script_run = 1;
11305
11306                     atomic = reg(pRExC_state, 'r', &flags, depth);
11307                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
11308                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11309                         return 0;
11310                     }
11311
11312                     REGTAIL(pRExC_state, ret, atomic);
11313
11314                     REGTAIL(pRExC_state, atomic,
11315                            reg_node(pRExC_state, SRCLOSE));
11316
11317                     RExC_in_script_run = 0;
11318                     return ret;
11319                 }
11320
11321                 break;
11322
11323             lookbehind_alpha_assertions:
11324                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11325                 RExC_in_lookbehind++;
11326                 /*FALLTHROUGH*/
11327
11328             alpha_assertions:
11329                 ckWARNexperimental(RExC_parse,
11330                         WARN_EXPERIMENTAL__ALPHA_ASSERTIONS,
11331                         "The alpha_assertions feature is experimental");
11332
11333                 RExC_seen_zerolen++;
11334
11335                 if (! start_arg) {
11336                     goto no_colon;
11337                 }
11338
11339                 /* An empty negative lookahead assertion simply is failure */
11340                 if (paren == 'A' && RExC_parse == start_arg) {
11341                     ret=reganode(pRExC_state, OPFAIL, 0);
11342                     nextchar(pRExC_state);
11343                     return ret;
11344                 }
11345
11346                 RExC_parse = start_arg;
11347                 goto parse_rest;
11348
11349               no_colon:
11350                 vFAIL2utf8f(
11351                 "'(*%" UTF8f "' requires a terminating ':'",
11352                 UTF8fARG(UTF, verb_len, start_verb));
11353                 NOT_REACHED; /*NOTREACHED*/
11354
11355             } /* End of switch */
11356             if ( ! op ) {
11357                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11358                 if (has_upper || verb_len == 0) {
11359                     vFAIL2utf8f(
11360                     "Unknown verb pattern '%" UTF8f "'",
11361                     UTF8fARG(UTF, verb_len, start_verb));
11362                 }
11363                 else {
11364                     vFAIL2utf8f(
11365                     "Unknown '(*...)' construct '%" UTF8f "'",
11366                     UTF8fARG(UTF, verb_len, start_verb));
11367                 }
11368             }
11369             if ( RExC_parse == start_arg ) {
11370                 start_arg = NULL;
11371             }
11372             if ( arg_required && !start_arg ) {
11373                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11374                     verb_len, start_verb);
11375             }
11376             if (internal_argval == -1) {
11377                 ret = reganode(pRExC_state, op, 0);
11378             } else {
11379                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11380             }
11381             RExC_seen |= REG_VERBARG_SEEN;
11382             if (start_arg) {
11383                 SV *sv = newSVpvn( start_arg,
11384                                     RExC_parse - start_arg);
11385                 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11386                                         STR_WITH_LEN("S"));
11387                 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11388                 FLAGS(REGNODE_p(ret)) = 1;
11389             } else {
11390                 FLAGS(REGNODE_p(ret)) = 0;
11391             }
11392             if ( internal_argval != -1 )
11393                 ARG2L_SET(REGNODE_p(ret), internal_argval);
11394             nextchar(pRExC_state);
11395             return ret;
11396         }
11397         else if (*RExC_parse == '?') { /* (?...) */
11398             bool is_logical = 0;
11399             const char * const seqstart = RExC_parse;
11400             const char * endptr;
11401             if (has_intervening_patws) {
11402                 RExC_parse++;
11403                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11404             }
11405
11406             RExC_parse++;           /* past the '?' */
11407             paren = *RExC_parse;    /* might be a trailing NUL, if not
11408                                        well-formed */
11409             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11410             if (RExC_parse > RExC_end) {
11411                 paren = '\0';
11412             }
11413             ret = 0;                    /* For look-ahead/behind. */
11414             switch (paren) {
11415
11416             case 'P':   /* (?P...) variants for those used to PCRE/Python */
11417                 paren = *RExC_parse;
11418                 if ( paren == '<') {    /* (?P<...>) named capture */
11419                     RExC_parse++;
11420                     if (RExC_parse >= RExC_end) {
11421                         vFAIL("Sequence (?P<... not terminated");
11422                     }
11423                     goto named_capture;
11424                 }
11425                 else if (paren == '>') {   /* (?P>name) named recursion */
11426                     RExC_parse++;
11427                     if (RExC_parse >= RExC_end) {
11428                         vFAIL("Sequence (?P>... not terminated");
11429                     }
11430                     goto named_recursion;
11431                 }
11432                 else if (paren == '=') {   /* (?P=...)  named backref */
11433                     RExC_parse++;
11434                     return handle_named_backref(pRExC_state, flagp,
11435                                                 parse_start, ')');
11436                 }
11437                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
11438                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11439                 vFAIL3("Sequence (%.*s...) not recognized",
11440                                 RExC_parse-seqstart, seqstart);
11441                 NOT_REACHED; /*NOTREACHED*/
11442             case '<':           /* (?<...) */
11443                 if (*RExC_parse == '!')
11444                     paren = ',';
11445                 else if (*RExC_parse != '=')
11446               named_capture:
11447                 {               /* (?<...>) */
11448                     char *name_start;
11449                     SV *svname;
11450                     paren= '>';
11451                 /* FALLTHROUGH */
11452             case '\'':          /* (?'...') */
11453                     name_start = RExC_parse;
11454                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11455                     if (   RExC_parse == name_start
11456                         || RExC_parse >= RExC_end
11457                         || *RExC_parse != paren)
11458                     {
11459                         vFAIL2("Sequence (?%c... not terminated",
11460                             paren=='>' ? '<' : paren);
11461                     }
11462                     {
11463                         HE *he_str;
11464                         SV *sv_dat = NULL;
11465                         if (!svname) /* shouldn't happen */
11466                             Perl_croak(aTHX_
11467                                 "panic: reg_scan_name returned NULL");
11468                         if (!RExC_paren_names) {
11469                             RExC_paren_names= newHV();
11470                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
11471 #ifdef DEBUGGING
11472                             RExC_paren_name_list= newAV();
11473                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11474 #endif
11475                         }
11476                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11477                         if ( he_str )
11478                             sv_dat = HeVAL(he_str);
11479                         if ( ! sv_dat ) {
11480                             /* croak baby croak */
11481                             Perl_croak(aTHX_
11482                                 "panic: paren_name hash element allocation failed");
11483                         } else if ( SvPOK(sv_dat) ) {
11484                             /* (?|...) can mean we have dupes so scan to check
11485                                its already been stored. Maybe a flag indicating
11486                                we are inside such a construct would be useful,
11487                                but the arrays are likely to be quite small, so
11488                                for now we punt -- dmq */
11489                             IV count = SvIV(sv_dat);
11490                             I32 *pv = (I32*)SvPVX(sv_dat);
11491                             IV i;
11492                             for ( i = 0 ; i < count ; i++ ) {
11493                                 if ( pv[i] == RExC_npar ) {
11494                                     count = 0;
11495                                     break;
11496                                 }
11497                             }
11498                             if ( count ) {
11499                                 pv = (I32*)SvGROW(sv_dat,
11500                                                 SvCUR(sv_dat) + sizeof(I32)+1);
11501                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11502                                 pv[count] = RExC_npar;
11503                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11504                             }
11505                         } else {
11506                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
11507                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
11508                                                                 sizeof(I32));
11509                             SvIOK_on(sv_dat);
11510                             SvIV_set(sv_dat, 1);
11511                         }
11512 #ifdef DEBUGGING
11513                         /* Yes this does cause a memory leak in debugging Perls
11514                          * */
11515                         if (!av_store(RExC_paren_name_list,
11516                                       RExC_npar, SvREFCNT_inc_NN(svname)))
11517                             SvREFCNT_dec_NN(svname);
11518 #endif
11519
11520                         /*sv_dump(sv_dat);*/
11521                     }
11522                     nextchar(pRExC_state);
11523                     paren = 1;
11524                     goto capturing_parens;
11525                 }
11526
11527                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11528                 RExC_in_lookbehind++;
11529                 RExC_parse++;
11530                 if (RExC_parse >= RExC_end) {
11531                     vFAIL("Sequence (?... not terminated");
11532                 }
11533
11534                 /* FALLTHROUGH */
11535             case '=':           /* (?=...) */
11536                 RExC_seen_zerolen++;
11537                 break;
11538             case '!':           /* (?!...) */
11539                 RExC_seen_zerolen++;
11540                 /* check if we're really just a "FAIL" assertion */
11541                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11542                                         FALSE /* Don't force to /x */ );
11543                 if (*RExC_parse == ')') {
11544                     ret=reganode(pRExC_state, OPFAIL, 0);
11545                     nextchar(pRExC_state);
11546                     return ret;
11547                 }
11548                 break;
11549             case '|':           /* (?|...) */
11550                 /* branch reset, behave like a (?:...) except that
11551                    buffers in alternations share the same numbers */
11552                 paren = ':';
11553                 after_freeze = freeze_paren = RExC_npar;
11554
11555                 /* XXX This construct currently requires an extra pass.
11556                  * Investigation would be required to see if that could be
11557                  * changed */
11558                 REQUIRE_PARENS_PASS;
11559                 break;
11560             case ':':           /* (?:...) */
11561             case '>':           /* (?>...) */
11562                 break;
11563             case '$':           /* (?$...) */
11564             case '@':           /* (?@...) */
11565                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11566                 break;
11567             case '0' :           /* (?0) */
11568             case 'R' :           /* (?R) */
11569                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11570                     FAIL("Sequence (?R) not terminated");
11571                 num = 0;
11572                 RExC_seen |= REG_RECURSE_SEEN;
11573
11574                 /* XXX These constructs currently require an extra pass.
11575                  * It probably could be changed */
11576                 REQUIRE_PARENS_PASS;
11577
11578                 *flagp |= POSTPONED;
11579                 goto gen_recurse_regop;
11580                 /*notreached*/
11581             /* named and numeric backreferences */
11582             case '&':            /* (?&NAME) */
11583                 parse_start = RExC_parse - 1;
11584               named_recursion:
11585                 {
11586                     SV *sv_dat = reg_scan_name(pRExC_state,
11587                                                REG_RSN_RETURN_DATA);
11588                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11589                 }
11590                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11591                     vFAIL("Sequence (?&... not terminated");
11592                 goto gen_recurse_regop;
11593                 /* NOTREACHED */
11594             case '+':
11595                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11596                     RExC_parse++;
11597                     vFAIL("Illegal pattern");
11598                 }
11599                 goto parse_recursion;
11600                 /* NOTREACHED*/
11601             case '-': /* (?-1) */
11602                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11603                     RExC_parse--; /* rewind to let it be handled later */
11604                     goto parse_flags;
11605                 }
11606                 /* FALLTHROUGH */
11607             case '1': case '2': case '3': case '4': /* (?1) */
11608             case '5': case '6': case '7': case '8': case '9':
11609                 RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11610               parse_recursion:
11611                 {
11612                     bool is_neg = FALSE;
11613                     UV unum;
11614                     parse_start = RExC_parse - 1; /* MJD */
11615                     if (*RExC_parse == '-') {
11616                         RExC_parse++;
11617                         is_neg = TRUE;
11618                     }
11619                     endptr = RExC_end;
11620                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11621                         && unum <= I32_MAX
11622                     ) {
11623                         num = (I32)unum;
11624                         RExC_parse = (char*)endptr;
11625                     } else
11626                         num = I32_MAX;
11627                     if (is_neg) {
11628                         /* Some limit for num? */
11629                         num = -num;
11630                     }
11631                 }
11632                 if (*RExC_parse!=')')
11633                     vFAIL("Expecting close bracket");
11634
11635               gen_recurse_regop:
11636                 if ( paren == '-' ) {
11637                     /*
11638                     Diagram of capture buffer numbering.
11639                     Top line is the normal capture buffer numbers
11640                     Bottom line is the negative indexing as from
11641                     the X (the (?-2))
11642
11643                     +   1 2    3 4 5 X          6 7
11644                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11645                     -   5 4    3 2 1 X          x x
11646
11647                     */
11648                     num = RExC_npar + num;
11649                     if (num < 1)  {
11650
11651                         /* It might be a forward reference; we can't fail until
11652                          * we know, by completing the parse to get all the
11653                          * groups, and then reparsing */
11654                         if (RExC_total_parens > 0)  {
11655                             RExC_parse++;
11656                             vFAIL("Reference to nonexistent group");
11657                         }
11658                         else {
11659                             REQUIRE_PARENS_PASS;
11660                         }
11661                     }
11662                 } else if ( paren == '+' ) {
11663                     num = RExC_npar + num - 1;
11664                 }
11665                 /* We keep track how many GOSUB items we have produced.
11666                    To start off the ARG2L() of the GOSUB holds its "id",
11667                    which is used later in conjunction with RExC_recurse
11668                    to calculate the offset we need to jump for the GOSUB,
11669                    which it will store in the final representation.
11670                    We have to defer the actual calculation until much later
11671                    as the regop may move.
11672                  */
11673
11674                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11675                 if (num >= RExC_npar) {
11676
11677                     /* It might be a forward reference; we can't fail until we
11678                      * know, by completing the parse to get all the groups, and
11679                      * then reparsing */
11680                     if (RExC_total_parens > 0)  {
11681                         if (num >= RExC_total_parens) {
11682                             RExC_parse++;
11683                             vFAIL("Reference to nonexistent group");
11684                         }
11685                     }
11686                     else {
11687                         REQUIRE_PARENS_PASS;
11688                     }
11689                 }
11690                 RExC_recurse_count++;
11691                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11692                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11693                             22, "|    |", (int)(depth * 2 + 1), "",
11694                             (UV)ARG(REGNODE_p(ret)),
11695                             (IV)ARG2L(REGNODE_p(ret))));
11696                 RExC_seen |= REG_RECURSE_SEEN;
11697
11698                 Set_Node_Length(REGNODE_p(ret),
11699                                 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11700                 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11701
11702                 *flagp |= POSTPONED;
11703                 assert(*RExC_parse == ')');
11704                 nextchar(pRExC_state);
11705                 return ret;
11706
11707             /* NOTREACHED */
11708
11709             case '?':           /* (??...) */
11710                 is_logical = 1;
11711                 if (*RExC_parse != '{') {
11712                     RExC_parse += SKIP_IF_CHAR(RExC_parse);
11713                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11714                     vFAIL2utf8f(
11715                         "Sequence (%" UTF8f "...) not recognized",
11716                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11717                     NOT_REACHED; /*NOTREACHED*/
11718                 }
11719                 *flagp |= POSTPONED;
11720                 paren = '{';
11721                 RExC_parse++;
11722                 /* FALLTHROUGH */
11723             case '{':           /* (?{...}) */
11724             {
11725                 U32 n = 0;
11726                 struct reg_code_block *cb;
11727                 OP * o;
11728
11729                 RExC_seen_zerolen++;
11730
11731                 if (   !pRExC_state->code_blocks
11732                     || pRExC_state->code_index
11733                                         >= pRExC_state->code_blocks->count
11734                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11735                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11736                             - RExC_start)
11737                 ) {
11738                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
11739                         FAIL("panic: Sequence (?{...}): no code block found\n");
11740                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
11741                 }
11742                 /* this is a pre-compiled code block (?{...}) */
11743                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11744                 RExC_parse = RExC_start + cb->end;
11745                 o = cb->block;
11746                 if (cb->src_regex) {
11747                     n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11748                     RExC_rxi->data->data[n] =
11749                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
11750                     RExC_rxi->data->data[n+1] = (void*)o;
11751                 }
11752                 else {
11753                     n = add_data(pRExC_state,
11754                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11755                     RExC_rxi->data->data[n] = (void*)o;
11756                 }
11757                 pRExC_state->code_index++;
11758                 nextchar(pRExC_state);
11759
11760                 if (is_logical) {
11761                     regnode_offset eval;
11762                     ret = reg_node(pRExC_state, LOGICAL);
11763
11764                     eval = reg2Lanode(pRExC_state, EVAL,
11765                                        n,
11766
11767                                        /* for later propagation into (??{})
11768                                         * return value */
11769                                        RExC_flags & RXf_PMf_COMPILETIME
11770                                       );
11771                     FLAGS(REGNODE_p(ret)) = 2;
11772                     REGTAIL(pRExC_state, ret, eval);
11773                     /* deal with the length of this later - MJD */
11774                     return ret;
11775                 }
11776                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11777                 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11778                 Set_Node_Offset(REGNODE_p(ret), parse_start);
11779                 return ret;
11780             }
11781             case '(':           /* (?(?{...})...) and (?(?=...)...) */
11782             {
11783                 int is_define= 0;
11784                 const int DEFINE_len = sizeof("DEFINE") - 1;
11785                 if (    RExC_parse < RExC_end - 1
11786                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
11787                             && (   RExC_parse[1] == '='
11788                                 || RExC_parse[1] == '!'
11789                                 || RExC_parse[1] == '<'
11790                                 || RExC_parse[1] == '{'))
11791                         || (       RExC_parse[0] == '*'        /* (?(*...)) */
11792                             && (   memBEGINs(RExC_parse + 1,
11793                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11794                                          "pla:")
11795                                 || memBEGINs(RExC_parse + 1,
11796                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11797                                          "plb:")
11798                                 || memBEGINs(RExC_parse + 1,
11799                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11800                                          "nla:")
11801                                 || memBEGINs(RExC_parse + 1,
11802                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11803                                          "nlb:")
11804                                 || memBEGINs(RExC_parse + 1,
11805                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11806                                          "positive_lookahead:")
11807                                 || memBEGINs(RExC_parse + 1,
11808                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11809                                          "positive_lookbehind:")
11810                                 || memBEGINs(RExC_parse + 1,
11811                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11812                                          "negative_lookahead:")
11813                                 || memBEGINs(RExC_parse + 1,
11814                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11815                                          "negative_lookbehind:"))))
11816                 ) { /* Lookahead or eval. */
11817                     I32 flag;
11818                     regnode_offset tail;
11819
11820                     ret = reg_node(pRExC_state, LOGICAL);
11821                     FLAGS(REGNODE_p(ret)) = 1;
11822
11823                     tail = reg(pRExC_state, 1, &flag, depth+1);
11824                     RETURN_FAIL_ON_RESTART(flag, flagp);
11825                     REGTAIL(pRExC_state, ret, tail);
11826                     goto insert_if;
11827                 }
11828                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11829                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11830                 {
11831                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
11832                     char *name_start= RExC_parse++;
11833                     U32 num = 0;
11834                     SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11835                     if (   RExC_parse == name_start
11836                         || RExC_parse >= RExC_end
11837                         || *RExC_parse != ch)
11838                     {
11839                         vFAIL2("Sequence (?(%c... not terminated",
11840                             (ch == '>' ? '<' : ch));
11841                     }
11842                     RExC_parse++;
11843                     if (sv_dat) {
11844                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11845                         RExC_rxi->data->data[num]=(void*)sv_dat;
11846                         SvREFCNT_inc_simple_void_NN(sv_dat);
11847                     }
11848                     ret = reganode(pRExC_state, NGROUPP, num);
11849                     goto insert_if_check_paren;
11850                 }
11851                 else if (memBEGINs(RExC_parse,
11852                                    (STRLEN) (RExC_end - RExC_parse),
11853                                    "DEFINE"))
11854                 {
11855                     ret = reganode(pRExC_state, DEFINEP, 0);
11856                     RExC_parse += DEFINE_len;
11857                     is_define = 1;
11858                     goto insert_if_check_paren;
11859                 }
11860                 else if (RExC_parse[0] == 'R') {
11861                     RExC_parse++;
11862                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
11863                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11864                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11865                      */
11866                     parno = 0;
11867                     if (RExC_parse[0] == '0') {
11868                         parno = 1;
11869                         RExC_parse++;
11870                     }
11871                     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11872                         UV uv;
11873                         endptr = RExC_end;
11874                         if (grok_atoUV(RExC_parse, &uv, &endptr)
11875                             && uv <= I32_MAX
11876                         ) {
11877                             parno = (I32)uv + 1;
11878                             RExC_parse = (char*)endptr;
11879                         }
11880                         /* else "Switch condition not recognized" below */
11881                     } else if (RExC_parse[0] == '&') {
11882                         SV *sv_dat;
11883                         RExC_parse++;
11884                         sv_dat = reg_scan_name(pRExC_state,
11885                                                REG_RSN_RETURN_DATA);
11886                         if (sv_dat)
11887                             parno = 1 + *((I32 *)SvPVX(sv_dat));
11888                     }
11889                     ret = reganode(pRExC_state, INSUBP, parno);
11890                     goto insert_if_check_paren;
11891                 }
11892                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11893                     /* (?(1)...) */
11894                     char c;
11895                     UV uv;
11896                     endptr = RExC_end;
11897                     if (grok_atoUV(RExC_parse, &uv, &endptr)
11898                         && uv <= I32_MAX
11899                     ) {
11900                         parno = (I32)uv;
11901                         RExC_parse = (char*)endptr;
11902                     }
11903                     else {
11904                         vFAIL("panic: grok_atoUV returned FALSE");
11905                     }
11906                     ret = reganode(pRExC_state, GROUPP, parno);
11907
11908                  insert_if_check_paren:
11909                     if (UCHARAT(RExC_parse) != ')') {
11910                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11911                         vFAIL("Switch condition not recognized");
11912                     }
11913                     nextchar(pRExC_state);
11914                   insert_if:
11915                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
11916                     br = regbranch(pRExC_state, &flags, 1, depth+1);
11917                     if (br == 0) {
11918                         RETURN_FAIL_ON_RESTART(flags,flagp);
11919                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11920                               (UV) flags);
11921                     } else
11922                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
11923                                                           LONGJMP, 0));
11924                     c = UCHARAT(RExC_parse);
11925                     nextchar(pRExC_state);
11926                     if (flags&HASWIDTH)
11927                         *flagp |= HASWIDTH;
11928                     if (c == '|') {
11929                         if (is_define)
11930                             vFAIL("(?(DEFINE)....) does not allow branches");
11931
11932                         /* Fake one for optimizer.  */
11933                         lastbr = reganode(pRExC_state, IFTHEN, 0);
11934
11935                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
11936                             RETURN_FAIL_ON_RESTART(flags, flagp);
11937                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11938                                   (UV) flags);
11939                         }
11940                         REGTAIL(pRExC_state, ret, lastbr);
11941                         if (flags&HASWIDTH)
11942                             *flagp |= HASWIDTH;
11943                         c = UCHARAT(RExC_parse);
11944                         nextchar(pRExC_state);
11945                     }
11946                     else
11947                         lastbr = 0;
11948                     if (c != ')') {
11949                         if (RExC_parse >= RExC_end)
11950                             vFAIL("Switch (?(condition)... not terminated");
11951                         else
11952                             vFAIL("Switch (?(condition)... contains too many branches");
11953                     }
11954                     ender = reg_node(pRExC_state, TAIL);
11955                     REGTAIL(pRExC_state, br, ender);
11956                     if (lastbr) {
11957                         REGTAIL(pRExC_state, lastbr, ender);
11958                         REGTAIL(pRExC_state, REGNODE_OFFSET(
11959                                                 NEXTOPER(
11960                                                 NEXTOPER(REGNODE_p(lastbr)))),
11961                                              ender);
11962                     }
11963                     else
11964                         REGTAIL(pRExC_state, ret, ender);
11965 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
11966                     RExC_size++; /* XXX WHY do we need this?!!
11967                                     For large programs it seems to be required
11968                                     but I can't figure out why. -- dmq*/
11969 #endif
11970                     return ret;
11971                 }
11972                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11973                 vFAIL("Unknown switch condition (?(...))");
11974             }
11975             case '[':           /* (?[ ... ]) */
11976                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
11977                                          oregcomp_parse);
11978             case 0: /* A NUL */
11979                 RExC_parse--; /* for vFAIL to print correctly */
11980                 vFAIL("Sequence (? incomplete");
11981                 break;
11982             default: /* e.g., (?i) */
11983                 RExC_parse = (char *) seqstart + 1;
11984               parse_flags:
11985                 parse_lparen_question_flags(pRExC_state);
11986                 if (UCHARAT(RExC_parse) != ':') {
11987                     if (RExC_parse < RExC_end)
11988                         nextchar(pRExC_state);
11989                     *flagp = TRYAGAIN;
11990                     return 0;
11991                 }
11992                 paren = ':';
11993                 nextchar(pRExC_state);
11994                 ret = 0;
11995                 goto parse_rest;
11996             } /* end switch */
11997         }
11998         else {
11999             if (*RExC_parse == '{') {
12000                 ckWARNregdep(RExC_parse + 1,
12001                             "Unescaped left brace in regex is "
12002                             "deprecated here (and will be fatal "
12003                             "in Perl 5.32), passed through");
12004             }
12005             /* Not bothering to indent here, as the above 'else' is temporary
12006              * */
12007         if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
12008           capturing_parens:
12009             parno = RExC_npar;
12010             RExC_npar++;
12011             if (RExC_total_parens <= 0) {
12012                 /* If we are in our first pass through (and maybe only pass),
12013                  * we  need to allocate memory for the capturing parentheses
12014                  * data structures.  Since we start at npar=1, when it reaches
12015                  * 2, for the first time it has something to put in it.  Above
12016                  * 2 means we extend what we already have */
12017                 if (RExC_npar == 2) {
12018                     /* setup RExC_open_parens, which holds the address of each
12019                      * OPEN tag, and to make things simpler for the 0 index the
12020                      * start of the program - this is used later for offsets */
12021                     Newxz(RExC_open_parens, RExC_npar, regnode_offset);
12022                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
12023
12024                     /* setup RExC_close_parens, which holds the address of each
12025                      * CLOSE tag, and to make things simpler for the 0 index
12026                      * the end of the program - this is used later for offsets
12027                      * */
12028                     Newxz(RExC_close_parens, RExC_npar, regnode_offset);
12029                     /* we dont know where end op starts yet, so we dont need to
12030                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
12031                      * above */
12032                 }
12033                 else {
12034                     Renew(RExC_open_parens, RExC_npar, regnode_offset);
12035                     Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset);
12036
12037                     Renew(RExC_close_parens, RExC_npar, regnode_offset);
12038                     Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset);
12039                 }
12040             }
12041
12042             ret = reganode(pRExC_state, OPEN, parno);
12043             if (!RExC_nestroot)
12044                 RExC_nestroot = parno;
12045             if (RExC_open_parens && !RExC_open_parens[parno])
12046             {
12047                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12048                     "%*s%*s Setting open paren #%" IVdf " to %d\n",
12049                     22, "|    |", (int)(depth * 2 + 1), "",
12050                     (IV)parno, ret));
12051                 RExC_open_parens[parno]= ret;
12052             }
12053
12054             Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12055             Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12056             is_open = 1;
12057         } else {
12058             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12059             paren = ':';
12060             ret = 0;
12061         }
12062         }
12063     }
12064     else                        /* ! paren */
12065         ret = 0;
12066
12067    parse_rest:
12068     /* Pick up the branches, linking them together. */
12069     parse_start = RExC_parse;   /* MJD */
12070     br = regbranch(pRExC_state, &flags, 1, depth+1);
12071
12072     /*     branch_len = (paren != 0); */
12073
12074     if (br == 0) {
12075         RETURN_FAIL_ON_RESTART(flags, flagp);
12076         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12077     }
12078     if (*RExC_parse == '|') {
12079         if (RExC_use_BRANCHJ) {
12080             reginsert(pRExC_state, BRANCHJ, br, depth+1);
12081         }
12082         else {                  /* MJD */
12083             reginsert(pRExC_state, BRANCH, br, depth+1);
12084             Set_Node_Length(REGNODE_p(br), paren != 0);
12085             Set_Node_Offset_To_R(br, parse_start-RExC_start);
12086         }
12087         have_branch = 1;
12088     }
12089     else if (paren == ':') {
12090         *flagp |= flags&SIMPLE;
12091     }
12092     if (is_open) {                              /* Starts with OPEN. */
12093         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
12094     }
12095     else if (paren != '?')              /* Not Conditional */
12096         ret = br;
12097     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12098     lastbr = br;
12099     while (*RExC_parse == '|') {
12100         if (RExC_use_BRANCHJ) {
12101             ender = reganode(pRExC_state, LONGJMP, 0);
12102
12103             /* Append to the previous. */
12104             REGTAIL(pRExC_state,
12105                     REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12106                     ender);
12107         }
12108         nextchar(pRExC_state);
12109         if (freeze_paren) {
12110             if (RExC_npar > after_freeze)
12111                 after_freeze = RExC_npar;
12112             RExC_npar = freeze_paren;
12113         }
12114         br = regbranch(pRExC_state, &flags, 0, depth+1);
12115
12116         if (br == 0) {
12117             RETURN_FAIL_ON_RESTART(flags, flagp);
12118             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12119         }
12120         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
12121         lastbr = br;
12122         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12123     }
12124
12125     if (have_branch || paren != ':') {
12126         regnode * br;
12127
12128         /* Make a closing node, and hook it on the end. */
12129         switch (paren) {
12130         case ':':
12131             ender = reg_node(pRExC_state, TAIL);
12132             break;
12133         case 1: case 2:
12134             ender = reganode(pRExC_state, CLOSE, parno);
12135             if ( RExC_close_parens ) {
12136                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12137                         "%*s%*s Setting close paren #%" IVdf " to %d\n",
12138                         22, "|    |", (int)(depth * 2 + 1), "",
12139                         (IV)parno, ender));
12140                 RExC_close_parens[parno]= ender;
12141                 if (RExC_nestroot == parno)
12142                     RExC_nestroot = 0;
12143             }
12144             Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12145             Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12146             break;
12147         case 's':
12148             ender = reg_node(pRExC_state, SRCLOSE);
12149             RExC_in_script_run = 0;
12150             break;
12151         case '<':
12152         case 'a':
12153         case 'A':
12154         case 'b':
12155         case 'B':
12156         case ',':
12157         case '=':
12158         case '!':
12159             *flagp &= ~HASWIDTH;
12160             /* FALLTHROUGH */
12161         case 't':   /* aTomic */
12162         case '>':
12163             ender = reg_node(pRExC_state, SUCCEED);
12164             break;
12165         case 0:
12166             ender = reg_node(pRExC_state, END);
12167             assert(!RExC_end_op); /* there can only be one! */
12168             RExC_end_op = REGNODE_p(ender);
12169             if (RExC_close_parens) {
12170                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12171                     "%*s%*s Setting close paren #0 (END) to %d\n",
12172                     22, "|    |", (int)(depth * 2 + 1), "",
12173                     ender));
12174
12175                 RExC_close_parens[0]= ender;
12176             }
12177             break;
12178         }
12179         DEBUG_PARSE_r(
12180             DEBUG_PARSE_MSG("lsbr");
12181             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12182             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12183             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12184                           SvPV_nolen_const(RExC_mysv1),
12185                           (IV)lastbr,
12186                           SvPV_nolen_const(RExC_mysv2),
12187                           (IV)ender,
12188                           (IV)(ender - lastbr)
12189             );
12190         );
12191         REGTAIL(pRExC_state, lastbr, ender);
12192
12193         if (have_branch) {
12194             char is_nothing= 1;
12195             if (depth==1)
12196                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12197
12198             /* Hook the tails of the branches to the closing node. */
12199             for (br = REGNODE_p(ret); br; br = regnext(br)) {
12200                 const U8 op = PL_regkind[OP(br)];
12201                 if (op == BRANCH) {
12202                     REGTAIL_STUDY(pRExC_state,
12203                                   REGNODE_OFFSET(NEXTOPER(br)),
12204                                   ender);
12205                     if ( OP(NEXTOPER(br)) != NOTHING
12206                          || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12207                         is_nothing= 0;
12208                 }
12209                 else if (op == BRANCHJ) {
12210                     REGTAIL_STUDY(pRExC_state,
12211                                   REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12212                                   ender);
12213                     /* for now we always disable this optimisation * /
12214                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12215                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12216                     */
12217                         is_nothing= 0;
12218                 }
12219             }
12220             if (is_nothing) {
12221                 regnode * ret_as_regnode = REGNODE_p(ret);
12222                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12223                                ? regnext(ret_as_regnode)
12224                                : ret_as_regnode;
12225                 DEBUG_PARSE_r(
12226                     DEBUG_PARSE_MSG("NADA");
12227                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12228                                      NULL, pRExC_state);
12229                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12230                                      NULL, pRExC_state);
12231                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12232                                   SvPV_nolen_const(RExC_mysv1),
12233                                   (IV)REG_NODE_NUM(ret_as_regnode),
12234                                   SvPV_nolen_const(RExC_mysv2),
12235                                   (IV)ender,
12236                                   (IV)(ender - ret)
12237                     );
12238                 );
12239                 OP(br)= NOTHING;
12240                 if (OP(REGNODE_p(ender)) == TAIL) {
12241                     NEXT_OFF(br)= 0;
12242                     RExC_emit= REGNODE_OFFSET(br) + 1;
12243                 } else {
12244                     regnode *opt;
12245                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12246                         OP(opt)= OPTIMIZED;
12247                     NEXT_OFF(br)= REGNODE_p(ender) - br;
12248                 }
12249             }
12250         }
12251     }
12252
12253     {
12254         const char *p;
12255          /* Even/odd or x=don't care: 010101x10x */
12256         static const char parens[] = "=!aA<,>Bbt";
12257          /* flag below is set to 0 up through 'A'; 1 for larger */
12258
12259         if (paren && (p = strchr(parens, paren))) {
12260             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12261             int flag = (p - parens) > 3;
12262
12263             if (paren == '>' || paren == 't') {
12264                 node = SUSPEND, flag = 0;
12265             }
12266
12267             reginsert(pRExC_state, node, ret, depth+1);
12268             Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12269             Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12270             FLAGS(REGNODE_p(ret)) = flag;
12271             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
12272         }
12273     }
12274
12275     /* Check for proper termination. */
12276     if (paren) {
12277         /* restore original flags, but keep (?p) and, if we've encountered
12278          * something in the parse that changes /d rules into /u, keep the /u */
12279         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12280         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12281             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12282         }
12283         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12284             RExC_parse = oregcomp_parse;
12285             vFAIL("Unmatched (");
12286         }
12287         nextchar(pRExC_state);
12288     }
12289     else if (!paren && RExC_parse < RExC_end) {
12290         if (*RExC_parse == ')') {
12291             RExC_parse++;
12292             vFAIL("Unmatched )");
12293         }
12294         else
12295             FAIL("Junk on end of regexp");      /* "Can't happen". */
12296         NOT_REACHED; /* NOTREACHED */
12297     }
12298
12299     if (RExC_in_lookbehind) {
12300         RExC_in_lookbehind--;
12301     }
12302     if (after_freeze > RExC_npar)
12303         RExC_npar = after_freeze;
12304     return(ret);
12305 }
12306
12307 /*
12308  - regbranch - one alternative of an | operator
12309  *
12310  * Implements the concatenation operator.
12311  *
12312  * On success, returns the offset at which any next node should be placed into
12313  * the regex engine program being compiled.
12314  *
12315  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12316  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12317  * UTF-8
12318  */
12319 STATIC regnode_offset
12320 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12321 {
12322     regnode_offset ret;
12323     regnode_offset chain = 0;
12324     regnode_offset latest;
12325     I32 flags = 0, c = 0;
12326     GET_RE_DEBUG_FLAGS_DECL;
12327
12328     PERL_ARGS_ASSERT_REGBRANCH;
12329
12330     DEBUG_PARSE("brnc");
12331
12332     if (first)
12333         ret = 0;
12334     else {
12335         if (RExC_use_BRANCHJ)
12336             ret = reganode(pRExC_state, BRANCHJ, 0);
12337         else {
12338             ret = reg_node(pRExC_state, BRANCH);
12339             Set_Node_Length(REGNODE_p(ret), 1);
12340         }
12341     }
12342
12343     *flagp = WORST;                     /* Tentatively. */
12344
12345     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12346                             FALSE /* Don't force to /x */ );
12347     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12348         flags &= ~TRYAGAIN;
12349         latest = regpiece(pRExC_state, &flags, depth+1);
12350         if (latest == 0) {
12351             if (flags & TRYAGAIN)
12352                 continue;
12353             RETURN_FAIL_ON_RESTART(flags, flagp);
12354             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12355         }
12356         else if (ret == 0)
12357             ret = latest;
12358         *flagp |= flags&(HASWIDTH|POSTPONED);
12359         if (chain == 0)         /* First piece. */
12360             *flagp |= flags&SPSTART;
12361         else {
12362             /* FIXME adding one for every branch after the first is probably
12363              * excessive now we have TRIE support. (hv) */
12364             MARK_NAUGHTY(1);
12365             if (     chain > (SSize_t) BRANCH_MAX_OFFSET
12366                 && ! RExC_use_BRANCHJ)
12367             {
12368                 /* XXX We could just redo this branch, but figuring out what
12369                  * bookkeeping needs to be reset is a pain */
12370                 REQUIRE_BRANCHJ(flagp, 0);
12371             }
12372             REGTAIL(pRExC_state, chain, latest);
12373         }
12374         chain = latest;
12375         c++;
12376     }
12377     if (chain == 0) {   /* Loop ran zero times. */
12378         chain = reg_node(pRExC_state, NOTHING);
12379         if (ret == 0)
12380             ret = chain;
12381     }
12382     if (c == 1) {
12383         *flagp |= flags&SIMPLE;
12384     }
12385
12386     return ret;
12387 }
12388
12389 /*
12390  - regpiece - something followed by possible quantifier * + ? {n,m}
12391  *
12392  * Note that the branching code sequences used for ? and the general cases
12393  * of * and + are somewhat optimized:  they use the same NOTHING node as
12394  * both the endmarker for their branch list and the body of the last branch.
12395  * It might seem that this node could be dispensed with entirely, but the
12396  * endmarker role is not redundant.
12397  *
12398  * On success, returns the offset at which any next node should be placed into
12399  * the regex engine program being compiled.
12400  *
12401  * Returns 0 otherwise, with *flagp set to indicate why:
12402  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
12403  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
12404  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12405  */
12406 STATIC regnode_offset
12407 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12408 {
12409     regnode_offset ret;
12410     char op;
12411     char *next;
12412     I32 flags;
12413     const char * const origparse = RExC_parse;
12414     I32 min;
12415     I32 max = REG_INFTY;
12416 #ifdef RE_TRACK_PATTERN_OFFSETS
12417     char *parse_start;
12418 #endif
12419     const char *maxpos = NULL;
12420     UV uv;
12421
12422     /* Save the original in case we change the emitted regop to a FAIL. */
12423     const regnode_offset orig_emit = RExC_emit;
12424
12425     GET_RE_DEBUG_FLAGS_DECL;
12426
12427     PERL_ARGS_ASSERT_REGPIECE;
12428
12429     DEBUG_PARSE("piec");
12430
12431     ret = regatom(pRExC_state, &flags, depth+1);
12432     if (ret == 0) {
12433         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12434         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12435     }
12436
12437     op = *RExC_parse;
12438
12439     if (op == '{' && regcurly(RExC_parse)) {
12440         maxpos = NULL;
12441 #ifdef RE_TRACK_PATTERN_OFFSETS
12442         parse_start = RExC_parse; /* MJD */
12443 #endif
12444         next = RExC_parse + 1;
12445         while (isDIGIT(*next) || *next == ',') {
12446             if (*next == ',') {
12447                 if (maxpos)
12448                     break;
12449                 else
12450                     maxpos = next;
12451             }
12452             next++;
12453         }
12454         if (*next == '}') {             /* got one */
12455             const char* endptr;
12456             if (!maxpos)
12457                 maxpos = next;
12458             RExC_parse++;
12459             if (isDIGIT(*RExC_parse)) {
12460                 endptr = RExC_end;
12461                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12462                     vFAIL("Invalid quantifier in {,}");
12463                 if (uv >= REG_INFTY)
12464                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12465                 min = (I32)uv;
12466             } else {
12467                 min = 0;
12468             }
12469             if (*maxpos == ',')
12470                 maxpos++;
12471             else
12472                 maxpos = RExC_parse;
12473             if (isDIGIT(*maxpos)) {
12474                 endptr = RExC_end;
12475                 if (!grok_atoUV(maxpos, &uv, &endptr))
12476                     vFAIL("Invalid quantifier in {,}");
12477                 if (uv >= REG_INFTY)
12478                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12479                 max = (I32)uv;
12480             } else {
12481                 max = REG_INFTY;                /* meaning "infinity" */
12482             }
12483             RExC_parse = next;
12484             nextchar(pRExC_state);
12485             if (max < min) {    /* If can't match, warn and optimize to fail
12486                                    unconditionally */
12487                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12488                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12489                 NEXT_OFF(REGNODE_p(orig_emit)) =
12490                                     regarglen[OPFAIL] + NODE_STEP_REGNODE;
12491                 return ret;
12492             }
12493             else if (min == max && *RExC_parse == '?')
12494             {
12495                 ckWARN2reg(RExC_parse + 1,
12496                            "Useless use of greediness modifier '%c'",
12497                            *RExC_parse);
12498             }
12499
12500           do_curly:
12501             if ((flags&SIMPLE)) {
12502                 if (min == 0 && max == REG_INFTY) {
12503                     reginsert(pRExC_state, STAR, ret, depth+1);
12504                     MARK_NAUGHTY(4);
12505                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12506                     goto nest_check;
12507                 }
12508                 if (min == 1 && max == REG_INFTY) {
12509                     reginsert(pRExC_state, PLUS, ret, depth+1);
12510                     MARK_NAUGHTY(3);
12511                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12512                     goto nest_check;
12513                 }
12514                 MARK_NAUGHTY_EXP(2, 2);
12515                 reginsert(pRExC_state, CURLY, ret, depth+1);
12516                 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12517                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12518             }
12519             else {
12520                 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12521
12522                 FLAGS(REGNODE_p(w)) = 0;
12523                 REGTAIL(pRExC_state, ret, w);
12524                 if (RExC_use_BRANCHJ) {
12525                     reginsert(pRExC_state, LONGJMP, ret, depth+1);
12526                     reginsert(pRExC_state, NOTHING, ret, depth+1);
12527                     NEXT_OFF(REGNODE_p(ret)) = 3;       /* Go over LONGJMP. */
12528                 }
12529                 reginsert(pRExC_state, CURLYX, ret, depth+1);
12530                                 /* MJD hk */
12531                 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12532                 Set_Node_Length(REGNODE_p(ret),
12533                                 op == '{' ? (RExC_parse - parse_start) : 1);
12534
12535                 if (RExC_use_BRANCHJ)
12536                     NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
12537                                                        LONGJMP. */
12538                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
12539                 RExC_whilem_seen++;
12540                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
12541             }
12542             FLAGS(REGNODE_p(ret)) = 0;
12543
12544             if (min > 0)
12545                 *flagp = WORST;
12546             if (max > 0)
12547                 *flagp |= HASWIDTH;
12548             ARG1_SET(REGNODE_p(ret), (U16)min);
12549             ARG2_SET(REGNODE_p(ret), (U16)max);
12550             if (max == REG_INFTY)
12551                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12552
12553             goto nest_check;
12554         }
12555     }
12556
12557     if (!ISMULT1(op)) {
12558         *flagp = flags;
12559         return(ret);
12560     }
12561
12562 #if 0                           /* Now runtime fix should be reliable. */
12563
12564     /* if this is reinstated, don't forget to put this back into perldiag:
12565
12566             =item Regexp *+ operand could be empty at {#} in regex m/%s/
12567
12568            (F) The part of the regexp subject to either the * or + quantifier
12569            could match an empty string. The {#} shows in the regular
12570            expression about where the problem was discovered.
12571
12572     */
12573
12574     if (!(flags&HASWIDTH) && op != '?')
12575       vFAIL("Regexp *+ operand could be empty");
12576 #endif
12577
12578 #ifdef RE_TRACK_PATTERN_OFFSETS
12579     parse_start = RExC_parse;
12580 #endif
12581     nextchar(pRExC_state);
12582
12583     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12584
12585     if (op == '*') {
12586         min = 0;
12587         goto do_curly;
12588     }
12589     else if (op == '+') {
12590         min = 1;
12591         goto do_curly;
12592     }
12593     else if (op == '?') {
12594         min = 0; max = 1;
12595         goto do_curly;
12596     }
12597   nest_check:
12598     if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12599         ckWARN2reg(RExC_parse,
12600                    "%" UTF8f " matches null string many times",
12601                    UTF8fARG(UTF, (RExC_parse >= origparse
12602                                  ? RExC_parse - origparse
12603                                  : 0),
12604                    origparse));
12605     }
12606
12607     if (*RExC_parse == '?') {
12608         nextchar(pRExC_state);
12609         reginsert(pRExC_state, MINMOD, ret, depth+1);
12610         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
12611     }
12612     else if (*RExC_parse == '+') {
12613         regnode_offset ender;
12614         nextchar(pRExC_state);
12615         ender = reg_node(pRExC_state, SUCCEED);
12616         REGTAIL(pRExC_state, ret, ender);
12617         reginsert(pRExC_state, SUSPEND, ret, depth+1);
12618         ender = reg_node(pRExC_state, TAIL);
12619         REGTAIL(pRExC_state, ret, ender);
12620     }
12621
12622     if (ISMULT2(RExC_parse)) {
12623         RExC_parse++;
12624         vFAIL("Nested quantifiers");
12625     }
12626
12627     return(ret);
12628 }
12629
12630 STATIC bool
12631 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12632                 regnode_offset * node_p,
12633                 UV * code_point_p,
12634                 int * cp_count,
12635                 I32 * flagp,
12636                 const bool strict,
12637                 const U32 depth
12638     )
12639 {
12640  /* This routine teases apart the various meanings of \N and returns
12641   * accordingly.  The input parameters constrain which meaning(s) is/are valid
12642   * in the current context.
12643   *
12644   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12645   *
12646   * If <code_point_p> is not NULL, the context is expecting the result to be a
12647   * single code point.  If this \N instance turns out to a single code point,
12648   * the function returns TRUE and sets *code_point_p to that code point.
12649   *
12650   * If <node_p> is not NULL, the context is expecting the result to be one of
12651   * the things representable by a regnode.  If this \N instance turns out to be
12652   * one such, the function generates the regnode, returns TRUE and sets *node_p
12653   * to point to the offset of that regnode into the regex engine program being
12654   * compiled.
12655   *
12656   * If this instance of \N isn't legal in any context, this function will
12657   * generate a fatal error and not return.
12658   *
12659   * On input, RExC_parse should point to the first char following the \N at the
12660   * time of the call.  On successful return, RExC_parse will have been updated
12661   * to point to just after the sequence identified by this routine.  Also
12662   * *flagp has been updated as needed.
12663   *
12664   * When there is some problem with the current context and this \N instance,
12665   * the function returns FALSE, without advancing RExC_parse, nor setting
12666   * *node_p, nor *code_point_p, nor *flagp.
12667   *
12668   * If <cp_count> is not NULL, the caller wants to know the length (in code
12669   * points) that this \N sequence matches.  This is set, and the input is
12670   * parsed for errors, even if the function returns FALSE, as detailed below.
12671   *
12672   * There are 5 possibilities here, as detailed in the next 5 paragraphs.
12673   *
12674   * Probably the most common case is for the \N to specify a single code point.
12675   * *cp_count will be set to 1, and *code_point_p will be set to that code
12676   * point.
12677   *
12678   * Another possibility is for the input to be an empty \N{}, which for
12679   * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
12680   * will be set to a generated NOTHING node.
12681   *
12682   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12683   * set to 0. *node_p will be set to a generated REG_ANY node.
12684   *
12685   * The fourth possibility is that \N resolves to a sequence of more than one
12686   * code points.  *cp_count will be set to the number of code points in the
12687   * sequence. *node_p will be set to a generated node returned by this
12688   * function calling S_reg().
12689   *
12690   * The final possibility is that it is premature to be calling this function;
12691   * the parse needs to be restarted.  This can happen when this changes from
12692   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
12693   * latter occurs only when the fourth possibility would otherwise be in
12694   * effect, and is because one of those code points requires the pattern to be
12695   * recompiled as UTF-8.  The function returns FALSE, and sets the
12696   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
12697   * happens, the caller needs to desist from continuing parsing, and return
12698   * this information to its caller.  This is not set for when there is only one
12699   * code point, as this can be called as part of an ANYOF node, and they can
12700   * store above-Latin1 code points without the pattern having to be in UTF-8.
12701   *
12702   * For non-single-quoted regexes, the tokenizer has resolved character and
12703   * sequence names inside \N{...} into their Unicode values, normalizing the
12704   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12705   * hex-represented code points in the sequence.  This is done there because
12706   * the names can vary based on what charnames pragma is in scope at the time,
12707   * so we need a way to take a snapshot of what they resolve to at the time of
12708   * the original parse. [perl #56444].
12709   *
12710   * That parsing is skipped for single-quoted regexes, so we may here get
12711   * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
12712   * parser.  But if the single-quoted regex is something like '\N{U+41}', that
12713   * is legal and handled here.  The code point is Unicode, and has to be
12714   * translated into the native character set for non-ASCII platforms.
12715   */
12716
12717     char * endbrace;    /* points to '}' following the name */
12718     char* p = RExC_parse; /* Temporary */
12719
12720     SV * substitute_parse = NULL;
12721     char *orig_end;
12722     char *save_start;
12723     I32 flags;
12724     Size_t count = 0;   /* code point count kept internally by this function */
12725
12726     GET_RE_DEBUG_FLAGS_DECL;
12727
12728     PERL_ARGS_ASSERT_GROK_BSLASH_N;
12729
12730     GET_RE_DEBUG_FLAGS;
12731
12732     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
12733     assert(! (node_p && cp_count));               /* At most 1 should be set */
12734
12735     if (cp_count) {     /* Initialize return for the most common case */
12736         *cp_count = 1;
12737     }
12738
12739     /* The [^\n] meaning of \N ignores spaces and comments under the /x
12740      * modifier.  The other meanings do not, so use a temporary until we find
12741      * out which we are being called with */
12742     skip_to_be_ignored_text(pRExC_state, &p,
12743                             FALSE /* Don't force to /x */ );
12744
12745     /* Disambiguate between \N meaning a named character versus \N meaning
12746      * [^\n].  The latter is assumed when the {...} following the \N is a legal
12747      * quantifier, or there is no '{' at all */
12748     if (*p != '{' || regcurly(p)) {
12749         RExC_parse = p;
12750         if (cp_count) {
12751             *cp_count = -1;
12752         }
12753
12754         if (! node_p) {
12755             return FALSE;
12756         }
12757
12758         *node_p = reg_node(pRExC_state, REG_ANY);
12759         *flagp |= HASWIDTH|SIMPLE;
12760         MARK_NAUGHTY(1);
12761         Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12762         return TRUE;
12763     }
12764
12765     /* The test above made sure that the next real character is a '{', but
12766      * under the /x modifier, it could be separated by space (or a comment and
12767      * \n) and this is not allowed (for consistency with \x{...} and the
12768      * tokenizer handling of \N{NAME}). */
12769     if (*RExC_parse != '{') {
12770         vFAIL("Missing braces on \\N{}");
12771     }
12772
12773     RExC_parse++;       /* Skip past the '{' */
12774
12775     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12776     if (! endbrace) { /* no trailing brace */
12777         vFAIL2("Missing right brace on \\%c{}", 'N');
12778     }
12779
12780     /* Here, we have decided it should be a named character or sequence */
12781     REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
12782                                         semantics */
12783
12784     if (endbrace == RExC_parse) {   /* empty: \N{} */
12785         if (strict) {
12786             RExC_parse++;   /* Position after the "}" */
12787             vFAIL("Zero length \\N{}");
12788         }
12789         if (cp_count) {
12790             *cp_count = 0;
12791         }
12792         nextchar(pRExC_state);
12793         if (! node_p) {
12794             return FALSE;
12795         }
12796
12797         *node_p = reg_node(pRExC_state, NOTHING);
12798         return TRUE;
12799     }
12800
12801     /* If we haven't got something that begins with 'U+', then it didn't get lexed. */
12802     if (   endbrace - RExC_parse < 2
12803         || strnNE(RExC_parse, "U+", 2))
12804     {
12805         RExC_parse = endbrace;  /* position msg's '<--HERE' */
12806         vFAIL("\\N{NAME} must be resolved by the lexer");
12807     }
12808
12809         /* This code purposely indented below because of future changes coming */
12810
12811         /* We can get to here when the input is \N{U+...} or when toke.c has
12812          * converted a name to the \N{U+...} form.  This include changing a
12813          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
12814
12815         RExC_parse += 2;    /* Skip past the 'U+' */
12816
12817         /* Code points are separated by dots.  The '}' terminates the whole
12818          * thing. */
12819
12820         do {    /* Loop until the ending brace */
12821             UV cp = 0;
12822             char * start_digit;     /* The first of the current code point */
12823             if (! isXDIGIT(*RExC_parse)) {
12824                 RExC_parse++;
12825                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12826             }
12827
12828             start_digit = RExC_parse;
12829             count++;
12830
12831             /* Loop through the hex digits of the current code point */
12832             do {
12833                 /* Adding this digit will shift the result 4 bits.  If that
12834                  * result would be above the legal max, it's overflow */
12835                 if (cp > MAX_LEGAL_CP >> 4) {
12836
12837                     /* Find the end of the code point */
12838                     do {
12839                         RExC_parse ++;
12840                     } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
12841
12842                     /* Be sure to synchronize this message with the similar one
12843                      * in utf8.c */
12844                     vFAIL4("Use of code point 0x%.*s is not allowed; the"
12845                         " permissible max is 0x%" UVxf,
12846                         (int) (RExC_parse - start_digit), start_digit,
12847                         MAX_LEGAL_CP);
12848                 }
12849
12850                 /* Accumulate this (valid) digit into the running total */
12851                 cp  = (cp << 4) + READ_XDIGIT(RExC_parse);
12852
12853                 /* READ_XDIGIT advanced the input pointer.  Ignore a single
12854                  * underscore separator */
12855                 if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
12856                     RExC_parse++;
12857                 }
12858             } while (isXDIGIT(*RExC_parse));
12859
12860             /* Here, have accumulated the next code point */
12861             if (RExC_parse >= endbrace) {   /* If done ... */
12862                 if (count != 1) {
12863                     goto do_concat;
12864                 }
12865
12866                 /* Here, is a single code point; fail if doesn't want that */
12867                 if (! code_point_p) {
12868                     RExC_parse = p;
12869                     return FALSE;
12870                 }
12871
12872                 /* A single code point is easy to handle; just return it */
12873                 *code_point_p = UNI_TO_NATIVE(cp);
12874                 RExC_parse = endbrace;
12875                 nextchar(pRExC_state);
12876                 return TRUE;
12877             }
12878
12879             /* Here, the only legal thing would be a multiple character
12880              * sequence (of the form "\N{U+c1.c2. ... }".   So the next
12881              * character must be a dot (and the one after that can't be the
12882              * endbrace, or we'd have something like \N{U+100.} ) */
12883             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
12884                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
12885                                 ? UTF8SKIP(RExC_parse)
12886                                 : 1;
12887                 if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
12888                     RExC_parse = endbrace;
12889                 }
12890                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12891             }
12892
12893             /* Here, looks like its really a multiple character sequence.  Fail
12894              * if that's not what the caller wants.  But continue with counting
12895              * and error checking if they still want a count */
12896             if (! node_p && ! cp_count) {
12897                 return FALSE;
12898             }
12899
12900             /* What is done here is to convert this to a sub-pattern of the
12901              * form \x{char1}\x{char2}...  and then call reg recursively to
12902              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
12903              * atomicness, while not having to worry about special handling
12904              * that some code points may have.  We don't create a subpattern,
12905              * but go through the motions of code point counting and error
12906              * checking, if the caller doesn't want a node returned. */
12907
12908             if (node_p && count == 1) {
12909                 substitute_parse = newSVpvs("?:");
12910             }
12911
12912           do_concat:
12913
12914             if (node_p) {
12915                 /* Convert to notation the rest of the code understands */
12916                 sv_catpvs(substitute_parse, "\\x{");
12917                 sv_catpvn(substitute_parse, start_digit,
12918                                             RExC_parse - start_digit);
12919                 sv_catpvs(substitute_parse, "}");
12920             }
12921
12922             /* Move to after the dot (or ending brace the final time through.)
12923              * */
12924             RExC_parse++;
12925             count++;
12926
12927         } while (RExC_parse < endbrace);
12928
12929         if (! node_p) { /* Doesn't want the node */
12930             assert (cp_count);
12931
12932             *cp_count = count;
12933             return FALSE;
12934         }
12935
12936         sv_catpvs(substitute_parse, ")");
12937
12938 #ifdef EBCDIC
12939         /* The values are Unicode, and therefore have to be converted to native
12940          * on a non-Unicode (meaning non-ASCII) platform. */
12941         RExC_recode_x_to_native = 1;
12942 #endif
12943
12944     /* Here, we have the string the name evaluates to, ready to be parsed,
12945      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
12946      * constructs.  This can be called from within a substitute parse already.
12947      * The error reporting mechanism doesn't work for 2 levels of this, but the
12948      * code above has validated this new construct, so there should be no
12949      * errors generated by the below.  And this isn' an exact copy, so the
12950      * mechanism to seamlessly deal with this won't work, so turn off warnings
12951      * during it */
12952     save_start = RExC_start;
12953     orig_end = RExC_end;
12954
12955     RExC_parse = RExC_start = SvPVX(substitute_parse);
12956     RExC_end = RExC_parse + SvCUR(substitute_parse);
12957     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
12958
12959     *node_p = reg(pRExC_state, 1, &flags, depth+1);
12960
12961     /* Restore the saved values */
12962     RESTORE_WARNINGS;
12963     RExC_start = save_start;
12964     RExC_parse = endbrace;
12965     RExC_end = orig_end;
12966 #ifdef EBCDIC
12967     RExC_recode_x_to_native = 0;
12968 #endif
12969
12970     SvREFCNT_dec_NN(substitute_parse);
12971
12972     if (! *node_p) {
12973         RETURN_FAIL_ON_RESTART(flags, flagp);
12974         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
12975             (UV) flags);
12976     }
12977     *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12978
12979     nextchar(pRExC_state);
12980
12981     return TRUE;
12982 }
12983
12984
12985 PERL_STATIC_INLINE U8
12986 S_compute_EXACTish(RExC_state_t *pRExC_state)
12987 {
12988     U8 op;
12989
12990     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
12991
12992     if (! FOLD) {
12993         return (LOC)
12994                 ? EXACTL
12995                 : EXACT;
12996     }
12997
12998     op = get_regex_charset(RExC_flags);
12999     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13000         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13001                  been, so there is no hole */
13002     }
13003
13004     return op + EXACTF;
13005 }
13006
13007 STATIC bool
13008 S_new_regcurly(const char *s, const char *e)
13009 {
13010     /* This is a temporary function designed to match the most lenient form of
13011      * a {m,n} quantifier we ever envision, with either number omitted, and
13012      * spaces anywhere between/before/after them.
13013      *
13014      * If this function fails, then the string it matches is very unlikely to
13015      * ever be considered a valid quantifier, so we can allow the '{' that
13016      * begins it to be considered as a literal */
13017
13018     bool has_min = FALSE;
13019     bool has_max = FALSE;
13020
13021     PERL_ARGS_ASSERT_NEW_REGCURLY;
13022
13023     if (s >= e || *s++ != '{')
13024         return FALSE;
13025
13026     while (s < e && isSPACE(*s)) {
13027         s++;
13028     }
13029     while (s < e && isDIGIT(*s)) {
13030         has_min = TRUE;
13031         s++;
13032     }
13033     while (s < e && isSPACE(*s)) {
13034         s++;
13035     }
13036
13037     if (*s == ',') {
13038         s++;
13039         while (s < e && isSPACE(*s)) {
13040             s++;
13041         }
13042         while (s < e && isDIGIT(*s)) {
13043             has_max = TRUE;
13044             s++;
13045         }
13046         while (s < e && isSPACE(*s)) {
13047             s++;
13048         }
13049     }
13050
13051     return s < e && *s == '}' && (has_min || has_max);
13052 }
13053
13054 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13055  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13056
13057 static I32
13058 S_backref_value(char *p, char *e)
13059 {
13060     const char* endptr = e;
13061     UV val;
13062     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13063         return (I32)val;
13064     return I32_MAX;
13065 }
13066
13067
13068 /*
13069  - regatom - the lowest level
13070
13071    Try to identify anything special at the start of the current parse position.
13072    If there is, then handle it as required. This may involve generating a
13073    single regop, such as for an assertion; or it may involve recursing, such as
13074    to handle a () structure.
13075
13076    If the string doesn't start with something special then we gobble up
13077    as much literal text as we can.  If we encounter a quantifier, we have to
13078    back off the final literal character, as that quantifier applies to just it
13079    and not to the whole string of literals.
13080
13081    Once we have been able to handle whatever type of thing started the
13082    sequence, we return the offset into the regex engine program being compiled
13083    at which any  next regnode should be placed.
13084
13085    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13086    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13087    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13088    Otherwise does not return 0.
13089
13090    Note: we have to be careful with escapes, as they can be both literal
13091    and special, and in the case of \10 and friends, context determines which.
13092
13093    A summary of the code structure is:
13094
13095    switch (first_byte) {
13096         cases for each special:
13097             handle this special;
13098             break;
13099         case '\\':
13100             switch (2nd byte) {
13101                 cases for each unambiguous special:
13102                     handle this special;
13103                     break;
13104                 cases for each ambigous special/literal:
13105                     disambiguate;
13106                     if (special)  handle here
13107                     else goto defchar;
13108                 default: // unambiguously literal:
13109                     goto defchar;
13110             }
13111         default:  // is a literal char
13112             // FALL THROUGH
13113         defchar:
13114             create EXACTish node for literal;
13115             while (more input and node isn't full) {
13116                 switch (input_byte) {
13117                    cases for each special;
13118                        make sure parse pointer is set so that the next call to
13119                            regatom will see this special first
13120                        goto loopdone; // EXACTish node terminated by prev. char
13121                    default:
13122                        append char to EXACTISH node;
13123                 }
13124                 get next input byte;
13125             }
13126         loopdone:
13127    }
13128    return the generated node;
13129
13130    Specifically there are two separate switches for handling
13131    escape sequences, with the one for handling literal escapes requiring
13132    a dummy entry for all of the special escapes that are actually handled
13133    by the other.
13134
13135 */
13136
13137 STATIC regnode_offset
13138 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13139 {
13140     regnode_offset ret = 0;
13141     I32 flags = 0;
13142     char *parse_start;
13143     U8 op;
13144     int invert = 0;
13145     U8 arg;
13146
13147     GET_RE_DEBUG_FLAGS_DECL;
13148
13149     *flagp = WORST;             /* Tentatively. */
13150
13151     DEBUG_PARSE("atom");
13152
13153     PERL_ARGS_ASSERT_REGATOM;
13154
13155   tryagain:
13156     parse_start = RExC_parse;
13157     assert(RExC_parse < RExC_end);
13158     switch ((U8)*RExC_parse) {
13159     case '^':
13160         RExC_seen_zerolen++;
13161         nextchar(pRExC_state);
13162         if (RExC_flags & RXf_PMf_MULTILINE)
13163             ret = reg_node(pRExC_state, MBOL);
13164         else
13165             ret = reg_node(pRExC_state, SBOL);
13166         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13167         break;
13168     case '$':
13169         nextchar(pRExC_state);
13170         if (*RExC_parse)
13171             RExC_seen_zerolen++;
13172         if (RExC_flags & RXf_PMf_MULTILINE)
13173             ret = reg_node(pRExC_state, MEOL);
13174         else
13175             ret = reg_node(pRExC_state, SEOL);
13176         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13177         break;
13178     case '.':
13179         nextchar(pRExC_state);
13180         if (RExC_flags & RXf_PMf_SINGLELINE)
13181             ret = reg_node(pRExC_state, SANY);
13182         else
13183             ret = reg_node(pRExC_state, REG_ANY);
13184         *flagp |= HASWIDTH|SIMPLE;
13185         MARK_NAUGHTY(1);
13186         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13187         break;
13188     case '[':
13189     {
13190         char * const oregcomp_parse = ++RExC_parse;
13191         ret = regclass(pRExC_state, flagp, depth+1,
13192                        FALSE, /* means parse the whole char class */
13193                        TRUE, /* allow multi-char folds */
13194                        FALSE, /* don't silence non-portable warnings. */
13195                        (bool) RExC_strict,
13196                        TRUE, /* Allow an optimized regnode result */
13197                        NULL);
13198         if (ret == 0) {
13199             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13200             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13201                   (UV) *flagp);
13202         }
13203         if (*RExC_parse != ']') {
13204             RExC_parse = oregcomp_parse;
13205             vFAIL("Unmatched [");
13206         }
13207         nextchar(pRExC_state);
13208         Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13209         break;
13210     }
13211     case '(':
13212         nextchar(pRExC_state);
13213         ret = reg(pRExC_state, 2, &flags, depth+1);
13214         if (ret == 0) {
13215                 if (flags & TRYAGAIN) {
13216                     if (RExC_parse >= RExC_end) {
13217                          /* Make parent create an empty node if needed. */
13218                         *flagp |= TRYAGAIN;
13219                         return(0);
13220                     }
13221                     goto tryagain;
13222                 }
13223                 RETURN_FAIL_ON_RESTART(flags, flagp);
13224                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13225                                                                  (UV) flags);
13226         }
13227         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13228         break;
13229     case '|':
13230     case ')':
13231         if (flags & TRYAGAIN) {
13232             *flagp |= TRYAGAIN;
13233             return 0;
13234         }
13235         vFAIL("Internal urp");
13236                                 /* Supposed to be caught earlier. */
13237         break;
13238     case '?':
13239     case '+':
13240     case '*':
13241         RExC_parse++;
13242         vFAIL("Quantifier follows nothing");
13243         break;
13244     case '\\':
13245         /* Special Escapes
13246
13247            This switch handles escape sequences that resolve to some kind
13248            of special regop and not to literal text. Escape sequences that
13249            resolve to literal text are handled below in the switch marked
13250            "Literal Escapes".
13251
13252            Every entry in this switch *must* have a corresponding entry
13253            in the literal escape switch. However, the opposite is not
13254            required, as the default for this switch is to jump to the
13255            literal text handling code.
13256         */
13257         RExC_parse++;
13258         switch ((U8)*RExC_parse) {
13259         /* Special Escapes */
13260         case 'A':
13261             RExC_seen_zerolen++;
13262             ret = reg_node(pRExC_state, SBOL);
13263             /* SBOL is shared with /^/ so we set the flags so we can tell
13264              * /\A/ from /^/ in split. */
13265             FLAGS(REGNODE_p(ret)) = 1;
13266             *flagp |= SIMPLE;
13267             goto finish_meta_pat;
13268         case 'G':
13269             ret = reg_node(pRExC_state, GPOS);
13270             RExC_seen |= REG_GPOS_SEEN;
13271             *flagp |= SIMPLE;
13272             goto finish_meta_pat;
13273         case 'K':
13274             RExC_seen_zerolen++;
13275             ret = reg_node(pRExC_state, KEEPS);
13276             *flagp |= SIMPLE;
13277             /* XXX:dmq : disabling in-place substitution seems to
13278              * be necessary here to avoid cases of memory corruption, as
13279              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13280              */
13281             RExC_seen |= REG_LOOKBEHIND_SEEN;
13282             goto finish_meta_pat;
13283         case 'Z':
13284             ret = reg_node(pRExC_state, SEOL);
13285             *flagp |= SIMPLE;
13286             RExC_seen_zerolen++;                /* Do not optimize RE away */
13287             goto finish_meta_pat;
13288         case 'z':
13289             ret = reg_node(pRExC_state, EOS);
13290             *flagp |= SIMPLE;
13291             RExC_seen_zerolen++;                /* Do not optimize RE away */
13292             goto finish_meta_pat;
13293         case 'C':
13294             vFAIL("\\C no longer supported");
13295         case 'X':
13296             ret = reg_node(pRExC_state, CLUMP);
13297             *flagp |= HASWIDTH;
13298             goto finish_meta_pat;
13299
13300         case 'W':
13301             invert = 1;
13302             /* FALLTHROUGH */
13303         case 'w':
13304             arg = ANYOF_WORDCHAR;
13305             goto join_posix;
13306
13307         case 'B':
13308             invert = 1;
13309             /* FALLTHROUGH */
13310         case 'b':
13311           {
13312             U8 flags = 0;
13313             regex_charset charset = get_regex_charset(RExC_flags);
13314
13315             RExC_seen_zerolen++;
13316             RExC_seen |= REG_LOOKBEHIND_SEEN;
13317             op = BOUND + charset;
13318
13319             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13320                 flags = TRADITIONAL_BOUND;
13321                 if (op > BOUNDA) {  /* /aa is same as /a */
13322                     op = BOUNDA;
13323                 }
13324             }
13325             else {
13326                 STRLEN length;
13327                 char name = *RExC_parse;
13328                 char * endbrace = NULL;
13329                 RExC_parse += 2;
13330                 if (RExC_parse < RExC_end) {
13331                     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13332                 }
13333
13334                 if (! endbrace) {
13335                     vFAIL2("Missing right brace on \\%c{}", name);
13336                 }
13337                 /* XXX Need to decide whether to take spaces or not.  Should be
13338                  * consistent with \p{}, but that currently is SPACE, which
13339                  * means vertical too, which seems wrong
13340                  * while (isBLANK(*RExC_parse)) {
13341                     RExC_parse++;
13342                 }*/
13343                 if (endbrace == RExC_parse) {
13344                     RExC_parse++;  /* After the '}' */
13345                     vFAIL2("Empty \\%c{}", name);
13346                 }
13347                 length = endbrace - RExC_parse;
13348                 /*while (isBLANK(*(RExC_parse + length - 1))) {
13349                     length--;
13350                 }*/
13351                 switch (*RExC_parse) {
13352                     case 'g':
13353                         if (    length != 1
13354                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
13355                         {
13356                             goto bad_bound_type;
13357                         }
13358                         flags = GCB_BOUND;
13359                         break;
13360                     case 'l':
13361                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13362                             goto bad_bound_type;
13363                         }
13364                         flags = LB_BOUND;
13365                         break;
13366                     case 's':
13367                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13368                             goto bad_bound_type;
13369                         }
13370                         flags = SB_BOUND;
13371                         break;
13372                     case 'w':
13373                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13374                             goto bad_bound_type;
13375                         }
13376                         flags = WB_BOUND;
13377                         break;
13378                     default:
13379                       bad_bound_type:
13380                         RExC_parse = endbrace;
13381                         vFAIL2utf8f(
13382                             "'%" UTF8f "' is an unknown bound type",
13383                             UTF8fARG(UTF, length, endbrace - length));
13384                         NOT_REACHED; /*NOTREACHED*/
13385                 }
13386                 RExC_parse = endbrace;
13387                 REQUIRE_UNI_RULES(flagp, 0);
13388
13389                 if (op == BOUND) {
13390                     op = BOUNDU;
13391                 }
13392                 else if (op >= BOUNDA) {  /* /aa is same as /a */
13393                     op = BOUNDU;
13394                     length += 4;
13395
13396                     /* Don't have to worry about UTF-8, in this message because
13397                      * to get here the contents of the \b must be ASCII */
13398                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
13399                               "Using /u for '%.*s' instead of /%s",
13400                               (unsigned) length,
13401                               endbrace - length + 1,
13402                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13403                               ? ASCII_RESTRICT_PAT_MODS
13404                               : ASCII_MORE_RESTRICT_PAT_MODS);
13405                 }
13406             }
13407
13408             if (op == BOUND) {
13409                 RExC_seen_d_op = TRUE;
13410             }
13411             else if (op == BOUNDL) {
13412                 RExC_contains_locale = 1;
13413             }
13414
13415             if (invert) {
13416                 op += NBOUND - BOUND;
13417             }
13418
13419             ret = reg_node(pRExC_state, op);
13420             FLAGS(REGNODE_p(ret)) = flags;
13421
13422             *flagp |= SIMPLE;
13423
13424             goto finish_meta_pat;
13425           }
13426
13427         case 'D':
13428             invert = 1;
13429             /* FALLTHROUGH */
13430         case 'd':
13431             arg = ANYOF_DIGIT;
13432             if (! DEPENDS_SEMANTICS) {
13433                 goto join_posix;
13434             }
13435
13436             /* \d doesn't have any matches in the upper Latin1 range, hence /d
13437              * is equivalent to /u.  Changing to /u saves some branches at
13438              * runtime */
13439             op = POSIXU;
13440             goto join_posix_op_known;
13441
13442         case 'R':
13443             ret = reg_node(pRExC_state, LNBREAK);
13444             *flagp |= HASWIDTH|SIMPLE;
13445             goto finish_meta_pat;
13446
13447         case 'H':
13448             invert = 1;
13449             /* FALLTHROUGH */
13450         case 'h':
13451             arg = ANYOF_BLANK;
13452             op = POSIXU;
13453             goto join_posix_op_known;
13454
13455         case 'V':
13456             invert = 1;
13457             /* FALLTHROUGH */
13458         case 'v':
13459             arg = ANYOF_VERTWS;
13460             op = POSIXU;
13461             goto join_posix_op_known;
13462
13463         case 'S':
13464             invert = 1;
13465             /* FALLTHROUGH */
13466         case 's':
13467             arg = ANYOF_SPACE;
13468
13469           join_posix:
13470
13471             op = POSIXD + get_regex_charset(RExC_flags);
13472             if (op > POSIXA) {  /* /aa is same as /a */
13473                 op = POSIXA;
13474             }
13475             else if (op == POSIXL) {
13476                 RExC_contains_locale = 1;
13477             }
13478             else if (op == POSIXD) {
13479                 RExC_seen_d_op = TRUE;
13480             }
13481
13482           join_posix_op_known:
13483
13484             if (invert) {
13485                 op += NPOSIXD - POSIXD;
13486             }
13487
13488             ret = reg_node(pRExC_state, op);
13489             FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
13490
13491             *flagp |= HASWIDTH|SIMPLE;
13492             /* FALLTHROUGH */
13493
13494           finish_meta_pat:
13495             if (   UCHARAT(RExC_parse + 1) == '{'
13496                 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13497             {
13498                 RExC_parse += 2;
13499                 vFAIL("Unescaped left brace in regex is illegal here");
13500             }
13501             nextchar(pRExC_state);
13502             Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
13503             break;
13504         case 'p':
13505         case 'P':
13506             RExC_parse--;
13507
13508             ret = regclass(pRExC_state, flagp, depth+1,
13509                            TRUE, /* means just parse this element */
13510                            FALSE, /* don't allow multi-char folds */
13511                            FALSE, /* don't silence non-portable warnings.  It
13512                                      would be a bug if these returned
13513                                      non-portables */
13514                            (bool) RExC_strict,
13515                            TRUE, /* Allow an optimized regnode result */
13516                            NULL);
13517             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13518             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13519              * multi-char folds are allowed.  */
13520             if (!ret)
13521                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13522                       (UV) *flagp);
13523
13524             RExC_parse--;
13525
13526             Set_Node_Offset(REGNODE_p(ret), parse_start);
13527             Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
13528             nextchar(pRExC_state);
13529             break;
13530         case 'N':
13531             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13532              * \N{...} evaluates to a sequence of more than one code points).
13533              * The function call below returns a regnode, which is our result.
13534              * The parameters cause it to fail if the \N{} evaluates to a
13535              * single code point; we handle those like any other literal.  The
13536              * reason that the multicharacter case is handled here and not as
13537              * part of the EXACtish code is because of quantifiers.  In
13538              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13539              * this way makes that Just Happen. dmq.
13540              * join_exact() will join this up with adjacent EXACTish nodes
13541              * later on, if appropriate. */
13542             ++RExC_parse;
13543             if (grok_bslash_N(pRExC_state,
13544                               &ret,     /* Want a regnode returned */
13545                               NULL,     /* Fail if evaluates to a single code
13546                                            point */
13547                               NULL,     /* Don't need a count of how many code
13548                                            points */
13549                               flagp,
13550                               RExC_strict,
13551                               depth)
13552             ) {
13553                 break;
13554             }
13555
13556             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13557
13558             /* Here, evaluates to a single code point.  Go get that */
13559             RExC_parse = parse_start;
13560             goto defchar;
13561
13562         case 'k':    /* Handle \k<NAME> and \k'NAME' */
13563       parse_named_seq:
13564         {
13565             char ch;
13566             if (   RExC_parse >= RExC_end - 1
13567                 || ((   ch = RExC_parse[1]) != '<'
13568                                       && ch != '\''
13569                                       && ch != '{'))
13570             {
13571                 RExC_parse++;
13572                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13573                 vFAIL2("Sequence %.2s... not terminated", parse_start);
13574             } else {
13575                 RExC_parse += 2;
13576                 ret = handle_named_backref(pRExC_state,
13577                                            flagp,
13578                                            parse_start,
13579                                            (ch == '<')
13580                                            ? '>'
13581                                            : (ch == '{')
13582                                              ? '}'
13583                                              : '\'');
13584             }
13585             break;
13586         }
13587         case 'g':
13588         case '1': case '2': case '3': case '4':
13589         case '5': case '6': case '7': case '8': case '9':
13590             {
13591                 I32 num;
13592                 bool hasbrace = 0;
13593
13594                 if (*RExC_parse == 'g') {
13595                     bool isrel = 0;
13596
13597                     RExC_parse++;
13598                     if (*RExC_parse == '{') {
13599                         RExC_parse++;
13600                         hasbrace = 1;
13601                     }
13602                     if (*RExC_parse == '-') {
13603                         RExC_parse++;
13604                         isrel = 1;
13605                     }
13606                     if (hasbrace && !isDIGIT(*RExC_parse)) {
13607                         if (isrel) RExC_parse--;
13608                         RExC_parse -= 2;
13609                         goto parse_named_seq;
13610                     }
13611
13612                     if (RExC_parse >= RExC_end) {
13613                         goto unterminated_g;
13614                     }
13615                     num = S_backref_value(RExC_parse, RExC_end);
13616                     if (num == 0)
13617                         vFAIL("Reference to invalid group 0");
13618                     else if (num == I32_MAX) {
13619                          if (isDIGIT(*RExC_parse))
13620                             vFAIL("Reference to nonexistent group");
13621                         else
13622                           unterminated_g:
13623                             vFAIL("Unterminated \\g... pattern");
13624                     }
13625
13626                     if (isrel) {
13627                         num = RExC_npar - num;
13628                         if (num < 1)
13629                             vFAIL("Reference to nonexistent or unclosed group");
13630                     }
13631                 }
13632                 else {
13633                     num = S_backref_value(RExC_parse, RExC_end);
13634                     /* bare \NNN might be backref or octal - if it is larger
13635                      * than or equal RExC_npar then it is assumed to be an
13636                      * octal escape. Note RExC_npar is +1 from the actual
13637                      * number of parens. */
13638                     /* Note we do NOT check if num == I32_MAX here, as that is
13639                      * handled by the RExC_npar check */
13640
13641                     if (
13642                         /* any numeric escape < 10 is always a backref */
13643                         num > 9
13644                         /* any numeric escape < RExC_npar is a backref */
13645                         && num >= RExC_npar
13646                         /* cannot be an octal escape if it starts with 8 */
13647                         && *RExC_parse != '8'
13648                         /* cannot be an octal escape it it starts with 9 */
13649                         && *RExC_parse != '9'
13650                     ) {
13651                         /* Probably not meant to be a backref, instead likely
13652                          * to be an octal character escape, e.g. \35 or \777.
13653                          * The above logic should make it obvious why using
13654                          * octal escapes in patterns is problematic. - Yves */
13655                         RExC_parse = parse_start;
13656                         goto defchar;
13657                     }
13658                 }
13659
13660                 /* At this point RExC_parse points at a numeric escape like
13661                  * \12 or \88 or something similar, which we should NOT treat
13662                  * as an octal escape. It may or may not be a valid backref
13663                  * escape. For instance \88888888 is unlikely to be a valid
13664                  * backref. */
13665                 while (isDIGIT(*RExC_parse))
13666                     RExC_parse++;
13667                 if (hasbrace) {
13668                     if (*RExC_parse != '}')
13669                         vFAIL("Unterminated \\g{...} pattern");
13670                     RExC_parse++;
13671                 }
13672                 if (num >= (I32)RExC_npar) {
13673
13674                     /* It might be a forward reference; we can't fail until we
13675                      * know, by completing the parse to get all the groups, and
13676                      * then reparsing */
13677                     if (RExC_total_parens > 0)  {
13678                         if (num >= RExC_total_parens)  {
13679                             vFAIL("Reference to nonexistent group");
13680                         }
13681                     }
13682                     else {
13683                         REQUIRE_PARENS_PASS;
13684                     }
13685                 }
13686                 RExC_sawback = 1;
13687                 ret = reganode(pRExC_state,
13688                                ((! FOLD)
13689                                  ? REF
13690                                  : (ASCII_FOLD_RESTRICTED)
13691                                    ? REFFA
13692                                    : (AT_LEAST_UNI_SEMANTICS)
13693                                      ? REFFU
13694                                      : (LOC)
13695                                        ? REFFL
13696                                        : REFF),
13697                                 num);
13698                 if (OP(REGNODE_p(ret)) == REFF) {
13699                     RExC_seen_d_op = TRUE;
13700                 }
13701                 *flagp |= HASWIDTH;
13702
13703                 /* override incorrect value set in reganode MJD */
13704                 Set_Node_Offset(REGNODE_p(ret), parse_start);
13705                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13706                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13707                                         FALSE /* Don't force to /x */ );
13708             }
13709             break;
13710         case '\0':
13711             if (RExC_parse >= RExC_end)
13712                 FAIL("Trailing \\");
13713             /* FALLTHROUGH */
13714         default:
13715             /* Do not generate "unrecognized" warnings here, we fall
13716                back into the quick-grab loop below */
13717             RExC_parse = parse_start;
13718             goto defchar;
13719         } /* end of switch on a \foo sequence */
13720         break;
13721
13722     case '#':
13723
13724         /* '#' comments should have been spaced over before this function was
13725          * called */
13726         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13727         /*
13728         if (RExC_flags & RXf_PMf_EXTENDED) {
13729             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13730             if (RExC_parse < RExC_end)
13731                 goto tryagain;
13732         }
13733         */
13734
13735         /* FALLTHROUGH */
13736
13737     default:
13738           defchar: {
13739
13740             /* Here, we have determined that the next thing is probably a
13741              * literal character.  RExC_parse points to the first byte of its
13742              * definition.  (It still may be an escape sequence that evaluates
13743              * to a single character) */
13744
13745             STRLEN len = 0;
13746             UV ender = 0;
13747             char *p;
13748             char *s;
13749
13750 /* This allows us to fill a node with just enough spare so that if the final
13751  * character folds, its expansion is guaranteed to fit */
13752 #define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
13753
13754             char *s0;
13755             U8 upper_parse = MAX_NODE_STRING_SIZE;
13756
13757             /* We start out as an EXACT node, even if under /i, until we find a
13758              * character which is in a fold.  The algorithm now segregates into
13759              * separate nodes, characters that fold from those that don't under
13760              * /i.  (This hopefully will create nodes that are fixed strings
13761              * even under /i, giving the optimizer something to grab on to.)
13762              * So, if a node has something in it and the next character is in
13763              * the opposite category, that node is closed up, and the function
13764              * returns.  Then regatom is called again, and a new node is
13765              * created for the new category. */
13766             U8 node_type = EXACT;
13767
13768             /* Assume the node will be fully used; the excess is given back at
13769              * the end.  We can't make any other length assumptions, as a byte
13770              * input sequence could shrink down. */
13771             Ptrdiff_t initial_size = STR_SZ(256);
13772
13773             bool next_is_quantifier;
13774             char * oldp = NULL;
13775
13776             /* We can convert EXACTF nodes to EXACTFU if they contain only
13777              * characters that match identically regardless of the target
13778              * string's UTF8ness.  The reason to do this is that EXACTF is not
13779              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
13780              * runtime.
13781              *
13782              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13783              * contain only above-Latin1 characters (hence must be in UTF8),
13784              * which don't participate in folds with Latin1-range characters,
13785              * as the latter's folds aren't known until runtime. */
13786             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
13787
13788             /* Single-character EXACTish nodes are almost always SIMPLE.  This
13789              * allows us to override this as encountered */
13790             U8 maybe_SIMPLE = SIMPLE;
13791
13792             /* Does this node contain something that can't match unless the
13793              * target string is (also) in UTF-8 */
13794             bool requires_utf8_target = FALSE;
13795
13796             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
13797             bool has_ss = FALSE;
13798
13799             /* So is the MICRO SIGN */
13800             bool has_micro_sign = FALSE;
13801
13802             /* Allocate an EXACT node.  The node_type may change below to
13803              * another EXACTish node, but since the size of the node doesn't
13804              * change, it works */
13805             ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
13806             FILL_NODE(ret, node_type);
13807             RExC_emit++;
13808
13809             s = STRING(REGNODE_p(ret));
13810
13811             s0 = s;
13812
13813           reparse:
13814
13815             /* This breaks under rare circumstances.  If folding, we do not
13816              * want to split a node at a character that is a non-final in a
13817              * multi-char fold, as an input string could just happen to want to
13818              * match across the node boundary.  The code at the end of the loop
13819              * looks for this, and backs off until it finds not such a
13820              * character, but it is possible (though extremely, extremely
13821              * unlikely) for all characters in the node to be non-final fold
13822              * ones, in which case we just leave the node fully filled, and
13823              * hope that it doesn't match the string in just the wrong place */
13824
13825             assert( ! UTF     /* Is at the beginning of a character */
13826                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13827                    || UTF8_IS_START(UCHARAT(RExC_parse)));
13828
13829
13830             /* Here, we have a literal character.  Find the maximal string of
13831              * them in the input that we can fit into a single EXACTish node.
13832              * We quit at the first non-literal or when the node gets full, or
13833              * under /i the categorization of folding/non-folding character
13834              * changes */
13835             for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
13836
13837                 /* In most cases each iteration adds one byte to the output.
13838                  * The exceptions override this */
13839                 Size_t added_len = 1;
13840
13841                 oldp = p;
13842
13843                 /* White space has already been ignored */
13844                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
13845                        || ! is_PATWS_safe((p), RExC_end, UTF));
13846
13847                 switch ((U8)*p) {
13848                 case '^':
13849                 case '$':
13850                 case '.':
13851                 case '[':
13852                 case '(':
13853                 case ')':
13854                 case '|':
13855                     goto loopdone;
13856                 case '\\':
13857                     /* Literal Escapes Switch
13858
13859                        This switch is meant to handle escape sequences that
13860                        resolve to a literal character.
13861
13862                        Every escape sequence that represents something
13863                        else, like an assertion or a char class, is handled
13864                        in the switch marked 'Special Escapes' above in this
13865                        routine, but also has an entry here as anything that
13866                        isn't explicitly mentioned here will be treated as
13867                        an unescaped equivalent literal.
13868                     */
13869
13870                     switch ((U8)*++p) {
13871
13872                     /* These are all the special escapes. */
13873                     case 'A':             /* Start assertion */
13874                     case 'b': case 'B':   /* Word-boundary assertion*/
13875                     case 'C':             /* Single char !DANGEROUS! */
13876                     case 'd': case 'D':   /* digit class */
13877                     case 'g': case 'G':   /* generic-backref, pos assertion */
13878                     case 'h': case 'H':   /* HORIZWS */
13879                     case 'k': case 'K':   /* named backref, keep marker */
13880                     case 'p': case 'P':   /* Unicode property */
13881                               case 'R':   /* LNBREAK */
13882                     case 's': case 'S':   /* space class */
13883                     case 'v': case 'V':   /* VERTWS */
13884                     case 'w': case 'W':   /* word class */
13885                     case 'X':             /* eXtended Unicode "combining
13886                                              character sequence" */
13887                     case 'z': case 'Z':   /* End of line/string assertion */
13888                         --p;
13889                         goto loopdone;
13890
13891                     /* Anything after here is an escape that resolves to a
13892                        literal. (Except digits, which may or may not)
13893                      */
13894                     case 'n':
13895                         ender = '\n';
13896                         p++;
13897                         break;
13898                     case 'N': /* Handle a single-code point named character. */
13899                         RExC_parse = p + 1;
13900                         if (! grok_bslash_N(pRExC_state,
13901                                             NULL,   /* Fail if evaluates to
13902                                                        anything other than a
13903                                                        single code point */
13904                                             &ender, /* The returned single code
13905                                                        point */
13906                                             NULL,   /* Don't need a count of
13907                                                        how many code points */
13908                                             flagp,
13909                                             RExC_strict,
13910                                             depth)
13911                         ) {
13912                             if (*flagp & NEED_UTF8)
13913                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
13914                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13915
13916                             /* Here, it wasn't a single code point.  Go close
13917                              * up this EXACTish node.  The switch() prior to
13918                              * this switch handles the other cases */
13919                             RExC_parse = p = oldp;
13920                             goto loopdone;
13921                         }
13922                         p = RExC_parse;
13923                         RExC_parse = parse_start;
13924
13925                         /* The \N{} means the pattern, if previously /d,
13926                          * becomes /u.  That means it can't be an EXACTF node,
13927                          * but an EXACTFU */
13928                         if (node_type == EXACTF) {
13929                             node_type = EXACTFU;
13930
13931                             /* If the node already contains something that
13932                              * differs between EXACTF and EXACTFU, reparse it
13933                              * as EXACTFU */
13934                             if (! maybe_exactfu) {
13935                                 len = 0;
13936                                 s = s0;
13937                                 goto reparse;
13938                             }
13939                         }
13940
13941                         break;
13942                     case 'r':
13943                         ender = '\r';
13944                         p++;
13945                         break;
13946                     case 't':
13947                         ender = '\t';
13948                         p++;
13949                         break;
13950                     case 'f':
13951                         ender = '\f';
13952                         p++;
13953                         break;
13954                     case 'e':
13955                         ender = ESC_NATIVE;
13956                         p++;
13957                         break;
13958                     case 'a':
13959                         ender = '\a';
13960                         p++;
13961                         break;
13962                     case 'o':
13963                         {
13964                             UV result;
13965                             const char* error_msg;
13966
13967                             bool valid = grok_bslash_o(&p,
13968                                                        RExC_end,
13969                                                        &result,
13970                                                        &error_msg,
13971                                                        TO_OUTPUT_WARNINGS(p),
13972                                                        (bool) RExC_strict,
13973                                                        TRUE, /* Output warnings
13974                                                                 for non-
13975                                                                 portables */
13976                                                        UTF);
13977                             if (! valid) {
13978                                 RExC_parse = p; /* going to die anyway; point
13979                                                    to exact spot of failure */
13980                                 vFAIL(error_msg);
13981                             }
13982                             UPDATE_WARNINGS_LOC(p - 1);
13983                             ender = result;
13984                             break;
13985                         }
13986                     case 'x':
13987                         {
13988                             UV result = UV_MAX; /* initialize to erroneous
13989                                                    value */
13990                             const char* error_msg;
13991
13992                             bool valid = grok_bslash_x(&p,
13993                                                        RExC_end,
13994                                                        &result,
13995                                                        &error_msg,
13996                                                        TO_OUTPUT_WARNINGS(p),
13997                                                        (bool) RExC_strict,
13998                                                        TRUE, /* Silence warnings
13999                                                                 for non-
14000                                                                 portables */
14001                                                        UTF);
14002                             if (! valid) {
14003                                 RExC_parse = p; /* going to die anyway; point
14004                                                    to exact spot of failure */
14005                                 vFAIL(error_msg);
14006                             }
14007                             UPDATE_WARNINGS_LOC(p - 1);
14008                             ender = result;
14009
14010                             if (ender < 0x100) {
14011 #ifdef EBCDIC
14012                                 if (RExC_recode_x_to_native) {
14013                                     ender = LATIN1_TO_NATIVE(ender);
14014                                 }
14015 #endif
14016                             }
14017                             break;
14018                         }
14019                     case 'c':
14020                         p++;
14021                         ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
14022                         UPDATE_WARNINGS_LOC(p);
14023                         p++;
14024                         break;
14025                     case '8': case '9': /* must be a backreference */
14026                         --p;
14027                         /* we have an escape like \8 which cannot be an octal escape
14028                          * so we exit the loop, and let the outer loop handle this
14029                          * escape which may or may not be a legitimate backref. */
14030                         goto loopdone;
14031                     case '1': case '2': case '3':case '4':
14032                     case '5': case '6': case '7':
14033                         /* When we parse backslash escapes there is ambiguity
14034                          * between backreferences and octal escapes. Any escape
14035                          * from \1 - \9 is a backreference, any multi-digit
14036                          * escape which does not start with 0 and which when
14037                          * evaluated as decimal could refer to an already
14038                          * parsed capture buffer is a back reference. Anything
14039                          * else is octal.
14040                          *
14041                          * Note this implies that \118 could be interpreted as
14042                          * 118 OR as "\11" . "8" depending on whether there
14043                          * were 118 capture buffers defined already in the
14044                          * pattern.  */
14045
14046                         /* NOTE, RExC_npar is 1 more than the actual number of
14047                          * parens we have seen so far, hence the "<" as opposed
14048                          * to "<=" */
14049                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14050                         {  /* Not to be treated as an octal constant, go
14051                                    find backref */
14052                             --p;
14053                             goto loopdone;
14054                         }
14055                         /* FALLTHROUGH */
14056                     case '0':
14057                         {
14058                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14059                             STRLEN numlen = 3;
14060                             ender = grok_oct(p, &numlen, &flags, NULL);
14061                             p += numlen;
14062                             if (   isDIGIT(*p)  /* like \08, \178 */
14063                                 && ckWARN(WARN_REGEXP)
14064                                 && numlen < 3)
14065                             {
14066                                 reg_warn_non_literal_string(
14067                                          p + 1,
14068                                          form_short_octal_warning(p, numlen));
14069                             }
14070                         }
14071                         break;
14072                     case '\0':
14073                         if (p >= RExC_end)
14074                             FAIL("Trailing \\");
14075                         /* FALLTHROUGH */
14076                     default:
14077                         if (isALPHANUMERIC(*p)) {
14078                             /* An alpha followed by '{' is going to fail next
14079                              * iteration, so don't output this warning in that
14080                              * case */
14081                             if (! isALPHA(*p) || *(p + 1) != '{') {
14082                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14083                                                   " passed through", p);
14084                             }
14085                         }
14086                         goto normal_default;
14087                     } /* End of switch on '\' */
14088                     break;
14089                 case '{':
14090                     /* Trying to gain new uses for '{' without breaking too
14091                      * much existing code is hard.  The solution currently
14092                      * adopted is:
14093                      *  1)  If there is no ambiguity that a '{' should always
14094                      *      be taken literally, at the start of a construct, we
14095                      *      just do so.
14096                      *  2)  If the literal '{' conflicts with our desired use
14097                      *      of it as a metacharacter, we die.  The deprecation
14098                      *      cycles for this have come and gone.
14099                      *  3)  If there is ambiguity, we raise a simple warning.
14100                      *      This could happen, for example, if the user
14101                      *      intended it to introduce a quantifier, but slightly
14102                      *      misspelled the quantifier.  Without this warning,
14103                      *      the quantifier would silently be taken as a literal
14104                      *      string of characters instead of a meta construct */
14105                     if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14106                         if (      RExC_strict
14107                             || (  p > parse_start + 1
14108                                 && isALPHA_A(*(p - 1))
14109                                 && *(p - 2) == '\\')
14110                             || new_regcurly(p, RExC_end))
14111                         {
14112                             RExC_parse = p + 1;
14113                             vFAIL("Unescaped left brace in regex is "
14114                                   "illegal here");
14115                         }
14116                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
14117                                          " passed through");
14118                     }
14119                     goto normal_default;
14120                 case '}':
14121                 case ']':
14122                     if (p > RExC_parse && RExC_strict) {
14123                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14124                     }
14125                     /*FALLTHROUGH*/
14126                 default:    /* A literal character */
14127                   normal_default:
14128                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
14129                         STRLEN numlen;
14130                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14131                                                &numlen, UTF8_ALLOW_DEFAULT);
14132                         p += numlen;
14133                     }
14134                     else
14135                         ender = (U8) *p++;
14136                     break;
14137                 } /* End of switch on the literal */
14138
14139                 /* Here, have looked at the literal character, and <ender>
14140                  * contains its ordinal; <p> points to the character after it.
14141                  * */
14142
14143                 if (ender > 255) {
14144                     REQUIRE_UTF8(flagp);
14145                 }
14146
14147                 /* We need to check if the next non-ignored thing is a
14148                  * quantifier.  Move <p> to after anything that should be
14149                  * ignored, which, as a side effect, positions <p> for the next
14150                  * loop iteration */
14151                 skip_to_be_ignored_text(pRExC_state, &p,
14152                                         FALSE /* Don't force to /x */ );
14153
14154                 /* If the next thing is a quantifier, it applies to this
14155                  * character only, which means that this character has to be in
14156                  * its own node and can't just be appended to the string in an
14157                  * existing node, so if there are already other characters in
14158                  * the node, close the node with just them, and set up to do
14159                  * this character again next time through, when it will be the
14160                  * only thing in its new node */
14161
14162                 next_is_quantifier =    LIKELY(p < RExC_end)
14163                                      && UNLIKELY(ISMULT2(p));
14164
14165                 if (next_is_quantifier && LIKELY(len)) {
14166                     p = oldp;
14167                     goto loopdone;
14168                 }
14169
14170                 /* Ready to add 'ender' to the node */
14171
14172                 if (! FOLD) {  /* The simple case, just append the literal */
14173
14174                       not_fold_common:
14175                         if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14176                             *(s++) = (char) ender;
14177                         }
14178                         else {
14179                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14180                             added_len = (char *) new_s - s;
14181                             s = (char *) new_s;
14182
14183                             if (ender > 255)  {
14184                                 requires_utf8_target = TRUE;
14185                             }
14186                         }
14187                 }
14188                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14189
14190                     /* Here are folding under /l, and the code point is
14191                      * problematic.  If this is the first character in the
14192                      * node, change the node type to folding.   Otherwise, if
14193                      * this is the first problematic character, close up the
14194                      * existing node, so can start a new node with this one */
14195                     if (! len) {
14196                         node_type = EXACTFL;
14197                         RExC_contains_locale = 1;
14198                     }
14199                     else if (node_type == EXACT) {
14200                         p = oldp;
14201                         goto loopdone;
14202                     }
14203
14204                     /* This problematic code point means we can't simplify
14205                      * things */
14206                     maybe_exactfu = FALSE;
14207
14208                     /* Here, we are adding a problematic fold character.
14209                      * "Problematic" in this context means that its fold isn't
14210                      * known until runtime.  (The non-problematic code points
14211                      * are the above-Latin1 ones that fold to also all
14212                      * above-Latin1.  Their folds don't vary no matter what the
14213                      * locale is.) But here we have characters whose fold
14214                      * depends on the locale.  We just add in the unfolded
14215                      * character, and wait until runtime to fold it */
14216                     goto not_fold_common;
14217                 }
14218                 else /* regular fold; see if actually is in a fold */
14219                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14220                          || (ender > 255
14221                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14222                 {
14223                     /* Here, folding, but the character isn't in a fold.
14224                      *
14225                      * Start a new node if previous characters in the node were
14226                      * folded */
14227                     if (len && node_type != EXACT) {
14228                         p = oldp;
14229                         goto loopdone;
14230                     }
14231
14232                     /* Here, continuing a node with non-folded characters.  Add
14233                      * this one */
14234                     goto not_fold_common;
14235                 }
14236                 else {  /* Here, does participate in some fold */
14237
14238                     /* If this is the first character in the node, change its
14239                      * type to folding.  Otherwise, if this is the first
14240                      * folding character in the node, close up the existing
14241                      * node, so can start a new node with this one.  */
14242                     if (! len) {
14243                         node_type = compute_EXACTish(pRExC_state);
14244                     }
14245                     else if (node_type == EXACT) {
14246                         p = oldp;
14247                         goto loopdone;
14248                     }
14249
14250                     if (UTF) {  /* Use the folded value */
14251                         if (UVCHR_IS_INVARIANT(ender)) {
14252                             *(s)++ = (U8) toFOLD(ender);
14253                         }
14254                         else {
14255                             ender = _to_uni_fold_flags(
14256                                     ender,
14257                                     (U8 *) s,
14258                                     &added_len,
14259                                     FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14260                                                     ? FOLD_FLAGS_NOMIX_ASCII
14261                                                     : 0));
14262                             s += added_len;
14263
14264                             if (   ender > 255
14265                                 && LIKELY(ender != GREEK_SMALL_LETTER_MU))
14266                             {
14267                                 /* U+B5 folds to the MU, so its possible for a
14268                                  * non-UTF-8 target to match it */
14269                                 requires_utf8_target = TRUE;
14270                             }
14271                         }
14272                     }
14273                     else {
14274
14275                         /* Here is non-UTF8.  First, see if the character's
14276                          * fold differs between /d and /u. */
14277                         if (PL_fold[ender] != PL_fold_latin1[ender]) {
14278                             maybe_exactfu = FALSE;
14279                         }
14280
14281 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
14282    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
14283                                       || UNICODE_DOT_DOT_VERSION > 0)
14284
14285                         /* On non-ancient Unicode versions, this includes the
14286                          * multi-char fold SHARP S to 'ss' */
14287
14288                         if (   UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
14289                                  || (   isALPHA_FOLD_EQ(ender, 's')
14290                                      && len > 0
14291                                      && isALPHA_FOLD_EQ(*(s-1), 's')))
14292                         {
14293                             /* Here, we have one of the following:
14294                              *  a)  a SHARP S.  This folds to 'ss' only under
14295                              *      /u rules.  If we are in that situation,
14296                              *      fold the SHARP S to 'ss'.  See the comments
14297                              *      for join_exact() as to why we fold this
14298                              *      non-UTF at compile time, and no others.
14299                              *  b)  'ss'.  When under /u, there's nothing
14300                              *      special needed to be done here.  The
14301                              *      previous iteration handled the first 's',
14302                              *      and this iteration will handle the second.
14303                              *      If, on the otherhand it's not /u, we have
14304                              *      to exclude the possibility of moving to /u,
14305                              *      so that we won't generate an unwanted
14306                              *      match, unless, at runtime, the target
14307                              *      string is in UTF-8.
14308                              * */
14309
14310                             has_ss = TRUE;
14311                             maybe_exactfu = FALSE;  /* Can't generate an
14312                                                        EXACTFU node (unless we
14313                                                        already are in one) */
14314                             if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14315                                 maybe_SIMPLE = 0;
14316                                 if (node_type == EXACTFU) {
14317                                     *(s++) = 's';
14318
14319                                     /* Let the code below add in the extra 's' */
14320                                     ender = 's';
14321                                     added_len = 2;
14322                                 }
14323                             }
14324                         }
14325 #endif
14326
14327                         else if (UNLIKELY(ender == MICRO_SIGN)) {
14328                             has_micro_sign = TRUE;
14329                         }
14330
14331                         *(s++) = (char) (DEPENDS_SEMANTICS)
14332                                         ? toFOLD(ender)
14333
14334                                           /* Under /u, the fold of any
14335                                            * character in the 0-255 range
14336                                            * happens to be its lowercase
14337                                            * equivalent, except for LATIN SMALL
14338                                            * LETTER SHARP S, which was handled
14339                                            * above, and the MICRO SIGN, whose
14340                                            * fold requires UTF-8 to represent.
14341                                            * */
14342                                         : toLOWER_L1(ender);
14343                     }
14344                 } /* End of adding current character to the node */
14345
14346                 len += added_len;
14347
14348                 if (next_is_quantifier) {
14349
14350                     /* Here, the next input is a quantifier, and to get here,
14351                      * the current character is the only one in the node. */
14352                     goto loopdone;
14353                 }
14354
14355             } /* End of loop through literal characters */
14356
14357             /* Here we have either exhausted the input or ran out of room in
14358              * the node.  (If we encountered a character that can't be in the
14359              * node, transfer is made directly to <loopdone>, and so we
14360              * wouldn't have fallen off the end of the loop.)  In the latter
14361              * case, we artificially have to split the node into two, because
14362              * we just don't have enough space to hold everything.  This
14363              * creates a problem if the final character participates in a
14364              * multi-character fold in the non-final position, as a match that
14365              * should have occurred won't, due to the way nodes are matched,
14366              * and our artificial boundary.  So back off until we find a non-
14367              * problematic character -- one that isn't at the beginning or
14368              * middle of such a fold.  (Either it doesn't participate in any
14369              * folds, or appears only in the final position of all the folds it
14370              * does participate in.)  A better solution with far fewer false
14371              * positives, and that would fill the nodes more completely, would
14372              * be to actually have available all the multi-character folds to
14373              * test against, and to back-off only far enough to be sure that
14374              * this node isn't ending with a partial one.  <upper_parse> is set
14375              * further below (if we need to reparse the node) to include just
14376              * up through that final non-problematic character that this code
14377              * identifies, so when it is set to less than the full node, we can
14378              * skip the rest of this */
14379             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
14380
14381                 const STRLEN full_len = len;
14382
14383                 assert(len >= MAX_NODE_STRING_SIZE);
14384
14385                 /* Here, <s> points to the final byte of the final character.
14386                  * Look backwards through the string until find a non-
14387                  * problematic character */
14388
14389                 if (! UTF) {
14390
14391                     /* This has no multi-char folds to non-UTF characters */
14392                     if (ASCII_FOLD_RESTRICTED) {
14393                         goto loopdone;
14394                     }
14395
14396                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
14397                     len = s - s0 + 1;
14398                 }
14399                 else {
14400
14401                     /* Point to the first byte of the final character */
14402                     s = (char *) utf8_hop((U8 *) s, -1);
14403
14404                     while (s >= s0) {   /* Search backwards until find
14405                                            a non-problematic char */
14406                         if (UTF8_IS_INVARIANT(*s)) {
14407
14408                             /* There are no ascii characters that participate
14409                              * in multi-char folds under /aa.  In EBCDIC, the
14410                              * non-ascii invariants are all control characters,
14411                              * so don't ever participate in any folds. */
14412                             if (ASCII_FOLD_RESTRICTED
14413                                 || ! IS_NON_FINAL_FOLD(*s))
14414                             {
14415                                 break;
14416                             }
14417                         }
14418                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
14419                             if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
14420                                                                   *s, *(s+1))))
14421                             {
14422                                 break;
14423                             }
14424                         }
14425                         else if (! _invlist_contains_cp(
14426                                         PL_NonFinalFold,
14427                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
14428                         {
14429                             break;
14430                         }
14431
14432                         /* Here, the current character is problematic in that
14433                          * it does occur in the non-final position of some
14434                          * fold, so try the character before it, but have to
14435                          * special case the very first byte in the string, so
14436                          * we don't read outside the string */
14437                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
14438                     } /* End of loop backwards through the string */
14439
14440                     /* If there were only problematic characters in the string,
14441                      * <s> will point to before s0, in which case the length
14442                      * should be 0, otherwise include the length of the
14443                      * non-problematic character just found */
14444                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
14445                 }
14446
14447                 /* Here, have found the final character, if any, that is
14448                  * non-problematic as far as ending the node without splitting
14449                  * it across a potential multi-char fold.  <len> contains the
14450                  * number of bytes in the node up-to and including that
14451                  * character, or is 0 if there is no such character, meaning
14452                  * the whole node contains only problematic characters.  In
14453                  * this case, give up and just take the node as-is.  We can't
14454                  * do any better */
14455                 if (len == 0) {
14456                     len = full_len;
14457
14458                 } else {
14459
14460                     /* Here, the node does contain some characters that aren't
14461                      * problematic.  If one such is the final character in the
14462                      * node, we are done */
14463                     if (len == full_len) {
14464                         goto loopdone;
14465                     }
14466                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
14467
14468                         /* If the final character is problematic, but the
14469                          * penultimate is not, back-off that last character to
14470                          * later start a new node with it */
14471                         p = oldp;
14472                         goto loopdone;
14473                     }
14474
14475                     /* Here, the final non-problematic character is earlier
14476                      * in the input than the penultimate character.  What we do
14477                      * is reparse from the beginning, going up only as far as
14478                      * this final ok one, thus guaranteeing that the node ends
14479                      * in an acceptable character.  The reason we reparse is
14480                      * that we know how far in the character is, but we don't
14481                      * know how to correlate its position with the input parse.
14482                      * An alternate implementation would be to build that
14483                      * correlation as we go along during the original parse,
14484                      * but that would entail extra work for every node, whereas
14485                      * this code gets executed only when the string is too
14486                      * large for the node, and the final two characters are
14487                      * problematic, an infrequent occurrence.  Yet another
14488                      * possible strategy would be to save the tail of the
14489                      * string, and the next time regatom is called, initialize
14490                      * with that.  The problem with this is that unless you
14491                      * back off one more character, you won't be guaranteed
14492                      * regatom will get called again, unless regbranch,
14493                      * regpiece ... are also changed.  If you do back off that
14494                      * extra character, so that there is input guaranteed to
14495                      * force calling regatom, you can't handle the case where
14496                      * just the first character in the node is acceptable.  I
14497                      * (khw) decided to try this method which doesn't have that
14498                      * pitfall; if performance issues are found, we can do a
14499                      * combination of the current approach plus that one */
14500                     upper_parse = len;
14501                     len = 0;
14502                     s = s0;
14503                     goto reparse;
14504                 }
14505             }   /* End of verifying node ends with an appropriate char */
14506
14507           loopdone:   /* Jumped to when encounters something that shouldn't be
14508                          in the node */
14509
14510             /* Free up any over-allocated space */
14511             change_engine_size(pRExC_state, - (initial_size - STR_SZ(len)));
14512
14513             /* I (khw) don't know if you can get here with zero length, but the
14514              * old code handled this situation by creating a zero-length EXACT
14515              * node.  Might as well be NOTHING instead */
14516             if (len == 0) {
14517                 OP(REGNODE_p(ret)) = NOTHING;
14518             }
14519             else {
14520
14521                 /* If the node type is EXACT here, check to see if it
14522                  * should be EXACTL, or EXACT_ONLY8. */
14523                 if (node_type == EXACT) {
14524                     if (LOC) {
14525                         node_type = EXACTL;
14526                     }
14527                     else if (requires_utf8_target) {
14528                         node_type = EXACT_ONLY8;
14529                     }
14530                 } else if (FOLD) {
14531                     if (    UNLIKELY(has_micro_sign || has_ss)
14532                         && (node_type == EXACTFU || (   node_type == EXACTF
14533                                                      && maybe_exactfu)))
14534                     {   /* These two conditions are problematic in non-UTF-8
14535                            EXACTFU nodes. */
14536                         assert(! UTF);
14537                         node_type = EXACTFUP;
14538                     }
14539                     else if (node_type == EXACTFL) {
14540
14541                         /* 'maybe_exactfu' is deliberately set above to
14542                          * indicate this node type, where all code points in it
14543                          * are above 255 */
14544                         if (maybe_exactfu) {
14545                             node_type = EXACTFLU8;
14546                         }
14547                     }
14548                     else if (node_type == EXACTF) {  /* Means is /di */
14549
14550                         /* If 'maybe_exactfu' is clear, then we need to stay
14551                          * /di.  If it is set, it means there are no code
14552                          * points that match differently depending on UTF8ness
14553                          * of the target string, so it can become an EXACTFU
14554                          * node */
14555                         if (! maybe_exactfu) {
14556                             RExC_seen_d_op = TRUE;
14557                         }
14558                         else if (   isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
14559                                  || isALPHA_FOLD_EQ(ender, 's'))
14560                         {
14561                             /* But, if the node begins or ends in an 's' we
14562                              * have to defer changing it into an EXACTFU, as
14563                              * the node could later get joined with another one
14564                              * that ends or begins with 's' creating an 'ss'
14565                              * sequence which would then wrongly match the
14566                              * sharp s without the target being UTF-8.  We
14567                              * create a special node that we resolve later when
14568                              * we join nodes together */
14569
14570                             node_type = EXACTFU_S_EDGE;
14571                         }
14572                         else {
14573                             node_type = EXACTFU;
14574                         }
14575                     }
14576
14577                     if (requires_utf8_target && node_type == EXACTFU) {
14578                         node_type = EXACTFU_ONLY8;
14579                     }
14580                 }
14581
14582                 OP(REGNODE_p(ret)) = node_type;
14583                 STR_LEN(REGNODE_p(ret)) = len;
14584                 RExC_emit += STR_SZ(len);
14585
14586                 /* If the node isn't a single character, it can't be SIMPLE */
14587                 if (len > ((UTF) ? UVCHR_SKIP(ender) : 1)) {
14588                     maybe_SIMPLE = 0;
14589                 }
14590
14591                 *flagp |= HASWIDTH | maybe_SIMPLE;
14592             }
14593
14594             Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
14595             RExC_parse = p;
14596
14597             {
14598                 /* len is STRLEN which is unsigned, need to copy to signed */
14599                 IV iv = len;
14600                 if (iv < 0)
14601                     vFAIL("Internal disaster");
14602             }
14603
14604         } /* End of label 'defchar:' */
14605         break;
14606     } /* End of giant switch on input character */
14607
14608     /* Position parse to next real character */
14609     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14610                                             FALSE /* Don't force to /x */ );
14611     if (   *RExC_parse == '{'
14612         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
14613     {
14614         if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
14615             RExC_parse++;
14616             vFAIL("Unescaped left brace in regex is illegal here");
14617         }
14618         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
14619                                   " passed through");
14620     }
14621
14622     return(ret);
14623 }
14624
14625
14626 STATIC void
14627 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
14628 {
14629     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
14630      * sets up the bitmap and any flags, removing those code points from the
14631      * inversion list, setting it to NULL should it become completely empty */
14632
14633     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
14634     assert(PL_regkind[OP(node)] == ANYOF);
14635
14636     /* There is no bitmap for this node type */
14637     if (OP(node) == ANYOFH) {
14638         return;
14639     }
14640
14641     ANYOF_BITMAP_ZERO(node);
14642     if (*invlist_ptr) {
14643
14644         /* This gets set if we actually need to modify things */
14645         bool change_invlist = FALSE;
14646
14647         UV start, end;
14648
14649         /* Start looking through *invlist_ptr */
14650         invlist_iterinit(*invlist_ptr);
14651         while (invlist_iternext(*invlist_ptr, &start, &end)) {
14652             UV high;
14653             int i;
14654
14655             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
14656                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
14657             }
14658
14659             /* Quit if are above what we should change */
14660             if (start >= NUM_ANYOF_CODE_POINTS) {
14661                 break;
14662             }
14663
14664             change_invlist = TRUE;
14665
14666             /* Set all the bits in the range, up to the max that we are doing */
14667             high = (end < NUM_ANYOF_CODE_POINTS - 1)
14668                    ? end
14669                    : NUM_ANYOF_CODE_POINTS - 1;
14670             for (i = start; i <= (int) high; i++) {
14671                 if (! ANYOF_BITMAP_TEST(node, i)) {
14672                     ANYOF_BITMAP_SET(node, i);
14673                 }
14674             }
14675         }
14676         invlist_iterfinish(*invlist_ptr);
14677
14678         /* Done with loop; remove any code points that are in the bitmap from
14679          * *invlist_ptr; similarly for code points above the bitmap if we have
14680          * a flag to match all of them anyways */
14681         if (change_invlist) {
14682             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
14683         }
14684         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
14685             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
14686         }
14687
14688         /* If have completely emptied it, remove it completely */
14689         if (_invlist_len(*invlist_ptr) == 0) {
14690             SvREFCNT_dec_NN(*invlist_ptr);
14691             *invlist_ptr = NULL;
14692         }
14693     }
14694 }
14695
14696 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
14697    Character classes ([:foo:]) can also be negated ([:^foo:]).
14698    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
14699    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
14700    but trigger failures because they are currently unimplemented. */
14701
14702 #define POSIXCC_DONE(c)   ((c) == ':')
14703 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
14704 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
14705 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
14706
14707 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
14708 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
14709 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
14710
14711 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
14712
14713 /* 'posix_warnings' and 'warn_text' are names of variables in the following
14714  * routine. q.v. */
14715 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
14716         if (posix_warnings) {                                               \
14717             if (! RExC_warn_text ) RExC_warn_text =                         \
14718                                          (AV *) sv_2mortal((SV *) newAV()); \
14719             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
14720                                              WARNING_PREFIX                 \
14721                                              text                           \
14722                                              REPORT_LOCATION,               \
14723                                              REPORT_LOCATION_ARGS(p)));     \
14724         }                                                                   \
14725     } STMT_END
14726 #define CLEAR_POSIX_WARNINGS()                                              \
14727     STMT_START {                                                            \
14728         if (posix_warnings && RExC_warn_text)                               \
14729             av_clear(RExC_warn_text);                                       \
14730     } STMT_END
14731
14732 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
14733     STMT_START {                                                            \
14734         CLEAR_POSIX_WARNINGS();                                             \
14735         return ret;                                                         \
14736     } STMT_END
14737
14738 STATIC int
14739 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
14740
14741     const char * const s,      /* Where the putative posix class begins.
14742                                   Normally, this is one past the '['.  This
14743                                   parameter exists so it can be somewhere
14744                                   besides RExC_parse. */
14745     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
14746                                   NULL */
14747     AV ** posix_warnings,      /* Where to place any generated warnings, or
14748                                   NULL */
14749     const bool check_only      /* Don't die if error */
14750 )
14751 {
14752     /* This parses what the caller thinks may be one of the three POSIX
14753      * constructs:
14754      *  1) a character class, like [:blank:]
14755      *  2) a collating symbol, like [. .]
14756      *  3) an equivalence class, like [= =]
14757      * In the latter two cases, it croaks if it finds a syntactically legal
14758      * one, as these are not handled by Perl.
14759      *
14760      * The main purpose is to look for a POSIX character class.  It returns:
14761      *  a) the class number
14762      *      if it is a completely syntactically and semantically legal class.
14763      *      'updated_parse_ptr', if not NULL, is set to point to just after the
14764      *      closing ']' of the class
14765      *  b) OOB_NAMEDCLASS
14766      *      if it appears that one of the three POSIX constructs was meant, but
14767      *      its specification was somehow defective.  'updated_parse_ptr', if
14768      *      not NULL, is set to point to the character just after the end
14769      *      character of the class.  See below for handling of warnings.
14770      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
14771      *      if it  doesn't appear that a POSIX construct was intended.
14772      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
14773      *      raised.
14774      *
14775      * In b) there may be errors or warnings generated.  If 'check_only' is
14776      * TRUE, then any errors are discarded.  Warnings are returned to the
14777      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
14778      * instead it is NULL, warnings are suppressed.
14779      *
14780      * The reason for this function, and its complexity is that a bracketed
14781      * character class can contain just about anything.  But it's easy to
14782      * mistype the very specific posix class syntax but yielding a valid
14783      * regular bracketed class, so it silently gets compiled into something
14784      * quite unintended.
14785      *
14786      * The solution adopted here maintains backward compatibility except that
14787      * it adds a warning if it looks like a posix class was intended but
14788      * improperly specified.  The warning is not raised unless what is input
14789      * very closely resembles one of the 14 legal posix classes.  To do this,
14790      * it uses fuzzy parsing.  It calculates how many single-character edits it
14791      * would take to transform what was input into a legal posix class.  Only
14792      * if that number is quite small does it think that the intention was a
14793      * posix class.  Obviously these are heuristics, and there will be cases
14794      * where it errs on one side or another, and they can be tweaked as
14795      * experience informs.
14796      *
14797      * The syntax for a legal posix class is:
14798      *
14799      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
14800      *
14801      * What this routine considers syntactically to be an intended posix class
14802      * is this (the comments indicate some restrictions that the pattern
14803      * doesn't show):
14804      *
14805      *  qr/(?x: \[?                         # The left bracket, possibly
14806      *                                      # omitted
14807      *          \h*                         # possibly followed by blanks
14808      *          (?: \^ \h* )?               # possibly a misplaced caret
14809      *          [:;]?                       # The opening class character,
14810      *                                      # possibly omitted.  A typo
14811      *                                      # semi-colon can also be used.
14812      *          \h*
14813      *          \^?                         # possibly a correctly placed
14814      *                                      # caret, but not if there was also
14815      *                                      # a misplaced one
14816      *          \h*
14817      *          .{3,15}                     # The class name.  If there are
14818      *                                      # deviations from the legal syntax,
14819      *                                      # its edit distance must be close
14820      *                                      # to a real class name in order
14821      *                                      # for it to be considered to be
14822      *                                      # an intended posix class.
14823      *          \h*
14824      *          [[:punct:]]?                # The closing class character,
14825      *                                      # possibly omitted.  If not a colon
14826      *                                      # nor semi colon, the class name
14827      *                                      # must be even closer to a valid
14828      *                                      # one
14829      *          \h*
14830      *          \]?                         # The right bracket, possibly
14831      *                                      # omitted.
14832      *     )/
14833      *
14834      * In the above, \h must be ASCII-only.
14835      *
14836      * These are heuristics, and can be tweaked as field experience dictates.
14837      * There will be cases when someone didn't intend to specify a posix class
14838      * that this warns as being so.  The goal is to minimize these, while
14839      * maximizing the catching of things intended to be a posix class that
14840      * aren't parsed as such.
14841      */
14842
14843     const char* p             = s;
14844     const char * const e      = RExC_end;
14845     unsigned complement       = 0;      /* If to complement the class */
14846     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
14847     bool has_opening_bracket  = FALSE;
14848     bool has_opening_colon    = FALSE;
14849     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
14850                                                    valid class */
14851     const char * possible_end = NULL;   /* used for a 2nd parse pass */
14852     const char* name_start;             /* ptr to class name first char */
14853
14854     /* If the number of single-character typos the input name is away from a
14855      * legal name is no more than this number, it is considered to have meant
14856      * the legal name */
14857     int max_distance          = 2;
14858
14859     /* to store the name.  The size determines the maximum length before we
14860      * decide that no posix class was intended.  Should be at least
14861      * sizeof("alphanumeric") */
14862     UV input_text[15];
14863     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
14864
14865     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
14866
14867     CLEAR_POSIX_WARNINGS();
14868
14869     if (p >= e) {
14870         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14871     }
14872
14873     if (*(p - 1) != '[') {
14874         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
14875         found_problem = TRUE;
14876     }
14877     else {
14878         has_opening_bracket = TRUE;
14879     }
14880
14881     /* They could be confused and think you can put spaces between the
14882      * components */
14883     if (isBLANK(*p)) {
14884         found_problem = TRUE;
14885
14886         do {
14887             p++;
14888         } while (p < e && isBLANK(*p));
14889
14890         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14891     }
14892
14893     /* For [. .] and [= =].  These are quite different internally from [: :],
14894      * so they are handled separately.  */
14895     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
14896                                             and 1 for at least one char in it
14897                                           */
14898     {
14899         const char open_char  = *p;
14900         const char * temp_ptr = p + 1;
14901
14902         /* These two constructs are not handled by perl, and if we find a
14903          * syntactically valid one, we croak.  khw, who wrote this code, finds
14904          * this explanation of them very unclear:
14905          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
14906          * And searching the rest of the internet wasn't very helpful either.
14907          * It looks like just about any byte can be in these constructs,
14908          * depending on the locale.  But unless the pattern is being compiled
14909          * under /l, which is very rare, Perl runs under the C or POSIX locale.
14910          * In that case, it looks like [= =] isn't allowed at all, and that
14911          * [. .] could be any single code point, but for longer strings the
14912          * constituent characters would have to be the ASCII alphabetics plus
14913          * the minus-hyphen.  Any sensible locale definition would limit itself
14914          * to these.  And any portable one definitely should.  Trying to parse
14915          * the general case is a nightmare (see [perl #127604]).  So, this code
14916          * looks only for interiors of these constructs that match:
14917          *      qr/.|[-\w]{2,}/
14918          * Using \w relaxes the apparent rules a little, without adding much
14919          * danger of mistaking something else for one of these constructs.
14920          *
14921          * [. .] in some implementations described on the internet is usable to
14922          * escape a character that otherwise is special in bracketed character
14923          * classes.  For example [.].] means a literal right bracket instead of
14924          * the ending of the class
14925          *
14926          * [= =] can legitimately contain a [. .] construct, but we don't
14927          * handle this case, as that [. .] construct will later get parsed
14928          * itself and croak then.  And [= =] is checked for even when not under
14929          * /l, as Perl has long done so.
14930          *
14931          * The code below relies on there being a trailing NUL, so it doesn't
14932          * have to keep checking if the parse ptr < e.
14933          */
14934         if (temp_ptr[1] == open_char) {
14935             temp_ptr++;
14936         }
14937         else while (    temp_ptr < e
14938                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
14939         {
14940             temp_ptr++;
14941         }
14942
14943         if (*temp_ptr == open_char) {
14944             temp_ptr++;
14945             if (*temp_ptr == ']') {
14946                 temp_ptr++;
14947                 if (! found_problem && ! check_only) {
14948                     RExC_parse = (char *) temp_ptr;
14949                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
14950                             "extensions", open_char, open_char);
14951                 }
14952
14953                 /* Here, the syntax wasn't completely valid, or else the call
14954                  * is to check-only */
14955                 if (updated_parse_ptr) {
14956                     *updated_parse_ptr = (char *) temp_ptr;
14957                 }
14958
14959                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
14960             }
14961         }
14962
14963         /* If we find something that started out to look like one of these
14964          * constructs, but isn't, we continue below so that it can be checked
14965          * for being a class name with a typo of '.' or '=' instead of a colon.
14966          * */
14967     }
14968
14969     /* Here, we think there is a possibility that a [: :] class was meant, and
14970      * we have the first real character.  It could be they think the '^' comes
14971      * first */
14972     if (*p == '^') {
14973         found_problem = TRUE;
14974         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
14975         complement = 1;
14976         p++;
14977
14978         if (isBLANK(*p)) {
14979             found_problem = TRUE;
14980
14981             do {
14982                 p++;
14983             } while (p < e && isBLANK(*p));
14984
14985             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14986         }
14987     }
14988
14989     /* But the first character should be a colon, which they could have easily
14990      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
14991      * distinguish from a colon, so treat that as a colon).  */
14992     if (*p == ':') {
14993         p++;
14994         has_opening_colon = TRUE;
14995     }
14996     else if (*p == ';') {
14997         found_problem = TRUE;
14998         p++;
14999         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15000         has_opening_colon = TRUE;
15001     }
15002     else {
15003         found_problem = TRUE;
15004         ADD_POSIX_WARNING(p, "there must be a starting ':'");
15005
15006         /* Consider an initial punctuation (not one of the recognized ones) to
15007          * be a left terminator */
15008         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15009             p++;
15010         }
15011     }
15012
15013     /* They may think that you can put spaces between the components */
15014     if (isBLANK(*p)) {
15015         found_problem = TRUE;
15016
15017         do {
15018             p++;
15019         } while (p < e && isBLANK(*p));
15020
15021         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15022     }
15023
15024     if (*p == '^') {
15025
15026         /* We consider something like [^:^alnum:]] to not have been intended to
15027          * be a posix class, but XXX maybe we should */
15028         if (complement) {
15029             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15030         }
15031
15032         complement = 1;
15033         p++;
15034     }
15035
15036     /* Again, they may think that you can put spaces between the components */
15037     if (isBLANK(*p)) {
15038         found_problem = TRUE;
15039
15040         do {
15041             p++;
15042         } while (p < e && isBLANK(*p));
15043
15044         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15045     }
15046
15047     if (*p == ']') {
15048
15049         /* XXX This ']' may be a typo, and something else was meant.  But
15050          * treating it as such creates enough complications, that that
15051          * possibility isn't currently considered here.  So we assume that the
15052          * ']' is what is intended, and if we've already found an initial '[',
15053          * this leaves this construct looking like [:] or [:^], which almost
15054          * certainly weren't intended to be posix classes */
15055         if (has_opening_bracket) {
15056             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15057         }
15058
15059         /* But this function can be called when we parse the colon for
15060          * something like qr/[alpha:]]/, so we back up to look for the
15061          * beginning */
15062         p--;
15063
15064         if (*p == ';') {
15065             found_problem = TRUE;
15066             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15067         }
15068         else if (*p != ':') {
15069
15070             /* XXX We are currently very restrictive here, so this code doesn't
15071              * consider the possibility that, say, /[alpha.]]/ was intended to
15072              * be a posix class. */
15073             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15074         }
15075
15076         /* Here we have something like 'foo:]'.  There was no initial colon,
15077          * and we back up over 'foo.  XXX Unlike the going forward case, we
15078          * don't handle typos of non-word chars in the middle */
15079         has_opening_colon = FALSE;
15080         p--;
15081
15082         while (p > RExC_start && isWORDCHAR(*p)) {
15083             p--;
15084         }
15085         p++;
15086
15087         /* Here, we have positioned ourselves to where we think the first
15088          * character in the potential class is */
15089     }
15090
15091     /* Now the interior really starts.  There are certain key characters that
15092      * can end the interior, or these could just be typos.  To catch both
15093      * cases, we may have to do two passes.  In the first pass, we keep on
15094      * going unless we come to a sequence that matches
15095      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
15096      * This means it takes a sequence to end the pass, so two typos in a row if
15097      * that wasn't what was intended.  If the class is perfectly formed, just
15098      * this one pass is needed.  We also stop if there are too many characters
15099      * being accumulated, but this number is deliberately set higher than any
15100      * real class.  It is set high enough so that someone who thinks that
15101      * 'alphanumeric' is a correct name would get warned that it wasn't.
15102      * While doing the pass, we keep track of where the key characters were in
15103      * it.  If we don't find an end to the class, and one of the key characters
15104      * was found, we redo the pass, but stop when we get to that character.
15105      * Thus the key character was considered a typo in the first pass, but a
15106      * terminator in the second.  If two key characters are found, we stop at
15107      * the second one in the first pass.  Again this can miss two typos, but
15108      * catches a single one
15109      *
15110      * In the first pass, 'possible_end' starts as NULL, and then gets set to
15111      * point to the first key character.  For the second pass, it starts as -1.
15112      * */
15113
15114     name_start = p;
15115   parse_name:
15116     {
15117         bool has_blank               = FALSE;
15118         bool has_upper               = FALSE;
15119         bool has_terminating_colon   = FALSE;
15120         bool has_terminating_bracket = FALSE;
15121         bool has_semi_colon          = FALSE;
15122         unsigned int name_len        = 0;
15123         int punct_count              = 0;
15124
15125         while (p < e) {
15126
15127             /* Squeeze out blanks when looking up the class name below */
15128             if (isBLANK(*p) ) {
15129                 has_blank = TRUE;
15130                 found_problem = TRUE;
15131                 p++;
15132                 continue;
15133             }
15134
15135             /* The name will end with a punctuation */
15136             if (isPUNCT(*p)) {
15137                 const char * peek = p + 1;
15138
15139                 /* Treat any non-']' punctuation followed by a ']' (possibly
15140                  * with intervening blanks) as trying to terminate the class.
15141                  * ']]' is very likely to mean a class was intended (but
15142                  * missing the colon), but the warning message that gets
15143                  * generated shows the error position better if we exit the
15144                  * loop at the bottom (eventually), so skip it here. */
15145                 if (*p != ']') {
15146                     if (peek < e && isBLANK(*peek)) {
15147                         has_blank = TRUE;
15148                         found_problem = TRUE;
15149                         do {
15150                             peek++;
15151                         } while (peek < e && isBLANK(*peek));
15152                     }
15153
15154                     if (peek < e && *peek == ']') {
15155                         has_terminating_bracket = TRUE;
15156                         if (*p == ':') {
15157                             has_terminating_colon = TRUE;
15158                         }
15159                         else if (*p == ';') {
15160                             has_semi_colon = TRUE;
15161                             has_terminating_colon = TRUE;
15162                         }
15163                         else {
15164                             found_problem = TRUE;
15165                         }
15166                         p = peek + 1;
15167                         goto try_posix;
15168                     }
15169                 }
15170
15171                 /* Here we have punctuation we thought didn't end the class.
15172                  * Keep track of the position of the key characters that are
15173                  * more likely to have been class-enders */
15174                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15175
15176                     /* Allow just one such possible class-ender not actually
15177                      * ending the class. */
15178                     if (possible_end) {
15179                         break;
15180                     }
15181                     possible_end = p;
15182                 }
15183
15184                 /* If we have too many punctuation characters, no use in
15185                  * keeping going */
15186                 if (++punct_count > max_distance) {
15187                     break;
15188                 }
15189
15190                 /* Treat the punctuation as a typo. */
15191                 input_text[name_len++] = *p;
15192                 p++;
15193             }
15194             else if (isUPPER(*p)) { /* Use lowercase for lookup */
15195                 input_text[name_len++] = toLOWER(*p);
15196                 has_upper = TRUE;
15197                 found_problem = TRUE;
15198                 p++;
15199             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15200                 input_text[name_len++] = *p;
15201                 p++;
15202             }
15203             else {
15204                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15205                 p+= UTF8SKIP(p);
15206             }
15207
15208             /* The declaration of 'input_text' is how long we allow a potential
15209              * class name to be, before saying they didn't mean a class name at
15210              * all */
15211             if (name_len >= C_ARRAY_LENGTH(input_text)) {
15212                 break;
15213             }
15214         }
15215
15216         /* We get to here when the possible class name hasn't been properly
15217          * terminated before:
15218          *   1) we ran off the end of the pattern; or
15219          *   2) found two characters, each of which might have been intended to
15220          *      be the name's terminator
15221          *   3) found so many punctuation characters in the purported name,
15222          *      that the edit distance to a valid one is exceeded
15223          *   4) we decided it was more characters than anyone could have
15224          *      intended to be one. */
15225
15226         found_problem = TRUE;
15227
15228         /* In the final two cases, we know that looking up what we've
15229          * accumulated won't lead to a match, even a fuzzy one. */
15230         if (   name_len >= C_ARRAY_LENGTH(input_text)
15231             || punct_count > max_distance)
15232         {
15233             /* If there was an intermediate key character that could have been
15234              * an intended end, redo the parse, but stop there */
15235             if (possible_end && possible_end != (char *) -1) {
15236                 possible_end = (char *) -1; /* Special signal value to say
15237                                                we've done a first pass */
15238                 p = name_start;
15239                 goto parse_name;
15240             }
15241
15242             /* Otherwise, it can't have meant to have been a class */
15243             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15244         }
15245
15246         /* If we ran off the end, and the final character was a punctuation
15247          * one, back up one, to look at that final one just below.  Later, we
15248          * will restore the parse pointer if appropriate */
15249         if (name_len && p == e && isPUNCT(*(p-1))) {
15250             p--;
15251             name_len--;
15252         }
15253
15254         if (p < e && isPUNCT(*p)) {
15255             if (*p == ']') {
15256                 has_terminating_bracket = TRUE;
15257
15258                 /* If this is a 2nd ']', and the first one is just below this
15259                  * one, consider that to be the real terminator.  This gives a
15260                  * uniform and better positioning for the warning message  */
15261                 if (   possible_end
15262                     && possible_end != (char *) -1
15263                     && *possible_end == ']'
15264                     && name_len && input_text[name_len - 1] == ']')
15265                 {
15266                     name_len--;
15267                     p = possible_end;
15268
15269                     /* And this is actually equivalent to having done the 2nd
15270                      * pass now, so set it to not try again */
15271                     possible_end = (char *) -1;
15272                 }
15273             }
15274             else {
15275                 if (*p == ':') {
15276                     has_terminating_colon = TRUE;
15277                 }
15278                 else if (*p == ';') {
15279                     has_semi_colon = TRUE;
15280                     has_terminating_colon = TRUE;
15281                 }
15282                 p++;
15283             }
15284         }
15285
15286     try_posix:
15287
15288         /* Here, we have a class name to look up.  We can short circuit the
15289          * stuff below for short names that can't possibly be meant to be a
15290          * class name.  (We can do this on the first pass, as any second pass
15291          * will yield an even shorter name) */
15292         if (name_len < 3) {
15293             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15294         }
15295
15296         /* Find which class it is.  Initially switch on the length of the name.
15297          * */
15298         switch (name_len) {
15299             case 4:
15300                 if (memEQs(name_start, 4, "word")) {
15301                     /* this is not POSIX, this is the Perl \w */
15302                     class_number = ANYOF_WORDCHAR;
15303                 }
15304                 break;
15305             case 5:
15306                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
15307                  *                        graph lower print punct space upper
15308                  * Offset 4 gives the best switch position.  */
15309                 switch (name_start[4]) {
15310                     case 'a':
15311                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
15312                             class_number = ANYOF_ALPHA;
15313                         break;
15314                     case 'e':
15315                         if (memBEGINs(name_start, 5, "spac")) /* space */
15316                             class_number = ANYOF_SPACE;
15317                         break;
15318                     case 'h':
15319                         if (memBEGINs(name_start, 5, "grap")) /* graph */
15320                             class_number = ANYOF_GRAPH;
15321                         break;
15322                     case 'i':
15323                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
15324                             class_number = ANYOF_ASCII;
15325                         break;
15326                     case 'k':
15327                         if (memBEGINs(name_start, 5, "blan")) /* blank */
15328                             class_number = ANYOF_BLANK;
15329                         break;
15330                     case 'l':
15331                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
15332                             class_number = ANYOF_CNTRL;
15333                         break;
15334                     case 'm':
15335                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
15336                             class_number = ANYOF_ALPHANUMERIC;
15337                         break;
15338                     case 'r':
15339                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
15340                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
15341                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
15342                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
15343                         break;
15344                     case 't':
15345                         if (memBEGINs(name_start, 5, "digi")) /* digit */
15346                             class_number = ANYOF_DIGIT;
15347                         else if (memBEGINs(name_start, 5, "prin")) /* print */
15348                             class_number = ANYOF_PRINT;
15349                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
15350                             class_number = ANYOF_PUNCT;
15351                         break;
15352                 }
15353                 break;
15354             case 6:
15355                 if (memEQs(name_start, 6, "xdigit"))
15356                     class_number = ANYOF_XDIGIT;
15357                 break;
15358         }
15359
15360         /* If the name exactly matches a posix class name the class number will
15361          * here be set to it, and the input almost certainly was meant to be a
15362          * posix class, so we can skip further checking.  If instead the syntax
15363          * is exactly correct, but the name isn't one of the legal ones, we
15364          * will return that as an error below.  But if neither of these apply,
15365          * it could be that no posix class was intended at all, or that one
15366          * was, but there was a typo.  We tease these apart by doing fuzzy
15367          * matching on the name */
15368         if (class_number == OOB_NAMEDCLASS && found_problem) {
15369             const UV posix_names[][6] = {
15370                                                 { 'a', 'l', 'n', 'u', 'm' },
15371                                                 { 'a', 'l', 'p', 'h', 'a' },
15372                                                 { 'a', 's', 'c', 'i', 'i' },
15373                                                 { 'b', 'l', 'a', 'n', 'k' },
15374                                                 { 'c', 'n', 't', 'r', 'l' },
15375                                                 { 'd', 'i', 'g', 'i', 't' },
15376                                                 { 'g', 'r', 'a', 'p', 'h' },
15377                                                 { 'l', 'o', 'w', 'e', 'r' },
15378                                                 { 'p', 'r', 'i', 'n', 't' },
15379                                                 { 'p', 'u', 'n', 'c', 't' },
15380                                                 { 's', 'p', 'a', 'c', 'e' },
15381                                                 { 'u', 'p', 'p', 'e', 'r' },
15382                                                 { 'w', 'o', 'r', 'd' },
15383                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
15384                                             };
15385             /* The names of the above all have added NULs to make them the same
15386              * size, so we need to also have the real lengths */
15387             const UV posix_name_lengths[] = {
15388                                                 sizeof("alnum") - 1,
15389                                                 sizeof("alpha") - 1,
15390                                                 sizeof("ascii") - 1,
15391                                                 sizeof("blank") - 1,
15392                                                 sizeof("cntrl") - 1,
15393                                                 sizeof("digit") - 1,
15394                                                 sizeof("graph") - 1,
15395                                                 sizeof("lower") - 1,
15396                                                 sizeof("print") - 1,
15397                                                 sizeof("punct") - 1,
15398                                                 sizeof("space") - 1,
15399                                                 sizeof("upper") - 1,
15400                                                 sizeof("word")  - 1,
15401                                                 sizeof("xdigit")- 1
15402                                             };
15403             unsigned int i;
15404             int temp_max = max_distance;    /* Use a temporary, so if we
15405                                                reparse, we haven't changed the
15406                                                outer one */
15407
15408             /* Use a smaller max edit distance if we are missing one of the
15409              * delimiters */
15410             if (   has_opening_bracket + has_opening_colon < 2
15411                 || has_terminating_bracket + has_terminating_colon < 2)
15412             {
15413                 temp_max--;
15414             }
15415
15416             /* See if the input name is close to a legal one */
15417             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
15418
15419                 /* Short circuit call if the lengths are too far apart to be
15420                  * able to match */
15421                 if (abs( (int) (name_len - posix_name_lengths[i]))
15422                     > temp_max)
15423                 {
15424                     continue;
15425                 }
15426
15427                 if (edit_distance(input_text,
15428                                   posix_names[i],
15429                                   name_len,
15430                                   posix_name_lengths[i],
15431                                   temp_max
15432                                  )
15433                     > -1)
15434                 { /* If it is close, it probably was intended to be a class */
15435                     goto probably_meant_to_be;
15436                 }
15437             }
15438
15439             /* Here the input name is not close enough to a valid class name
15440              * for us to consider it to be intended to be a posix class.  If
15441              * we haven't already done so, and the parse found a character that
15442              * could have been terminators for the name, but which we absorbed
15443              * as typos during the first pass, repeat the parse, signalling it
15444              * to stop at that character */
15445             if (possible_end && possible_end != (char *) -1) {
15446                 possible_end = (char *) -1;
15447                 p = name_start;
15448                 goto parse_name;
15449             }
15450
15451             /* Here neither pass found a close-enough class name */
15452             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15453         }
15454
15455     probably_meant_to_be:
15456
15457         /* Here we think that a posix specification was intended.  Update any
15458          * parse pointer */
15459         if (updated_parse_ptr) {
15460             *updated_parse_ptr = (char *) p;
15461         }
15462
15463         /* If a posix class name was intended but incorrectly specified, we
15464          * output or return the warnings */
15465         if (found_problem) {
15466
15467             /* We set flags for these issues in the parse loop above instead of
15468              * adding them to the list of warnings, because we can parse it
15469              * twice, and we only want one warning instance */
15470             if (has_upper) {
15471                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
15472             }
15473             if (has_blank) {
15474                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15475             }
15476             if (has_semi_colon) {
15477                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15478             }
15479             else if (! has_terminating_colon) {
15480                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
15481             }
15482             if (! has_terminating_bracket) {
15483                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
15484             }
15485
15486             if (   posix_warnings
15487                 && RExC_warn_text
15488                 && av_top_index(RExC_warn_text) > -1)
15489             {
15490                 *posix_warnings = RExC_warn_text;
15491             }
15492         }
15493         else if (class_number != OOB_NAMEDCLASS) {
15494             /* If it is a known class, return the class.  The class number
15495              * #defines are structured so each complement is +1 to the normal
15496              * one */
15497             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
15498         }
15499         else if (! check_only) {
15500
15501             /* Here, it is an unrecognized class.  This is an error (unless the
15502             * call is to check only, which we've already handled above) */
15503             const char * const complement_string = (complement)
15504                                                    ? "^"
15505                                                    : "";
15506             RExC_parse = (char *) p;
15507             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
15508                         complement_string,
15509                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
15510         }
15511     }
15512
15513     return OOB_NAMEDCLASS;
15514 }
15515 #undef ADD_POSIX_WARNING
15516
15517 STATIC unsigned  int
15518 S_regex_set_precedence(const U8 my_operator) {
15519
15520     /* Returns the precedence in the (?[...]) construct of the input operator,
15521      * specified by its character representation.  The precedence follows
15522      * general Perl rules, but it extends this so that ')' and ']' have (low)
15523      * precedence even though they aren't really operators */
15524
15525     switch (my_operator) {
15526         case '!':
15527             return 5;
15528         case '&':
15529             return 4;
15530         case '^':
15531         case '|':
15532         case '+':
15533         case '-':
15534             return 3;
15535         case ')':
15536             return 2;
15537         case ']':
15538             return 1;
15539     }
15540
15541     NOT_REACHED; /* NOTREACHED */
15542     return 0;   /* Silence compiler warning */
15543 }
15544
15545 STATIC regnode_offset
15546 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
15547                     I32 *flagp, U32 depth,
15548                     char * const oregcomp_parse)
15549 {
15550     /* Handle the (?[...]) construct to do set operations */
15551
15552     U8 curchar;                     /* Current character being parsed */
15553     UV start, end;                  /* End points of code point ranges */
15554     SV* final = NULL;               /* The end result inversion list */
15555     SV* result_string;              /* 'final' stringified */
15556     AV* stack;                      /* stack of operators and operands not yet
15557                                        resolved */
15558     AV* fence_stack = NULL;         /* A stack containing the positions in
15559                                        'stack' of where the undealt-with left
15560                                        parens would be if they were actually
15561                                        put there */
15562     /* The 'volatile' is a workaround for an optimiser bug
15563      * in Solaris Studio 12.3. See RT #127455 */
15564     volatile IV fence = 0;          /* Position of where most recent undealt-
15565                                        with left paren in stack is; -1 if none.
15566                                      */
15567     STRLEN len;                     /* Temporary */
15568     regnode_offset node;                  /* Temporary, and final regnode returned by
15569                                        this function */
15570     const bool save_fold = FOLD;    /* Temporary */
15571     char *save_end, *save_parse;    /* Temporaries */
15572     const bool in_locale = LOC;     /* we turn off /l during processing */
15573
15574     GET_RE_DEBUG_FLAGS_DECL;
15575
15576     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
15577
15578     DEBUG_PARSE("xcls");
15579
15580     if (in_locale) {
15581         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
15582     }
15583
15584     /* The use of this operator implies /u.  This is required so that the
15585      * compile time values are valid in all runtime cases */
15586     REQUIRE_UNI_RULES(flagp, 0);
15587
15588     ckWARNexperimental(RExC_parse,
15589                        WARN_EXPERIMENTAL__REGEX_SETS,
15590                        "The regex_sets feature is experimental");
15591
15592     /* Everything in this construct is a metacharacter.  Operands begin with
15593      * either a '\' (for an escape sequence), or a '[' for a bracketed
15594      * character class.  Any other character should be an operator, or
15595      * parenthesis for grouping.  Both types of operands are handled by calling
15596      * regclass() to parse them.  It is called with a parameter to indicate to
15597      * return the computed inversion list.  The parsing here is implemented via
15598      * a stack.  Each entry on the stack is a single character representing one
15599      * of the operators; or else a pointer to an operand inversion list. */
15600
15601 #define IS_OPERATOR(a) SvIOK(a)
15602 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
15603
15604     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
15605      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
15606      * with pronouncing it called it Reverse Polish instead, but now that YOU
15607      * know how to pronounce it you can use the correct term, thus giving due
15608      * credit to the person who invented it, and impressing your geek friends.
15609      * Wikipedia says that the pronounciation of "Ł" has been changing so that
15610      * it is now more like an English initial W (as in wonk) than an L.)
15611      *
15612      * This means that, for example, 'a | b & c' is stored on the stack as
15613      *
15614      * c  [4]
15615      * b  [3]
15616      * &  [2]
15617      * a  [1]
15618      * |  [0]
15619      *
15620      * where the numbers in brackets give the stack [array] element number.
15621      * In this implementation, parentheses are not stored on the stack.
15622      * Instead a '(' creates a "fence" so that the part of the stack below the
15623      * fence is invisible except to the corresponding ')' (this allows us to
15624      * replace testing for parens, by using instead subtraction of the fence
15625      * position).  As new operands are processed they are pushed onto the stack
15626      * (except as noted in the next paragraph).  New operators of higher
15627      * precedence than the current final one are inserted on the stack before
15628      * the lhs operand (so that when the rhs is pushed next, everything will be
15629      * in the correct positions shown above.  When an operator of equal or
15630      * lower precedence is encountered in parsing, all the stacked operations
15631      * of equal or higher precedence are evaluated, leaving the result as the
15632      * top entry on the stack.  This makes higher precedence operations
15633      * evaluate before lower precedence ones, and causes operations of equal
15634      * precedence to left associate.
15635      *
15636      * The only unary operator '!' is immediately pushed onto the stack when
15637      * encountered.  When an operand is encountered, if the top of the stack is
15638      * a '!", the complement is immediately performed, and the '!' popped.  The
15639      * resulting value is treated as a new operand, and the logic in the
15640      * previous paragraph is executed.  Thus in the expression
15641      *      [a] + ! [b]
15642      * the stack looks like
15643      *
15644      * !
15645      * a
15646      * +
15647      *
15648      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
15649      * becomes
15650      *
15651      * !b
15652      * a
15653      * +
15654      *
15655      * A ')' is treated as an operator with lower precedence than all the
15656      * aforementioned ones, which causes all operations on the stack above the
15657      * corresponding '(' to be evaluated down to a single resultant operand.
15658      * Then the fence for the '(' is removed, and the operand goes through the
15659      * algorithm above, without the fence.
15660      *
15661      * A separate stack is kept of the fence positions, so that the position of
15662      * the latest so-far unbalanced '(' is at the top of it.
15663      *
15664      * The ']' ending the construct is treated as the lowest operator of all,
15665      * so that everything gets evaluated down to a single operand, which is the
15666      * result */
15667
15668     sv_2mortal((SV *)(stack = newAV()));
15669     sv_2mortal((SV *)(fence_stack = newAV()));
15670
15671     while (RExC_parse < RExC_end) {
15672         I32 top_index;              /* Index of top-most element in 'stack' */
15673         SV** top_ptr;               /* Pointer to top 'stack' element */
15674         SV* current = NULL;         /* To contain the current inversion list
15675                                        operand */
15676         SV* only_to_avoid_leaks;
15677
15678         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15679                                 TRUE /* Force /x */ );
15680         if (RExC_parse >= RExC_end) {   /* Fail */
15681             break;
15682         }
15683
15684         curchar = UCHARAT(RExC_parse);
15685
15686 redo_curchar:
15687
15688 #ifdef ENABLE_REGEX_SETS_DEBUGGING
15689                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
15690         DEBUG_U(dump_regex_sets_structures(pRExC_state,
15691                                            stack, fence, fence_stack));
15692 #endif
15693
15694         top_index = av_tindex_skip_len_mg(stack);
15695
15696         switch (curchar) {
15697             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
15698             char stacked_operator;  /* The topmost operator on the 'stack'. */
15699             SV* lhs;                /* Operand to the left of the operator */
15700             SV* rhs;                /* Operand to the right of the operator */
15701             SV* fence_ptr;          /* Pointer to top element of the fence
15702                                        stack */
15703
15704             case '(':
15705
15706                 if (   RExC_parse < RExC_end - 2
15707                     && UCHARAT(RExC_parse + 1) == '?'
15708                     && UCHARAT(RExC_parse + 2) == '^')
15709                 {
15710                     /* If is a '(?', could be an embedded '(?^flags:(?[...])'.
15711                      * This happens when we have some thing like
15712                      *
15713                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
15714                      *   ...
15715                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
15716                      *
15717                      * Here we would be handling the interpolated
15718                      * '$thai_or_lao'.  We handle this by a recursive call to
15719                      * ourselves which returns the inversion list the
15720                      * interpolated expression evaluates to.  We use the flags
15721                      * from the interpolated pattern. */
15722                     U32 save_flags = RExC_flags;
15723                     const char * save_parse;
15724
15725                     RExC_parse += 2;        /* Skip past the '(?' */
15726                     save_parse = RExC_parse;
15727
15728                     /* Parse the flags for the '(?'.  We already know the first
15729                      * flag to parse is a '^' */
15730                     parse_lparen_question_flags(pRExC_state);
15731
15732                     if (   RExC_parse >= RExC_end - 4
15733                         || UCHARAT(RExC_parse) != ':'
15734                         || UCHARAT(++RExC_parse) != '('
15735                         || UCHARAT(++RExC_parse) != '?'
15736                         || UCHARAT(++RExC_parse) != '[')
15737                     {
15738
15739                         /* In combination with the above, this moves the
15740                          * pointer to the point just after the first erroneous
15741                          * character. */
15742                         if (RExC_parse >= RExC_end - 4) {
15743                             RExC_parse = RExC_end;
15744                         }
15745                         else if (RExC_parse != save_parse) {
15746                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15747                         }
15748                         vFAIL("Expecting '(?flags:(?[...'");
15749                     }
15750
15751                     /* Recurse, with the meat of the embedded expression */
15752                     RExC_parse++;
15753                     (void) handle_regex_sets(pRExC_state, &current, flagp,
15754                                                     depth+1, oregcomp_parse);
15755
15756                     /* Here, 'current' contains the embedded expression's
15757                      * inversion list, and RExC_parse points to the trailing
15758                      * ']'; the next character should be the ')' */
15759                     RExC_parse++;
15760                     if (UCHARAT(RExC_parse) != ')')
15761                         vFAIL("Expecting close paren for nested extended charclass");
15762
15763                     /* Then the ')' matching the original '(' handled by this
15764                      * case: statement */
15765                     RExC_parse++;
15766                     if (UCHARAT(RExC_parse) != ')')
15767                         vFAIL("Expecting close paren for wrapper for nested extended charclass");
15768
15769                     RExC_flags = save_flags;
15770                     goto handle_operand;
15771                 }
15772
15773                 /* A regular '('.  Look behind for illegal syntax */
15774                 if (top_index - fence >= 0) {
15775                     /* If the top entry on the stack is an operator, it had
15776                      * better be a '!', otherwise the entry below the top
15777                      * operand should be an operator */
15778                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
15779                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
15780                         || (   IS_OPERAND(*top_ptr)
15781                             && (   top_index - fence < 1
15782                                 || ! (stacked_ptr = av_fetch(stack,
15783                                                              top_index - 1,
15784                                                              FALSE))
15785                                 || ! IS_OPERATOR(*stacked_ptr))))
15786                     {
15787                         RExC_parse++;
15788                         vFAIL("Unexpected '(' with no preceding operator");
15789                     }
15790                 }
15791
15792                 /* Stack the position of this undealt-with left paren */
15793                 av_push(fence_stack, newSViv(fence));
15794                 fence = top_index + 1;
15795                 break;
15796
15797             case '\\':
15798                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
15799                  * multi-char folds are allowed.  */
15800                 if (!regclass(pRExC_state, flagp, depth+1,
15801                               TRUE, /* means parse just the next thing */
15802                               FALSE, /* don't allow multi-char folds */
15803                               FALSE, /* don't silence non-portable warnings.  */
15804                               TRUE,  /* strict */
15805                               FALSE, /* Require return to be an ANYOF */
15806                               &current))
15807                 {
15808                     FAIL2("panic: regclass returned failure to handle_sets, "
15809                           "flags=%#" UVxf, (UV) *flagp);
15810                 }
15811
15812                 /* regclass() will return with parsing just the \ sequence,
15813                  * leaving the parse pointer at the next thing to parse */
15814                 RExC_parse--;
15815                 goto handle_operand;
15816
15817             case '[':   /* Is a bracketed character class */
15818             {
15819                 /* See if this is a [:posix:] class. */
15820                 bool is_posix_class = (OOB_NAMEDCLASS
15821                             < handle_possible_posix(pRExC_state,
15822                                                 RExC_parse + 1,
15823                                                 NULL,
15824                                                 NULL,
15825                                                 TRUE /* checking only */));
15826                 /* If it is a posix class, leave the parse pointer at the '['
15827                  * to fool regclass() into thinking it is part of a
15828                  * '[[:posix:]]'. */
15829                 if (! is_posix_class) {
15830                     RExC_parse++;
15831                 }
15832
15833                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
15834                  * multi-char folds are allowed.  */
15835                 if (!regclass(pRExC_state, flagp, depth+1,
15836                                 is_posix_class, /* parse the whole char
15837                                                     class only if not a
15838                                                     posix class */
15839                                 FALSE, /* don't allow multi-char folds */
15840                                 TRUE, /* silence non-portable warnings. */
15841                                 TRUE, /* strict */
15842                                 FALSE, /* Require return to be an ANYOF */
15843                                 &current))
15844                 {
15845                     FAIL2("panic: regclass returned failure to handle_sets, "
15846                           "flags=%#" UVxf, (UV) *flagp);
15847                 }
15848
15849                 if (! current) {
15850                     break;
15851                 }
15852
15853                 /* function call leaves parse pointing to the ']', except if we
15854                  * faked it */
15855                 if (is_posix_class) {
15856                     RExC_parse--;
15857                 }
15858
15859                 goto handle_operand;
15860             }
15861
15862             case ']':
15863                 if (top_index >= 1) {
15864                     goto join_operators;
15865                 }
15866
15867                 /* Only a single operand on the stack: are done */
15868                 goto done;
15869
15870             case ')':
15871                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
15872                     if (UCHARAT(RExC_parse - 1) == ']')  {
15873                         break;
15874                     }
15875                     RExC_parse++;
15876                     vFAIL("Unexpected ')'");
15877                 }
15878
15879                 /* If nothing after the fence, is missing an operand */
15880                 if (top_index - fence < 0) {
15881                     RExC_parse++;
15882                     goto bad_syntax;
15883                 }
15884                 /* If at least two things on the stack, treat this as an
15885                   * operator */
15886                 if (top_index - fence >= 1) {
15887                     goto join_operators;
15888                 }
15889
15890                 /* Here only a single thing on the fenced stack, and there is a
15891                  * fence.  Get rid of it */
15892                 fence_ptr = av_pop(fence_stack);
15893                 assert(fence_ptr);
15894                 fence = SvIV(fence_ptr);
15895                 SvREFCNT_dec_NN(fence_ptr);
15896                 fence_ptr = NULL;
15897
15898                 if (fence < 0) {
15899                     fence = 0;
15900                 }
15901
15902                 /* Having gotten rid of the fence, we pop the operand at the
15903                  * stack top and process it as a newly encountered operand */
15904                 current = av_pop(stack);
15905                 if (IS_OPERAND(current)) {
15906                     goto handle_operand;
15907                 }
15908
15909                 RExC_parse++;
15910                 goto bad_syntax;
15911
15912             case '&':
15913             case '|':
15914             case '+':
15915             case '-':
15916             case '^':
15917
15918                 /* These binary operators should have a left operand already
15919                  * parsed */
15920                 if (   top_index - fence < 0
15921                     || top_index - fence == 1
15922                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
15923                     || ! IS_OPERAND(*top_ptr))
15924                 {
15925                     goto unexpected_binary;
15926                 }
15927
15928                 /* If only the one operand is on the part of the stack visible
15929                  * to us, we just place this operator in the proper position */
15930                 if (top_index - fence < 2) {
15931
15932                     /* Place the operator before the operand */
15933
15934                     SV* lhs = av_pop(stack);
15935                     av_push(stack, newSVuv(curchar));
15936                     av_push(stack, lhs);
15937                     break;
15938                 }
15939
15940                 /* But if there is something else on the stack, we need to
15941                  * process it before this new operator if and only if the
15942                  * stacked operation has equal or higher precedence than the
15943                  * new one */
15944
15945              join_operators:
15946
15947                 /* The operator on the stack is supposed to be below both its
15948                  * operands */
15949                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
15950                     || IS_OPERAND(*stacked_ptr))
15951                 {
15952                     /* But if not, it's legal and indicates we are completely
15953                      * done if and only if we're currently processing a ']',
15954                      * which should be the final thing in the expression */
15955                     if (curchar == ']') {
15956                         goto done;
15957                     }
15958
15959                   unexpected_binary:
15960                     RExC_parse++;
15961                     vFAIL2("Unexpected binary operator '%c' with no "
15962                            "preceding operand", curchar);
15963                 }
15964                 stacked_operator = (char) SvUV(*stacked_ptr);
15965
15966                 if (regex_set_precedence(curchar)
15967                     > regex_set_precedence(stacked_operator))
15968                 {
15969                     /* Here, the new operator has higher precedence than the
15970                      * stacked one.  This means we need to add the new one to
15971                      * the stack to await its rhs operand (and maybe more
15972                      * stuff).  We put it before the lhs operand, leaving
15973                      * untouched the stacked operator and everything below it
15974                      * */
15975                     lhs = av_pop(stack);
15976                     assert(IS_OPERAND(lhs));
15977
15978                     av_push(stack, newSVuv(curchar));
15979                     av_push(stack, lhs);
15980                     break;
15981                 }
15982
15983                 /* Here, the new operator has equal or lower precedence than
15984                  * what's already there.  This means the operation already
15985                  * there should be performed now, before the new one. */
15986
15987                 rhs = av_pop(stack);
15988                 if (! IS_OPERAND(rhs)) {
15989
15990                     /* This can happen when a ! is not followed by an operand,
15991                      * like in /(?[\t &!])/ */
15992                     goto bad_syntax;
15993                 }
15994
15995                 lhs = av_pop(stack);
15996
15997                 if (! IS_OPERAND(lhs)) {
15998
15999                     /* This can happen when there is an empty (), like in
16000                      * /(?[[0]+()+])/ */
16001                     goto bad_syntax;
16002                 }
16003
16004                 switch (stacked_operator) {
16005                     case '&':
16006                         _invlist_intersection(lhs, rhs, &rhs);
16007                         break;
16008
16009                     case '|':
16010                     case '+':
16011                         _invlist_union(lhs, rhs, &rhs);
16012                         break;
16013
16014                     case '-':
16015                         _invlist_subtract(lhs, rhs, &rhs);
16016                         break;
16017
16018                     case '^':   /* The union minus the intersection */
16019                     {
16020                         SV* i = NULL;
16021                         SV* u = NULL;
16022
16023                         _invlist_union(lhs, rhs, &u);
16024                         _invlist_intersection(lhs, rhs, &i);
16025                         _invlist_subtract(u, i, &rhs);
16026                         SvREFCNT_dec_NN(i);
16027                         SvREFCNT_dec_NN(u);
16028                         break;
16029                     }
16030                 }
16031                 SvREFCNT_dec(lhs);
16032
16033                 /* Here, the higher precedence operation has been done, and the
16034                  * result is in 'rhs'.  We overwrite the stacked operator with
16035                  * the result.  Then we redo this code to either push the new
16036                  * operator onto the stack or perform any higher precedence
16037                  * stacked operation */
16038                 only_to_avoid_leaks = av_pop(stack);
16039                 SvREFCNT_dec(only_to_avoid_leaks);
16040                 av_push(stack, rhs);
16041                 goto redo_curchar;
16042
16043             case '!':   /* Highest priority, right associative */
16044
16045                 /* If what's already at the top of the stack is another '!",
16046                  * they just cancel each other out */
16047                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
16048                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16049                 {
16050                     only_to_avoid_leaks = av_pop(stack);
16051                     SvREFCNT_dec(only_to_avoid_leaks);
16052                 }
16053                 else { /* Otherwise, since it's right associative, just push
16054                           onto the stack */
16055                     av_push(stack, newSVuv(curchar));
16056                 }
16057                 break;
16058
16059             default:
16060                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16061                 if (RExC_parse >= RExC_end) {
16062                     break;
16063                 }
16064                 vFAIL("Unexpected character");
16065
16066           handle_operand:
16067
16068             /* Here 'current' is the operand.  If something is already on the
16069              * stack, we have to check if it is a !.  But first, the code above
16070              * may have altered the stack in the time since we earlier set
16071              * 'top_index'.  */
16072
16073             top_index = av_tindex_skip_len_mg(stack);
16074             if (top_index - fence >= 0) {
16075                 /* If the top entry on the stack is an operator, it had better
16076                  * be a '!', otherwise the entry below the top operand should
16077                  * be an operator */
16078                 top_ptr = av_fetch(stack, top_index, FALSE);
16079                 assert(top_ptr);
16080                 if (IS_OPERATOR(*top_ptr)) {
16081
16082                     /* The only permissible operator at the top of the stack is
16083                      * '!', which is applied immediately to this operand. */
16084                     curchar = (char) SvUV(*top_ptr);
16085                     if (curchar != '!') {
16086                         SvREFCNT_dec(current);
16087                         vFAIL2("Unexpected binary operator '%c' with no "
16088                                 "preceding operand", curchar);
16089                     }
16090
16091                     _invlist_invert(current);
16092
16093                     only_to_avoid_leaks = av_pop(stack);
16094                     SvREFCNT_dec(only_to_avoid_leaks);
16095
16096                     /* And we redo with the inverted operand.  This allows
16097                      * handling multiple ! in a row */
16098                     goto handle_operand;
16099                 }
16100                           /* Single operand is ok only for the non-binary ')'
16101                            * operator */
16102                 else if ((top_index - fence == 0 && curchar != ')')
16103                          || (top_index - fence > 0
16104                              && (! (stacked_ptr = av_fetch(stack,
16105                                                            top_index - 1,
16106                                                            FALSE))
16107                                  || IS_OPERAND(*stacked_ptr))))
16108                 {
16109                     SvREFCNT_dec(current);
16110                     vFAIL("Operand with no preceding operator");
16111                 }
16112             }
16113
16114             /* Here there was nothing on the stack or the top element was
16115              * another operand.  Just add this new one */
16116             av_push(stack, current);
16117
16118         } /* End of switch on next parse token */
16119
16120         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16121     } /* End of loop parsing through the construct */
16122
16123     vFAIL("Syntax error in (?[...])");
16124
16125   done:
16126
16127     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16128         if (RExC_parse < RExC_end) {
16129             RExC_parse++;
16130         }
16131
16132         vFAIL("Unexpected ']' with no following ')' in (?[...");
16133     }
16134
16135     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16136         vFAIL("Unmatched (");
16137     }
16138
16139     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
16140         || ((final = av_pop(stack)) == NULL)
16141         || ! IS_OPERAND(final)
16142         || ! is_invlist(final)
16143         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
16144     {
16145       bad_syntax:
16146         SvREFCNT_dec(final);
16147         vFAIL("Incomplete expression within '(?[ ])'");
16148     }
16149
16150     /* Here, 'final' is the resultant inversion list from evaluating the
16151      * expression.  Return it if so requested */
16152     if (return_invlist) {
16153         *return_invlist = final;
16154         return END;
16155     }
16156
16157     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
16158      * expecting a string of ranges and individual code points */
16159     invlist_iterinit(final);
16160     result_string = newSVpvs("");
16161     while (invlist_iternext(final, &start, &end)) {
16162         if (start == end) {
16163             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16164         }
16165         else {
16166             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
16167                                                      start,          end);
16168         }
16169     }
16170
16171     /* About to generate an ANYOF (or similar) node from the inversion list we
16172      * have calculated */
16173     save_parse = RExC_parse;
16174     RExC_parse = SvPV(result_string, len);
16175     save_end = RExC_end;
16176     RExC_end = RExC_parse + len;
16177     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16178
16179     /* We turn off folding around the call, as the class we have constructed
16180      * already has all folding taken into consideration, and we don't want
16181      * regclass() to add to that */
16182     RExC_flags &= ~RXf_PMf_FOLD;
16183     /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16184      * folds are allowed.  */
16185     node = regclass(pRExC_state, flagp, depth+1,
16186                     FALSE, /* means parse the whole char class */
16187                     FALSE, /* don't allow multi-char folds */
16188                     TRUE, /* silence non-portable warnings.  The above may very
16189                              well have generated non-portable code points, but
16190                              they're valid on this machine */
16191                     FALSE, /* similarly, no need for strict */
16192                     FALSE, /* Require return to be an ANYOF */
16193                     NULL
16194                 );
16195
16196     RESTORE_WARNINGS;
16197     RExC_parse = save_parse + 1;
16198     RExC_end = save_end;
16199     SvREFCNT_dec_NN(final);
16200     SvREFCNT_dec_NN(result_string);
16201
16202     if (save_fold) {
16203         RExC_flags |= RXf_PMf_FOLD;
16204     }
16205
16206     if (!node)
16207         FAIL2("panic: regclass returned failure to handle_sets, flags=%#" UVxf,
16208                     PTR2UV(flagp));
16209
16210     /* Fix up the node type if we are in locale.  (We have pretended we are
16211      * under /u for the purposes of regclass(), as this construct will only
16212      * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
16213      * as to cause any warnings about bad locales to be output in regexec.c),
16214      * and add the flag that indicates to check if not in a UTF-8 locale.  The
16215      * reason we above forbid optimization into something other than an ANYOF
16216      * node is simply to minimize the number of code changes in regexec.c.
16217      * Otherwise we would have to create new EXACTish node types and deal with
16218      * them.  This decision could be revisited should this construct become
16219      * popular.
16220      *
16221      * (One might think we could look at the resulting ANYOF node and suppress
16222      * the flag if everything is above 255, as those would be UTF-8 only,
16223      * but this isn't true, as the components that led to that result could
16224      * have been locale-affected, and just happen to cancel each other out
16225      * under UTF-8 locales.) */
16226     if (in_locale) {
16227         set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16228
16229         assert(OP(REGNODE_p(node)) == ANYOF);
16230
16231         OP(REGNODE_p(node)) = ANYOFL;
16232         ANYOF_FLAGS(REGNODE_p(node))
16233                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16234     }
16235
16236     nextchar(pRExC_state);
16237     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16238     return node;
16239 }
16240
16241 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16242
16243 STATIC void
16244 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16245                              AV * stack, const IV fence, AV * fence_stack)
16246 {   /* Dumps the stacks in handle_regex_sets() */
16247
16248     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16249     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16250     SSize_t i;
16251
16252     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16253
16254     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16255
16256     if (stack_top < 0) {
16257         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16258     }
16259     else {
16260         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16261         for (i = stack_top; i >= 0; i--) {
16262             SV ** element_ptr = av_fetch(stack, i, FALSE);
16263             if (! element_ptr) {
16264             }
16265
16266             if (IS_OPERATOR(*element_ptr)) {
16267                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16268                                             (int) i, (int) SvIV(*element_ptr));
16269             }
16270             else {
16271                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16272                 sv_dump(*element_ptr);
16273             }
16274         }
16275     }
16276
16277     if (fence_stack_top < 0) {
16278         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
16279     }
16280     else {
16281         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
16282         for (i = fence_stack_top; i >= 0; i--) {
16283             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
16284             if (! element_ptr) {
16285             }
16286
16287             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
16288                                             (int) i, (int) SvIV(*element_ptr));
16289         }
16290     }
16291 }
16292
16293 #endif
16294
16295 #undef IS_OPERATOR
16296 #undef IS_OPERAND
16297
16298 STATIC void
16299 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
16300 {
16301     /* This adds the Latin1/above-Latin1 folding rules.
16302      *
16303      * This should be called only for a Latin1-range code points, cp, which is
16304      * known to be involved in a simple fold with other code points above
16305      * Latin1.  It would give false results if /aa has been specified.
16306      * Multi-char folds are outside the scope of this, and must be handled
16307      * specially. */
16308
16309     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
16310
16311     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
16312
16313     /* The rules that are valid for all Unicode versions are hard-coded in */
16314     switch (cp) {
16315         case 'k':
16316         case 'K':
16317           *invlist =
16318              add_cp_to_invlist(*invlist, KELVIN_SIGN);
16319             break;
16320         case 's':
16321         case 'S':
16322           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
16323             break;
16324         case MICRO_SIGN:
16325           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
16326           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
16327             break;
16328         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
16329         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
16330           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
16331             break;
16332         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
16333           *invlist = add_cp_to_invlist(*invlist,
16334                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
16335             break;
16336
16337         default:    /* Other code points are checked against the data for the
16338                        current Unicode version */
16339           {
16340             Size_t folds_count;
16341             unsigned int first_fold;
16342             const unsigned int * remaining_folds;
16343             UV folded_cp;
16344
16345             if (isASCII(cp)) {
16346                 folded_cp = toFOLD(cp);
16347             }
16348             else {
16349                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
16350                 Size_t dummy_len;
16351                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
16352             }
16353
16354             if (folded_cp > 255) {
16355                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
16356             }
16357
16358             folds_count = _inverse_folds(folded_cp, &first_fold,
16359                                                     &remaining_folds);
16360             if (folds_count == 0) {
16361
16362                 /* Use deprecated warning to increase the chances of this being
16363                  * output */
16364                 ckWARN2reg_d(RExC_parse,
16365                         "Perl folding rules are not up-to-date for 0x%02X;"
16366                         " please use the perlbug utility to report;", cp);
16367             }
16368             else {
16369                 unsigned int i;
16370
16371                 if (first_fold > 255) {
16372                     *invlist = add_cp_to_invlist(*invlist, first_fold);
16373                 }
16374                 for (i = 0; i < folds_count - 1; i++) {
16375                     if (remaining_folds[i] > 255) {
16376                         *invlist = add_cp_to_invlist(*invlist,
16377                                                     remaining_folds[i]);
16378                     }
16379                 }
16380             }
16381             break;
16382          }
16383     }
16384 }
16385
16386 STATIC void
16387 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
16388 {
16389     /* Output the elements of the array given by '*posix_warnings' as REGEXP
16390      * warnings. */
16391
16392     SV * msg;
16393     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
16394
16395     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
16396
16397     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
16398         return;
16399     }
16400
16401     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
16402         if (first_is_fatal) {           /* Avoid leaking this */
16403             av_undef(posix_warnings);   /* This isn't necessary if the
16404                                             array is mortal, but is a
16405                                             fail-safe */
16406             (void) sv_2mortal(msg);
16407             PREPARE_TO_DIE;
16408         }
16409         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
16410         SvREFCNT_dec_NN(msg);
16411     }
16412
16413     UPDATE_WARNINGS_LOC(RExC_parse);
16414 }
16415
16416 STATIC AV *
16417 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
16418 {
16419     /* This adds the string scalar <multi_string> to the array
16420      * <multi_char_matches>.  <multi_string> is known to have exactly
16421      * <cp_count> code points in it.  This is used when constructing a
16422      * bracketed character class and we find something that needs to match more
16423      * than a single character.
16424      *
16425      * <multi_char_matches> is actually an array of arrays.  Each top-level
16426      * element is an array that contains all the strings known so far that are
16427      * the same length.  And that length (in number of code points) is the same
16428      * as the index of the top-level array.  Hence, the [2] element is an
16429      * array, each element thereof is a string containing TWO code points;
16430      * while element [3] is for strings of THREE characters, and so on.  Since
16431      * this is for multi-char strings there can never be a [0] nor [1] element.
16432      *
16433      * When we rewrite the character class below, we will do so such that the
16434      * longest strings are written first, so that it prefers the longest
16435      * matching strings first.  This is done even if it turns out that any
16436      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
16437      * Christiansen has agreed that this is ok.  This makes the test for the
16438      * ligature 'ffi' come before the test for 'ff', for example */
16439
16440     AV* this_array;
16441     AV** this_array_ptr;
16442
16443     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
16444
16445     if (! multi_char_matches) {
16446         multi_char_matches = newAV();
16447     }
16448
16449     if (av_exists(multi_char_matches, cp_count)) {
16450         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
16451         this_array = *this_array_ptr;
16452     }
16453     else {
16454         this_array = newAV();
16455         av_store(multi_char_matches, cp_count,
16456                  (SV*) this_array);
16457     }
16458     av_push(this_array, multi_string);
16459
16460     return multi_char_matches;
16461 }
16462
16463 /* The names of properties whose definitions are not known at compile time are
16464  * stored in this SV, after a constant heading.  So if the length has been
16465  * changed since initialization, then there is a run-time definition. */
16466 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
16467                                         (SvCUR(listsv) != initial_listsv_len)
16468
16469 /* There is a restricted set of white space characters that are legal when
16470  * ignoring white space in a bracketed character class.  This generates the
16471  * code to skip them.
16472  *
16473  * There is a line below that uses the same white space criteria but is outside
16474  * this macro.  Both here and there must use the same definition */
16475 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
16476     STMT_START {                                                        \
16477         if (do_skip) {                                                  \
16478             while (isBLANK_A(UCHARAT(p)))                               \
16479             {                                                           \
16480                 p++;                                                    \
16481             }                                                           \
16482         }                                                               \
16483     } STMT_END
16484
16485 STATIC regnode_offset
16486 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
16487                  const bool stop_at_1,  /* Just parse the next thing, don't
16488                                            look for a full character class */
16489                  bool allow_multi_folds,
16490                  const bool silence_non_portable,   /* Don't output warnings
16491                                                        about too large
16492                                                        characters */
16493                  const bool strict,
16494                  bool optimizable,                  /* ? Allow a non-ANYOF return
16495                                                        node */
16496                  SV** ret_invlist  /* Return an inversion list, not a node */
16497           )
16498 {
16499     /* parse a bracketed class specification.  Most of these will produce an
16500      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
16501      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
16502      * under /i with multi-character folds: it will be rewritten following the
16503      * paradigm of this example, where the <multi-fold>s are characters which
16504      * fold to multiple character sequences:
16505      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
16506      * gets effectively rewritten as:
16507      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
16508      * reg() gets called (recursively) on the rewritten version, and this
16509      * function will return what it constructs.  (Actually the <multi-fold>s
16510      * aren't physically removed from the [abcdefghi], it's just that they are
16511      * ignored in the recursion by means of a flag:
16512      * <RExC_in_multi_char_class>.)
16513      *
16514      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
16515      * characters, with the corresponding bit set if that character is in the
16516      * list.  For characters above this, a range list or swash is used.  There
16517      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
16518      * determinable at compile time
16519      *
16520      * On success, returns the offset at which any next node should be placed
16521      * into the regex engine program being compiled.
16522      *
16523      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
16524      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
16525      * UTF-8
16526      */
16527
16528     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
16529     IV range = 0;
16530     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
16531     regnode_offset ret;
16532     STRLEN numlen;
16533     int namedclass = OOB_NAMEDCLASS;
16534     char *rangebegin = NULL;
16535     SV *listsv = NULL;
16536     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
16537                                       than just initialized.  */
16538     SV* properties = NULL;    /* Code points that match \p{} \P{} */
16539     SV* posixes = NULL;     /* Code points that match classes like [:word:],
16540                                extended beyond the Latin1 range.  These have to
16541                                be kept separate from other code points for much
16542                                of this function because their handling  is
16543                                different under /i, and for most classes under
16544                                /d as well */
16545     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
16546                                separate for a while from the non-complemented
16547                                versions because of complications with /d
16548                                matching */
16549     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
16550                                   treated more simply than the general case,
16551                                   leading to less compilation and execution
16552                                   work */
16553     UV element_count = 0;   /* Number of distinct elements in the class.
16554                                Optimizations may be possible if this is tiny */
16555     AV * multi_char_matches = NULL; /* Code points that fold to more than one
16556                                        character; used under /i */
16557     UV n;
16558     char * stop_ptr = RExC_end;    /* where to stop parsing */
16559
16560     /* ignore unescaped whitespace? */
16561     const bool skip_white = cBOOL(   ret_invlist
16562                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
16563
16564     /* Unicode properties are stored in a swash; this holds the current one
16565      * being parsed.  If this swash is the only above-latin1 component of the
16566      * character class, an optimization is to pass it directly on to the
16567      * execution engine.  Otherwise, it is set to NULL to indicate that there
16568      * are other things in the class that have to be dealt with at execution
16569      * time */
16570     SV* swash = NULL;           /* Code points that match \p{} \P{} */
16571
16572     /* inversion list of code points this node matches only when the target
16573      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
16574      * /d) */
16575     SV* upper_latin1_only_utf8_matches = NULL;
16576
16577     /* Inversion list of code points this node matches regardless of things
16578      * like locale, folding, utf8ness of the target string */
16579     SV* cp_list = NULL;
16580
16581     /* Like cp_list, but code points on this list need to be checked for things
16582      * that fold to/from them under /i */
16583     SV* cp_foldable_list = NULL;
16584
16585     /* Like cp_list, but code points on this list are valid only when the
16586      * runtime locale is UTF-8 */
16587     SV* only_utf8_locale_list = NULL;
16588
16589     /* In a range, if one of the endpoints is non-character-set portable,
16590      * meaning that it hard-codes a code point that may mean a different
16591      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
16592      * mnemonic '\t' which each mean the same character no matter which
16593      * character set the platform is on. */
16594     unsigned int non_portable_endpoint = 0;
16595
16596     /* Is the range unicode? which means on a platform that isn't 1-1 native
16597      * to Unicode (i.e. non-ASCII), each code point in it should be considered
16598      * to be a Unicode value.  */
16599     bool unicode_range = FALSE;
16600     bool invert = FALSE;    /* Is this class to be complemented */
16601
16602     bool warn_super = ALWAYS_WARN_SUPER;
16603
16604     const char * orig_parse = RExC_parse;
16605
16606     /* This variable is used to mark where the end in the input is of something
16607      * that looks like a POSIX construct but isn't.  During the parse, when
16608      * something looks like it could be such a construct is encountered, it is
16609      * checked for being one, but not if we've already checked this area of the
16610      * input.  Only after this position is reached do we check again */
16611     char *not_posix_region_end = RExC_parse - 1;
16612
16613     AV* posix_warnings = NULL;
16614     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
16615     U8 op = END;    /* The returned node-type, initialized to an impossible
16616                        one.  */
16617     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
16618     U32 posixl = 0;       /* bit field of posix classes matched under /l */
16619
16620
16621 /* Flags as to what things aren't knowable until runtime.  (Note that these are
16622  * mutually exclusive.) */
16623 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
16624                                             haven't been defined as of yet */
16625 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
16626                                             UTF-8 or not */
16627 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
16628                                             what gets folded */
16629     U32 has_runtime_dependency = 0;     /* OR of the above flags */
16630
16631     GET_RE_DEBUG_FLAGS_DECL;
16632
16633     PERL_ARGS_ASSERT_REGCLASS;
16634 #ifndef DEBUGGING
16635     PERL_UNUSED_ARG(depth);
16636 #endif
16637
16638
16639     /* If wants an inversion list returned, we can't optimize to something
16640      * else. */
16641     if (ret_invlist) {
16642         optimizable = FALSE;
16643     }
16644
16645     DEBUG_PARSE("clas");
16646
16647 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
16648     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
16649                                    && UNICODE_DOT_DOT_VERSION == 0)
16650     allow_multi_folds = FALSE;
16651 #endif
16652
16653     listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
16654     initial_listsv_len = SvCUR(listsv);
16655     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
16656
16657     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16658
16659     assert(RExC_parse <= RExC_end);
16660
16661     if (UCHARAT(RExC_parse) == '^') {   /* Complement the class */
16662         RExC_parse++;
16663         invert = TRUE;
16664         allow_multi_folds = FALSE;
16665         MARK_NAUGHTY(1);
16666         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16667     }
16668
16669     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
16670     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
16671         int maybe_class = handle_possible_posix(pRExC_state,
16672                                                 RExC_parse,
16673                                                 &not_posix_region_end,
16674                                                 NULL,
16675                                                 TRUE /* checking only */);
16676         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
16677             ckWARN4reg(not_posix_region_end,
16678                     "POSIX syntax [%c %c] belongs inside character classes%s",
16679                     *RExC_parse, *RExC_parse,
16680                     (maybe_class == OOB_NAMEDCLASS)
16681                     ? ((POSIXCC_NOTYET(*RExC_parse))
16682                         ? " (but this one isn't implemented)"
16683                         : " (but this one isn't fully valid)")
16684                     : ""
16685                     );
16686         }
16687     }
16688
16689     /* If the caller wants us to just parse a single element, accomplish this
16690      * by faking the loop ending condition */
16691     if (stop_at_1 && RExC_end > RExC_parse) {
16692         stop_ptr = RExC_parse + 1;
16693     }
16694
16695     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
16696     if (UCHARAT(RExC_parse) == ']')
16697         goto charclassloop;
16698
16699     while (1) {
16700
16701         if (   posix_warnings
16702             && av_tindex_skip_len_mg(posix_warnings) >= 0
16703             && RExC_parse > not_posix_region_end)
16704         {
16705             /* Warnings about posix class issues are considered tentative until
16706              * we are far enough along in the parse that we can no longer
16707              * change our mind, at which point we output them.  This is done
16708              * each time through the loop so that a later class won't zap them
16709              * before they have been dealt with. */
16710             output_posix_warnings(pRExC_state, posix_warnings);
16711         }
16712
16713         if  (RExC_parse >= stop_ptr) {
16714             break;
16715         }
16716
16717         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16718
16719         if  (UCHARAT(RExC_parse) == ']') {
16720             break;
16721         }
16722
16723       charclassloop:
16724
16725         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
16726         save_value = value;
16727         save_prevvalue = prevvalue;
16728
16729         if (!range) {
16730             rangebegin = RExC_parse;
16731             element_count++;
16732             non_portable_endpoint = 0;
16733         }
16734         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
16735             value = utf8n_to_uvchr((U8*)RExC_parse,
16736                                    RExC_end - RExC_parse,
16737                                    &numlen, UTF8_ALLOW_DEFAULT);
16738             RExC_parse += numlen;
16739         }
16740         else
16741             value = UCHARAT(RExC_parse++);
16742
16743         if (value == '[') {
16744             char * posix_class_end;
16745             namedclass = handle_possible_posix(pRExC_state,
16746                                                RExC_parse,
16747                                                &posix_class_end,
16748                                                do_posix_warnings ? &posix_warnings : NULL,
16749                                                FALSE    /* die if error */);
16750             if (namedclass > OOB_NAMEDCLASS) {
16751
16752                 /* If there was an earlier attempt to parse this particular
16753                  * posix class, and it failed, it was a false alarm, as this
16754                  * successful one proves */
16755                 if (   posix_warnings
16756                     && av_tindex_skip_len_mg(posix_warnings) >= 0
16757                     && not_posix_region_end >= RExC_parse
16758                     && not_posix_region_end <= posix_class_end)
16759                 {
16760                     av_undef(posix_warnings);
16761                 }
16762
16763                 RExC_parse = posix_class_end;
16764             }
16765             else if (namedclass == OOB_NAMEDCLASS) {
16766                 not_posix_region_end = posix_class_end;
16767             }
16768             else {
16769                 namedclass = OOB_NAMEDCLASS;
16770             }
16771         }
16772         else if (   RExC_parse - 1 > not_posix_region_end
16773                  && MAYBE_POSIXCC(value))
16774         {
16775             (void) handle_possible_posix(
16776                         pRExC_state,
16777                         RExC_parse - 1,  /* -1 because parse has already been
16778                                             advanced */
16779                         &not_posix_region_end,
16780                         do_posix_warnings ? &posix_warnings : NULL,
16781                         TRUE /* checking only */);
16782         }
16783         else if (  strict && ! skip_white
16784                  && (   _generic_isCC(value, _CC_VERTSPACE)
16785                      || is_VERTWS_cp_high(value)))
16786         {
16787             vFAIL("Literal vertical space in [] is illegal except under /x");
16788         }
16789         else if (value == '\\') {
16790             /* Is a backslash; get the code point of the char after it */
16791
16792             if (RExC_parse >= RExC_end) {
16793                 vFAIL("Unmatched [");
16794             }
16795
16796             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
16797                 value = utf8n_to_uvchr((U8*)RExC_parse,
16798                                    RExC_end - RExC_parse,
16799                                    &numlen, UTF8_ALLOW_DEFAULT);
16800                 RExC_parse += numlen;
16801             }
16802             else
16803                 value = UCHARAT(RExC_parse++);
16804
16805             /* Some compilers cannot handle switching on 64-bit integer
16806              * values, therefore value cannot be an UV.  Yes, this will
16807              * be a problem later if we want switch on Unicode.
16808              * A similar issue a little bit later when switching on
16809              * namedclass. --jhi */
16810
16811             /* If the \ is escaping white space when white space is being
16812              * skipped, it means that that white space is wanted literally, and
16813              * is already in 'value'.  Otherwise, need to translate the escape
16814              * into what it signifies. */
16815             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
16816
16817             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
16818             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
16819             case 's':   namedclass = ANYOF_SPACE;       break;
16820             case 'S':   namedclass = ANYOF_NSPACE;      break;
16821             case 'd':   namedclass = ANYOF_DIGIT;       break;
16822             case 'D':   namedclass = ANYOF_NDIGIT;      break;
16823             case 'v':   namedclass = ANYOF_VERTWS;      break;
16824             case 'V':   namedclass = ANYOF_NVERTWS;     break;
16825             case 'h':   namedclass = ANYOF_HORIZWS;     break;
16826             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
16827             case 'N':  /* Handle \N{NAME} in class */
16828                 {
16829                     const char * const backslash_N_beg = RExC_parse - 2;
16830                     int cp_count;
16831
16832                     if (! grok_bslash_N(pRExC_state,
16833                                         NULL,      /* No regnode */
16834                                         &value,    /* Yes single value */
16835                                         &cp_count, /* Multiple code pt count */
16836                                         flagp,
16837                                         strict,
16838                                         depth)
16839                     ) {
16840
16841                         if (*flagp & NEED_UTF8)
16842                             FAIL("panic: grok_bslash_N set NEED_UTF8");
16843
16844                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
16845
16846                         if (cp_count < 0) {
16847                             vFAIL("\\N in a character class must be a named character: \\N{...}");
16848                         }
16849                         else if (cp_count == 0) {
16850                             ckWARNreg(RExC_parse,
16851                               "Ignoring zero length \\N{} in character class");
16852                         }
16853                         else { /* cp_count > 1 */
16854                             if (! RExC_in_multi_char_class) {
16855                                 if (invert || range || *RExC_parse == '-') {
16856                                     if (strict) {
16857                                         RExC_parse--;
16858                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
16859                                     }
16860                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
16861                                     break; /* <value> contains the first code
16862                                               point. Drop out of the switch to
16863                                               process it */
16864                                 }
16865                                 else {
16866                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
16867                                                  RExC_parse - backslash_N_beg);
16868                                     multi_char_matches
16869                                         = add_multi_match(multi_char_matches,
16870                                                           multi_char_N,
16871                                                           cp_count);
16872                                 }
16873                             }
16874                         } /* End of cp_count != 1 */
16875
16876                         /* This element should not be processed further in this
16877                          * class */
16878                         element_count--;
16879                         value = save_value;
16880                         prevvalue = save_prevvalue;
16881                         continue;   /* Back to top of loop to get next char */
16882                     }
16883
16884                     /* Here, is a single code point, and <value> contains it */
16885                     unicode_range = TRUE;   /* \N{} are Unicode */
16886                 }
16887                 break;
16888             case 'p':
16889             case 'P':
16890                 {
16891                 char *e;
16892                 char *i;
16893
16894                 /* We will handle any undefined properties ourselves */
16895                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
16896                                        /* And we actually would prefer to get
16897                                         * the straight inversion list of the
16898                                         * swash, since we will be accessing it
16899                                         * anyway, to save a little time */
16900                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
16901
16902                 SvREFCNT_dec(swash); /* Free any left-overs */
16903
16904                 /* \p means they want Unicode semantics */
16905                 REQUIRE_UNI_RULES(flagp, 0);
16906
16907                 if (RExC_parse >= RExC_end)
16908                     vFAIL2("Empty \\%c", (U8)value);
16909                 if (*RExC_parse == '{') {
16910                     const U8 c = (U8)value;
16911                     e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
16912                     if (!e) {
16913                         RExC_parse++;
16914                         vFAIL2("Missing right brace on \\%c{}", c);
16915                     }
16916
16917                     RExC_parse++;
16918
16919                     /* White space is allowed adjacent to the braces and after
16920                      * any '^', even when not under /x */
16921                     while (isSPACE(*RExC_parse)) {
16922                          RExC_parse++;
16923                     }
16924
16925                     if (UCHARAT(RExC_parse) == '^') {
16926
16927                         /* toggle.  (The rhs xor gets the single bit that
16928                          * differs between P and p; the other xor inverts just
16929                          * that bit) */
16930                         value ^= 'P' ^ 'p';
16931
16932                         RExC_parse++;
16933                         while (isSPACE(*RExC_parse)) {
16934                             RExC_parse++;
16935                         }
16936                     }
16937
16938                     if (e == RExC_parse)
16939                         vFAIL2("Empty \\%c{}", c);
16940
16941                     n = e - RExC_parse;
16942                     while (isSPACE(*(RExC_parse + n - 1)))
16943                         n--;
16944
16945                 }   /* The \p isn't immediately followed by a '{' */
16946                 else if (! isALPHA(*RExC_parse)) {
16947                     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16948                     vFAIL2("Character following \\%c must be '{' or a "
16949                            "single-character Unicode property name",
16950                            (U8) value);
16951                 }
16952                 else {
16953                     e = RExC_parse;
16954                     n = 1;
16955                 }
16956                 {
16957                     char* name = RExC_parse;
16958                     char* base_name;    /* name after any packages are stripped */
16959                     char* lookup_name = NULL;
16960                     const char * const colon_colon = "::";
16961                     bool invert;
16962
16963                     SV* invlist;
16964
16965                     /* Temporary workaround for [perl #133136].  For this
16966                     * precise input that is in the .t that is failing, load
16967                     * utf8.pm, which is what the test wants, so that that
16968                     * .t passes */
16969                     if (     memEQs(RExC_start, e + 1 - RExC_start,
16970                                     "foo\\p{Alnum}")
16971                         && ! hv_common(GvHVn(PL_incgv),
16972                                        NULL,
16973                                        "utf8.pm", sizeof("utf8.pm") - 1,
16974                                        0, HV_FETCH_ISEXISTS, NULL, 0))
16975                     {
16976                         require_pv("utf8.pm");
16977                     }
16978                     invlist = parse_uniprop_string(name, n, FOLD, &invert);
16979                     if (invlist) {
16980                         if (invert) {
16981                             value ^= 'P' ^ 'p';
16982                         }
16983                     }
16984                     else {
16985
16986                     /* Try to get the definition of the property into
16987                      * <invlist>.  If /i is in effect, the effective property
16988                      * will have its name be <__NAME_i>.  The design is
16989                      * discussed in commit
16990                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
16991                     name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
16992                     SAVEFREEPV(name);
16993
16994                     for (i = RExC_parse; i < RExC_parse + n; i++) {
16995                         if (isCNTRL(*i) && *i != '\t') {
16996                             RExC_parse = e + 1;
16997                             vFAIL2("Can't find Unicode property definition \"%s\"", name);
16998                         }
16999                     }
17000
17001                     if (FOLD) {
17002                         lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
17003
17004                         /* The function call just below that uses this can fail
17005                          * to return, leaking memory if we don't do this */
17006                         SAVEFREEPV(lookup_name);
17007                     }
17008
17009                     /* Look up the property name, and get its swash and
17010                      * inversion list, if the property is found  */
17011                     swash = _core_swash_init("utf8",
17012                                              (lookup_name)
17013                                               ? lookup_name
17014                                               : name,
17015                                              &PL_sv_undef,
17016                                              1, /* binary */
17017                                              0, /* not tr/// */
17018                                              NULL, /* No inversion list */
17019                                              &swash_init_flags
17020                                             );
17021                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
17022                         HV* curpkg = (IN_PERL_COMPILETIME)
17023                                       ? PL_curstash
17024                                       : CopSTASH(PL_curcop);
17025                         UV final_n = n;
17026                         bool has_pkg;
17027
17028                         if (swash) {    /* Got a swash but no inversion list.
17029                                            Something is likely wrong that will
17030                                            be sorted-out later */
17031                             SvREFCNT_dec_NN(swash);
17032                             swash = NULL;
17033                         }
17034
17035                         /* Here didn't find it.  It could be a an error (like a
17036                          * typo) in specifying a Unicode property, or it could
17037                          * be a user-defined property that will be available at
17038                          * run-time.  The names of these must begin with 'In'
17039                          * or 'Is' (after any packages are stripped off).  So
17040                          * if not one of those, or if we accept only
17041                          * compile-time properties, is an error; otherwise add
17042                          * it to the list for run-time look up. */
17043                         if ((base_name = rninstr(name, name + n,
17044                                                  colon_colon, colon_colon + 2)))
17045                         { /* Has ::.  We know this must be a user-defined
17046                              property */
17047                             base_name += 2;
17048                             final_n -= base_name - name;
17049                             has_pkg = TRUE;
17050                         }
17051                         else {
17052                             base_name = name;
17053                             has_pkg = FALSE;
17054                         }
17055
17056                         if (   final_n < 3
17057                             || base_name[0] != 'I'
17058                             || (base_name[1] != 's' && base_name[1] != 'n')
17059                             || ret_invlist)
17060                         {
17061                             const char * const msg
17062                                 = (has_pkg)
17063                                   ? "Illegal user-defined property name"
17064                                   : "Can't find Unicode property definition";
17065                             RExC_parse = e + 1;
17066
17067                             /* diag_listed_as: Can't find Unicode property definition "%s" */
17068                             vFAIL3utf8f("%s \"%" UTF8f "\"",
17069                                 msg, UTF8fARG(UTF, n, name));
17070                         }
17071
17072                         /* If the property name doesn't already have a package
17073                          * name, add the current one to it so that it can be
17074                          * referred to outside it. [perl #121777] */
17075                         if (! has_pkg && curpkg) {
17076                             char* pkgname = HvNAME(curpkg);
17077                             if (memNEs(pkgname, HvNAMELEN(curpkg), "main")) {
17078                                 char* full_name = Perl_form(aTHX_
17079                                                             "%s::%s",
17080                                                             pkgname,
17081                                                             name);
17082                                 n = strlen(full_name);
17083                                 name = savepvn(full_name, n);
17084                                 SAVEFREEPV(name);
17085                             }
17086                         }
17087                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n",
17088                                         (value == 'p' ? '+' : '!'),
17089                                         (FOLD) ? "__" : "",
17090                                         UTF8fARG(UTF, n, name),
17091                                         (FOLD) ? "_i" : "");
17092                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17093
17094                         /* We don't know yet what this matches, so have to flag
17095                          * it */
17096                         anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17097                     }
17098                     else {
17099
17100                         /* Here, did get the swash and its inversion list.  If
17101                          * the swash is from a user-defined property, then this
17102                          * whole character class should be regarded as such */
17103                         if (swash_init_flags
17104                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
17105                         {
17106                             has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17107                         }
17108                     }
17109                     }
17110                     if (invlist) {
17111                         if (! (has_runtime_dependency
17112                                                 & HAS_USER_DEFINED_PROPERTY) &&
17113                             /* We warn on matching an above-Unicode code point
17114                              * if the match would return true, except don't
17115                              * warn for \p{All}, which has exactly one element
17116                              * = 0 */
17117                             (_invlist_contains_cp(invlist, 0x110000)
17118                                 && (! (_invlist_len(invlist) == 1
17119                                        && *invlist_array(invlist) == 0))))
17120                         {
17121                             warn_super = TRUE;
17122                         }
17123
17124                         /* Invert if asking for the complement */
17125                         if (value == 'P') {
17126                             _invlist_union_complement_2nd(properties,
17127                                                           invlist,
17128                                                           &properties);
17129
17130                             /* The swash can't be used as-is, because we've
17131                              * inverted things; delay removing it to here after
17132                              * have copied its invlist above */
17133                             if (! swash) {
17134                                 SvREFCNT_dec_NN(invlist);
17135                             }
17136                             SvREFCNT_dec(swash);
17137                             swash = NULL;
17138                         }
17139                         else {
17140                             _invlist_union(properties, invlist, &properties);
17141                             if (! swash) {
17142                                 SvREFCNT_dec_NN(invlist);
17143                             }
17144                         }
17145                     }
17146                 }
17147
17148                 RExC_parse = e + 1;
17149                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
17150                                                 named */
17151                 }
17152                 break;
17153             case 'n':   value = '\n';                   break;
17154             case 'r':   value = '\r';                   break;
17155             case 't':   value = '\t';                   break;
17156             case 'f':   value = '\f';                   break;
17157             case 'b':   value = '\b';                   break;
17158             case 'e':   value = ESC_NATIVE;             break;
17159             case 'a':   value = '\a';                   break;
17160             case 'o':
17161                 RExC_parse--;   /* function expects to be pointed at the 'o' */
17162                 {
17163                     const char* error_msg;
17164                     bool valid = grok_bslash_o(&RExC_parse,
17165                                                RExC_end,
17166                                                &value,
17167                                                &error_msg,
17168                                                TO_OUTPUT_WARNINGS(RExC_parse),
17169                                                strict,
17170                                                silence_non_portable,
17171                                                UTF);
17172                     if (! valid) {
17173                         vFAIL(error_msg);
17174                     }
17175                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17176                 }
17177                 non_portable_endpoint++;
17178                 break;
17179             case 'x':
17180                 RExC_parse--;   /* function expects to be pointed at the 'x' */
17181                 {
17182                     const char* error_msg;
17183                     bool valid = grok_bslash_x(&RExC_parse,
17184                                                RExC_end,
17185                                                &value,
17186                                                &error_msg,
17187                                                TO_OUTPUT_WARNINGS(RExC_parse),
17188                                                strict,
17189                                                silence_non_portable,
17190                                                UTF);
17191                     if (! valid) {
17192                         vFAIL(error_msg);
17193                     }
17194                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17195                 }
17196                 non_portable_endpoint++;
17197                 break;
17198             case 'c':
17199                 value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
17200                 UPDATE_WARNINGS_LOC(RExC_parse);
17201                 RExC_parse++;
17202                 non_portable_endpoint++;
17203                 break;
17204             case '0': case '1': case '2': case '3': case '4':
17205             case '5': case '6': case '7':
17206                 {
17207                     /* Take 1-3 octal digits */
17208                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
17209                     numlen = (strict) ? 4 : 3;
17210                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17211                     RExC_parse += numlen;
17212                     if (numlen != 3) {
17213                         if (strict) {
17214                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
17215                             vFAIL("Need exactly 3 octal digits");
17216                         }
17217                         else if (   numlen < 3 /* like \08, \178 */
17218                                  && RExC_parse < RExC_end
17219                                  && isDIGIT(*RExC_parse)
17220                                  && ckWARN(WARN_REGEXP))
17221                         {
17222                             reg_warn_non_literal_string(
17223                                  RExC_parse + 1,
17224                                  form_short_octal_warning(RExC_parse, numlen));
17225                         }
17226                     }
17227                     non_portable_endpoint++;
17228                     break;
17229                 }
17230             default:
17231                 /* Allow \_ to not give an error */
17232                 if (isWORDCHAR(value) && value != '_') {
17233                     if (strict) {
17234                         vFAIL2("Unrecognized escape \\%c in character class",
17235                                (int)value);
17236                     }
17237                     else {
17238                         ckWARN2reg(RExC_parse,
17239                             "Unrecognized escape \\%c in character class passed through",
17240                             (int)value);
17241                     }
17242                 }
17243                 break;
17244             }   /* End of switch on char following backslash */
17245         } /* end of handling backslash escape sequences */
17246
17247         /* Here, we have the current token in 'value' */
17248
17249         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17250             U8 classnum;
17251
17252             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
17253              * literal, as is the character that began the false range, i.e.
17254              * the 'a' in the examples */
17255             if (range) {
17256                 const int w = (RExC_parse >= rangebegin)
17257                                 ? RExC_parse - rangebegin
17258                                 : 0;
17259                 if (strict) {
17260                     vFAIL2utf8f(
17261                         "False [] range \"%" UTF8f "\"",
17262                         UTF8fARG(UTF, w, rangebegin));
17263                 }
17264                 else {
17265                     ckWARN2reg(RExC_parse,
17266                         "False [] range \"%" UTF8f "\"",
17267                         UTF8fARG(UTF, w, rangebegin));
17268                     cp_list = add_cp_to_invlist(cp_list, '-');
17269                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
17270                                                             prevvalue);
17271                 }
17272
17273                 range = 0; /* this was not a true range */
17274                 element_count += 2; /* So counts for three values */
17275             }
17276
17277             classnum = namedclass_to_classnum(namedclass);
17278
17279             if (LOC && namedclass < ANYOF_POSIXL_MAX
17280 #ifndef HAS_ISASCII
17281                 && classnum != _CC_ASCII
17282 #endif
17283             ) {
17284                 SV* scratch_list = NULL;
17285
17286                 /* What the Posix classes (like \w, [:space:]) match in locale
17287                  * isn't knowable under locale until actual match time.  A
17288                  * special node is used for these which has extra space for a
17289                  * bitmap, with a bit reserved for each named class that is to
17290                  * be matched against.  This isn't needed for \p{} and
17291                  * pseudo-classes, as they are not affected by locale, and
17292                  * hence are dealt with separately */
17293                 POSIXL_SET(posixl, namedclass);
17294                 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
17295                 anyof_flags |= ANYOF_MATCHES_POSIXL;
17296
17297                 /* The above-Latin1 characters are not subject to locale rules.
17298                  * Just add them to the unconditionally-matched list */
17299
17300                 /* Get the list of the above-Latin1 code points this matches */
17301                 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
17302                                         PL_XPosix_ptrs[classnum],
17303
17304                                         /* Odd numbers are complements, like
17305                                         * NDIGIT, NASCII, ... */
17306                                         namedclass % 2 != 0,
17307                                         &scratch_list);
17308                 /* Checking if 'cp_list' is NULL first saves an extra clone.
17309                  * Its reference count will be decremented at the next union,
17310                  * etc, or if this is the only instance, at the end of the
17311                  * routine */
17312                 if (! cp_list) {
17313                     cp_list = scratch_list;
17314                 }
17315                 else {
17316                     _invlist_union(cp_list, scratch_list, &cp_list);
17317                     SvREFCNT_dec_NN(scratch_list);
17318                 }
17319                 continue;   /* Go get next character */
17320             }
17321             else {
17322
17323                 /* Here, is not /l, or is a POSIX class for which /l doesn't
17324                  * matter (or is a Unicode property, which is skipped here). */
17325                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
17326                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
17327
17328                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
17329                          * nor /l make a difference in what these match,
17330                          * therefore we just add what they match to cp_list. */
17331                         if (classnum != _CC_VERTSPACE) {
17332                             assert(   namedclass == ANYOF_HORIZWS
17333                                    || namedclass == ANYOF_NHORIZWS);
17334
17335                             /* It turns out that \h is just a synonym for
17336                              * XPosixBlank */
17337                             classnum = _CC_BLANK;
17338                         }
17339
17340                         _invlist_union_maybe_complement_2nd(
17341                                 cp_list,
17342                                 PL_XPosix_ptrs[classnum],
17343                                 namedclass % 2 != 0,    /* Complement if odd
17344                                                           (NHORIZWS, NVERTWS)
17345                                                         */
17346                                 &cp_list);
17347                     }
17348                 }
17349                 else if (   AT_LEAST_UNI_SEMANTICS
17350                          || classnum == _CC_ASCII
17351                          || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
17352                                                    || classnum == _CC_XDIGIT)))
17353                 {
17354                     /* We usually have to worry about /d affecting what POSIX
17355                      * classes match, with special code needed because we won't
17356                      * know until runtime what all matches.  But there is no
17357                      * extra work needed under /u and /a; and [:ascii:] is
17358                      * unaffected by /d; and :digit: and :xdigit: don't have
17359                      * runtime differences under /d.  So we can special case
17360                      * these, and avoid some extra work below, and at runtime.
17361                      * */
17362                     _invlist_union_maybe_complement_2nd(
17363                                                      simple_posixes,
17364                                                       ((AT_LEAST_ASCII_RESTRICTED)
17365                                                        ? PL_Posix_ptrs[classnum]
17366                                                        : PL_XPosix_ptrs[classnum]),
17367                                                      namedclass % 2 != 0,
17368                                                      &simple_posixes);
17369                 }
17370                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
17371                            complement and use nposixes */
17372                     SV** posixes_ptr = namedclass % 2 == 0
17373                                        ? &posixes
17374                                        : &nposixes;
17375                     _invlist_union_maybe_complement_2nd(
17376                                                      *posixes_ptr,
17377                                                      PL_XPosix_ptrs[classnum],
17378                                                      namedclass % 2 != 0,
17379                                                      posixes_ptr);
17380                 }
17381             }
17382         } /* end of namedclass \blah */
17383
17384         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17385
17386         /* If 'range' is set, 'value' is the ending of a range--check its
17387          * validity.  (If value isn't a single code point in the case of a
17388          * range, we should have figured that out above in the code that
17389          * catches false ranges).  Later, we will handle each individual code
17390          * point in the range.  If 'range' isn't set, this could be the
17391          * beginning of a range, so check for that by looking ahead to see if
17392          * the next real character to be processed is the range indicator--the
17393          * minus sign */
17394
17395         if (range) {
17396 #ifdef EBCDIC
17397             /* For unicode ranges, we have to test that the Unicode as opposed
17398              * to the native values are not decreasing.  (Above 255, there is
17399              * no difference between native and Unicode) */
17400             if (unicode_range && prevvalue < 255 && value < 255) {
17401                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
17402                     goto backwards_range;
17403                 }
17404             }
17405             else
17406 #endif
17407             if (prevvalue > value) /* b-a */ {
17408                 int w;
17409 #ifdef EBCDIC
17410               backwards_range:
17411 #endif
17412                 w = RExC_parse - rangebegin;
17413                 vFAIL2utf8f(
17414                     "Invalid [] range \"%" UTF8f "\"",
17415                     UTF8fARG(UTF, w, rangebegin));
17416                 NOT_REACHED; /* NOTREACHED */
17417             }
17418         }
17419         else {
17420             prevvalue = value; /* save the beginning of the potential range */
17421             if (! stop_at_1     /* Can't be a range if parsing just one thing */
17422                 && *RExC_parse == '-')
17423             {
17424                 char* next_char_ptr = RExC_parse + 1;
17425
17426                 /* Get the next real char after the '-' */
17427                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
17428
17429                 /* If the '-' is at the end of the class (just before the ']',
17430                  * it is a literal minus; otherwise it is a range */
17431                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
17432                     RExC_parse = next_char_ptr;
17433
17434                     /* a bad range like \w-, [:word:]- ? */
17435                     if (namedclass > OOB_NAMEDCLASS) {
17436                         if (strict || ckWARN(WARN_REGEXP)) {
17437                             const int w = RExC_parse >= rangebegin
17438                                           ?  RExC_parse - rangebegin
17439                                           : 0;
17440                             if (strict) {
17441                                 vFAIL4("False [] range \"%*.*s\"",
17442                                     w, w, rangebegin);
17443                             }
17444                             else {
17445                                 vWARN4(RExC_parse,
17446                                     "False [] range \"%*.*s\"",
17447                                     w, w, rangebegin);
17448                             }
17449                         }
17450                         cp_list = add_cp_to_invlist(cp_list, '-');
17451                         element_count++;
17452                     } else
17453                         range = 1;      /* yeah, it's a range! */
17454                     continue;   /* but do it the next time */
17455                 }
17456             }
17457         }
17458
17459         if (namedclass > OOB_NAMEDCLASS) {
17460             continue;
17461         }
17462
17463         /* Here, we have a single value this time through the loop, and
17464          * <prevvalue> is the beginning of the range, if any; or <value> if
17465          * not. */
17466
17467         /* non-Latin1 code point implies unicode semantics. */
17468         if (value > 255) {
17469             REQUIRE_UNI_RULES(flagp, 0);
17470         }
17471
17472         /* Ready to process either the single value, or the completed range.
17473          * For single-valued non-inverted ranges, we consider the possibility
17474          * of multi-char folds.  (We made a conscious decision to not do this
17475          * for the other cases because it can often lead to non-intuitive
17476          * results.  For example, you have the peculiar case that:
17477          *  "s s" =~ /^[^\xDF]+$/i => Y
17478          *  "ss"  =~ /^[^\xDF]+$/i => N
17479          *
17480          * See [perl #89750] */
17481         if (FOLD && allow_multi_folds && value == prevvalue) {
17482             if (    value == LATIN_SMALL_LETTER_SHARP_S
17483                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17484                                                         value)))
17485             {
17486                 /* Here <value> is indeed a multi-char fold.  Get what it is */
17487
17488                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17489                 STRLEN foldlen;
17490
17491                 UV folded = _to_uni_fold_flags(
17492                                 value,
17493                                 foldbuf,
17494                                 &foldlen,
17495                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17496                                                    ? FOLD_FLAGS_NOMIX_ASCII
17497                                                    : 0)
17498                                 );
17499
17500                 /* Here, <folded> should be the first character of the
17501                  * multi-char fold of <value>, with <foldbuf> containing the
17502                  * whole thing.  But, if this fold is not allowed (because of
17503                  * the flags), <fold> will be the same as <value>, and should
17504                  * be processed like any other character, so skip the special
17505                  * handling */
17506                 if (folded != value) {
17507
17508                     /* Skip if we are recursed, currently parsing the class
17509                      * again.  Otherwise add this character to the list of
17510                      * multi-char folds. */
17511                     if (! RExC_in_multi_char_class) {
17512                         STRLEN cp_count = utf8_length(foldbuf,
17513                                                       foldbuf + foldlen);
17514                         SV* multi_fold = sv_2mortal(newSVpvs(""));
17515
17516                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17517
17518                         multi_char_matches
17519                                         = add_multi_match(multi_char_matches,
17520                                                           multi_fold,
17521                                                           cp_count);
17522
17523                     }
17524
17525                     /* This element should not be processed further in this
17526                      * class */
17527                     element_count--;
17528                     value = save_value;
17529                     prevvalue = save_prevvalue;
17530                     continue;
17531                 }
17532             }
17533         }
17534
17535         if (strict && ckWARN(WARN_REGEXP)) {
17536             if (range) {
17537
17538                 /* If the range starts above 255, everything is portable and
17539                  * likely to be so for any forseeable character set, so don't
17540                  * warn. */
17541                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
17542                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
17543                 }
17544                 else if (prevvalue != value) {
17545
17546                     /* Under strict, ranges that stop and/or end in an ASCII
17547                      * printable should have each end point be a portable value
17548                      * for it (preferably like 'A', but we don't warn if it is
17549                      * a (portable) Unicode name or code point), and the range
17550                      * must be be all digits or all letters of the same case.
17551                      * Otherwise, the range is non-portable and unclear as to
17552                      * what it contains */
17553                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
17554                         && (          non_portable_endpoint
17555                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
17556                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
17557                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
17558                     ))) {
17559                         vWARN(RExC_parse, "Ranges of ASCII printables should"
17560                                           " be some subset of \"0-9\","
17561                                           " \"A-Z\", or \"a-z\"");
17562                     }
17563                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
17564                         SSize_t index_start;
17565                         SSize_t index_final;
17566
17567                         /* But the nature of Unicode and languages mean we
17568                          * can't do the same checks for above-ASCII ranges,
17569                          * except in the case of digit ones.  These should
17570                          * contain only digits from the same group of 10.  The
17571                          * ASCII case is handled just above.  Hence here, the
17572                          * range could be a range of digits.  First some
17573                          * unlikely special cases.  Grandfather in that a range
17574                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
17575                          * if its starting value is one of the 10 digits prior
17576                          * to it.  This is because it is an alternate way of
17577                          * writing 19D1, and some people may expect it to be in
17578                          * that group.  But it is bad, because it won't give
17579                          * the expected results.  In Unicode 5.2 it was
17580                          * considered to be in that group (of 11, hence), but
17581                          * this was fixed in the next version */
17582
17583                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
17584                             goto warn_bad_digit_range;
17585                         }
17586                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
17587                                           &&     value <= 0x1D7FF))
17588                         {
17589                             /* This is the only other case currently in Unicode
17590                              * where the algorithm below fails.  The code
17591                              * points just above are the end points of a single
17592                              * range containing only decimal digits.  It is 5
17593                              * different series of 0-9.  All other ranges of
17594                              * digits currently in Unicode are just a single
17595                              * series.  (And mktables will notify us if a later
17596                              * Unicode version breaks this.)
17597                              *
17598                              * If the range being checked is at most 9 long,
17599                              * and the digit values represented are in
17600                              * numerical order, they are from the same series.
17601                              * */
17602                             if (         value - prevvalue > 9
17603                                 ||    (((    value - 0x1D7CE) % 10)
17604                                      <= (prevvalue - 0x1D7CE) % 10))
17605                             {
17606                                 goto warn_bad_digit_range;
17607                             }
17608                         }
17609                         else {
17610
17611                             /* For all other ranges of digits in Unicode, the
17612                              * algorithm is just to check if both end points
17613                              * are in the same series, which is the same range.
17614                              * */
17615                             index_start = _invlist_search(
17616                                                     PL_XPosix_ptrs[_CC_DIGIT],
17617                                                     prevvalue);
17618
17619                             /* Warn if the range starts and ends with a digit,
17620                              * and they are not in the same group of 10. */
17621                             if (   index_start >= 0
17622                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
17623                                 && (index_final =
17624                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
17625                                                     value)) != index_start
17626                                 && index_final >= 0
17627                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
17628                             {
17629                               warn_bad_digit_range:
17630                                 vWARN(RExC_parse, "Ranges of digits should be"
17631                                                   " from the same group of"
17632                                                   " 10");
17633                             }
17634                         }
17635                     }
17636                 }
17637             }
17638             if ((! range || prevvalue == value) && non_portable_endpoint) {
17639                 if (isPRINT_A(value)) {
17640                     char literal[3];
17641                     unsigned d = 0;
17642                     if (isBACKSLASHED_PUNCT(value)) {
17643                         literal[d++] = '\\';
17644                     }
17645                     literal[d++] = (char) value;
17646                     literal[d++] = '\0';
17647
17648                     vWARN4(RExC_parse,
17649                            "\"%.*s\" is more clearly written simply as \"%s\"",
17650                            (int) (RExC_parse - rangebegin),
17651                            rangebegin,
17652                            literal
17653                         );
17654                 }
17655                 else if isMNEMONIC_CNTRL(value) {
17656                     vWARN4(RExC_parse,
17657                            "\"%.*s\" is more clearly written simply as \"%s\"",
17658                            (int) (RExC_parse - rangebegin),
17659                            rangebegin,
17660                            cntrl_to_mnemonic((U8) value)
17661                         );
17662                 }
17663             }
17664         }
17665
17666         /* Deal with this element of the class */
17667
17668 #ifndef EBCDIC
17669         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17670                                                     prevvalue, value);
17671 #else
17672         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
17673          * that don't require special handling, we can just add the range like
17674          * we do for ASCII platforms */
17675         if ((UNLIKELY(prevvalue == 0) && value >= 255)
17676             || ! (prevvalue < 256
17677                     && (unicode_range
17678                         || (! non_portable_endpoint
17679                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
17680                                 || (isUPPER_A(prevvalue)
17681                                     && isUPPER_A(value)))))))
17682         {
17683             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17684                                                         prevvalue, value);
17685         }
17686         else {
17687             /* Here, requires special handling.  This can be because it is a
17688              * range whose code points are considered to be Unicode, and so
17689              * must be individually translated into native, or because its a
17690              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
17691              * EBCDIC, but we have defined them to include only the "expected"
17692              * upper or lower case ASCII alphabetics.  Subranges above 255 are
17693              * the same in native and Unicode, so can be added as a range */
17694             U8 start = NATIVE_TO_LATIN1(prevvalue);
17695             unsigned j;
17696             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
17697             for (j = start; j <= end; j++) {
17698                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
17699             }
17700             if (value > 255) {
17701                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17702                                                             256, value);
17703             }
17704         }
17705 #endif
17706
17707         range = 0; /* this range (if it was one) is done now */
17708     } /* End of loop through all the text within the brackets */
17709
17710     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
17711         output_posix_warnings(pRExC_state, posix_warnings);
17712     }
17713
17714     /* If anything in the class expands to more than one character, we have to
17715      * deal with them by building up a substitute parse string, and recursively
17716      * calling reg() on it, instead of proceeding */
17717     if (multi_char_matches) {
17718         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
17719         I32 cp_count;
17720         STRLEN len;
17721         char *save_end = RExC_end;
17722         char *save_parse = RExC_parse;
17723         char *save_start = RExC_start;
17724         Size_t constructed_prefix_len = 0; /* This gives the length of the
17725                                               constructed portion of the
17726                                               substitute parse. */
17727         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
17728                                        a "|" */
17729         I32 reg_flags;
17730
17731         assert(! invert);
17732         /* Only one level of recursion allowed */
17733         assert(RExC_copy_start_in_constructed == RExC_precomp);
17734
17735 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
17736            because too confusing */
17737         if (invert) {
17738             sv_catpvs(substitute_parse, "(?:");
17739         }
17740 #endif
17741
17742         /* Look at the longest folds first */
17743         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
17744                         cp_count > 0;
17745                         cp_count--)
17746         {
17747
17748             if (av_exists(multi_char_matches, cp_count)) {
17749                 AV** this_array_ptr;
17750                 SV* this_sequence;
17751
17752                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
17753                                                  cp_count, FALSE);
17754                 while ((this_sequence = av_pop(*this_array_ptr)) !=
17755                                                                 &PL_sv_undef)
17756                 {
17757                     if (! first_time) {
17758                         sv_catpvs(substitute_parse, "|");
17759                     }
17760                     first_time = FALSE;
17761
17762                     sv_catpv(substitute_parse, SvPVX(this_sequence));
17763                 }
17764             }
17765         }
17766
17767         /* If the character class contains anything else besides these
17768          * multi-character folds, have to include it in recursive parsing */
17769         if (element_count) {
17770             sv_catpvs(substitute_parse, "|[");
17771             constructed_prefix_len = SvCUR(substitute_parse);
17772             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
17773
17774             /* Put in a closing ']' only if not going off the end, as otherwise
17775              * we are adding something that really isn't there */
17776             if (RExC_parse < RExC_end) {
17777                 sv_catpvs(substitute_parse, "]");
17778             }
17779         }
17780
17781         sv_catpvs(substitute_parse, ")");
17782 #if 0
17783         if (invert) {
17784             /* This is a way to get the parse to skip forward a whole named
17785              * sequence instead of matching the 2nd character when it fails the
17786              * first */
17787             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
17788         }
17789 #endif
17790
17791         /* Set up the data structure so that any errors will be properly
17792          * reported.  See the comments at the definition of
17793          * REPORT_LOCATION_ARGS for details */
17794         RExC_copy_start_in_input = (char *) orig_parse;
17795         RExC_start = RExC_parse = SvPV(substitute_parse, len);
17796         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
17797         RExC_end = RExC_parse + len;
17798         RExC_in_multi_char_class = 1;
17799
17800         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
17801
17802         *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
17803
17804         /* And restore so can parse the rest of the pattern */
17805         RExC_parse = save_parse;
17806         RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
17807         RExC_end = save_end;
17808         RExC_in_multi_char_class = 0;
17809         SvREFCNT_dec_NN(multi_char_matches);
17810         return ret;
17811     }
17812
17813     /* If folding, we calculate all characters that could fold to or from the
17814      * ones already on the list */
17815     if (cp_foldable_list) {
17816         if (FOLD) {
17817             UV start, end;      /* End points of code point ranges */
17818
17819             SV* fold_intersection = NULL;
17820             SV** use_list;
17821
17822             /* Our calculated list will be for Unicode rules.  For locale
17823              * matching, we have to keep a separate list that is consulted at
17824              * runtime only when the locale indicates Unicode rules.  For
17825              * non-locale, we just use the general list */
17826             if (LOC) {
17827                 use_list = &only_utf8_locale_list;
17828             }
17829             else {
17830                 use_list = &cp_list;
17831             }
17832
17833             /* Only the characters in this class that participate in folds need
17834              * be checked.  Get the intersection of this class and all the
17835              * possible characters that are foldable.  This can quickly narrow
17836              * down a large class */
17837             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
17838                                   &fold_intersection);
17839
17840             /* Now look at the foldable characters in this class individually */
17841             invlist_iterinit(fold_intersection);
17842             while (invlist_iternext(fold_intersection, &start, &end)) {
17843                 UV j;
17844                 UV folded;
17845
17846                 /* Look at every character in the range */
17847                 for (j = start; j <= end; j++) {
17848                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17849                     STRLEN foldlen;
17850                     unsigned int k;
17851                     Size_t folds_count;
17852                     unsigned int first_fold;
17853                     const unsigned int * remaining_folds;
17854
17855                     if (j < 256) {
17856
17857                         if (IS_IN_SOME_FOLD_L1(j)) {
17858
17859                             /* ASCII is always matched; non-ASCII is matched
17860                              * only under Unicode rules (which could happen
17861                              * under /l if the locale is a UTF-8 one */
17862                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
17863                                 *use_list = add_cp_to_invlist(*use_list,
17864                                                             PL_fold_latin1[j]);
17865                             }
17866                             else if (j != PL_fold_latin1[j]) {
17867                                 upper_latin1_only_utf8_matches
17868                                         = add_cp_to_invlist(
17869                                                 upper_latin1_only_utf8_matches,
17870                                                 PL_fold_latin1[j]);
17871                             }
17872                         }
17873
17874                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
17875                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
17876                         {
17877                             add_above_Latin1_folds(pRExC_state,
17878                                                    (U8) j,
17879                                                    use_list);
17880                         }
17881                         continue;
17882                     }
17883
17884                     /* Here is an above Latin1 character.  We don't have the
17885                      * rules hard-coded for it.  First, get its fold.  This is
17886                      * the simple fold, as the multi-character folds have been
17887                      * handled earlier and separated out */
17888                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
17889                                                         (ASCII_FOLD_RESTRICTED)
17890                                                         ? FOLD_FLAGS_NOMIX_ASCII
17891                                                         : 0);
17892
17893                     /* Single character fold of above Latin1.  Add everything
17894                      * in its fold closure to the list that this node should
17895                      * match. */
17896                     folds_count = _inverse_folds(folded, &first_fold,
17897                                                     &remaining_folds);
17898                     for (k = 0; k <= folds_count; k++) {
17899                         UV c = (k == 0)     /* First time through use itself */
17900                                 ? folded
17901                                 : (k == 1)  /* 2nd time use, the first fold */
17902                                    ? first_fold
17903
17904                                      /* Then the remaining ones */
17905                                    : remaining_folds[k-2];
17906
17907                         /* /aa doesn't allow folds between ASCII and non- */
17908                         if ((   ASCII_FOLD_RESTRICTED
17909                             && (isASCII(c) != isASCII(j))))
17910                         {
17911                             continue;
17912                         }
17913
17914                         /* Folds under /l which cross the 255/256 boundary are
17915                          * added to a separate list.  (These are valid only
17916                          * when the locale is UTF-8.) */
17917                         if (c < 256 && LOC) {
17918                             *use_list = add_cp_to_invlist(*use_list, c);
17919                             continue;
17920                         }
17921
17922                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
17923                         {
17924                             cp_list = add_cp_to_invlist(cp_list, c);
17925                         }
17926                         else {
17927                             /* Similarly folds involving non-ascii Latin1
17928                              * characters under /d are added to their list */
17929                             upper_latin1_only_utf8_matches
17930                                     = add_cp_to_invlist(
17931                                                 upper_latin1_only_utf8_matches,
17932                                                 c);
17933                         }
17934                     }
17935                 }
17936             }
17937             SvREFCNT_dec_NN(fold_intersection);
17938         }
17939
17940         /* Now that we have finished adding all the folds, there is no reason
17941          * to keep the foldable list separate */
17942         _invlist_union(cp_list, cp_foldable_list, &cp_list);
17943         SvREFCNT_dec_NN(cp_foldable_list);
17944     }
17945
17946     /* And combine the result (if any) with any inversion lists from posix
17947      * classes.  The lists are kept separate up to now because we don't want to
17948      * fold the classes (folding of those is automatically handled by the swash
17949      * fetching code) */
17950     if (simple_posixes) {   /* These are the classes known to be unaffected by
17951                                /a, /aa, and /d */
17952         if (cp_list) {
17953             _invlist_union(cp_list, simple_posixes, &cp_list);
17954             SvREFCNT_dec_NN(simple_posixes);
17955         }
17956         else {
17957             cp_list = simple_posixes;
17958         }
17959     }
17960     if (posixes || nposixes) {
17961         if (! DEPENDS_SEMANTICS) {
17962
17963             /* For everything but /d, we can just add the current 'posixes' and
17964              * 'nposixes' to the main list */
17965             if (posixes) {
17966                 if (cp_list) {
17967                     _invlist_union(cp_list, posixes, &cp_list);
17968                     SvREFCNT_dec_NN(posixes);
17969                 }
17970                 else {
17971                     cp_list = posixes;
17972                 }
17973             }
17974             if (nposixes) {
17975                 if (cp_list) {
17976                     _invlist_union(cp_list, nposixes, &cp_list);
17977                     SvREFCNT_dec_NN(nposixes);
17978                 }
17979                 else {
17980                     cp_list = nposixes;
17981                 }
17982             }
17983         }
17984         else {
17985             /* Under /d, things like \w match upper Latin1 characters only if
17986              * the target string is in UTF-8.  But things like \W match all the
17987              * upper Latin1 characters if the target string is not in UTF-8.
17988              *
17989              * Handle the case with something like \W separately */
17990             if (nposixes) {
17991                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
17992
17993                 /* A complemented posix class matches all upper Latin1
17994                  * characters if not in UTF-8.  And it matches just certain
17995                  * ones when in UTF-8.  That means those certain ones are
17996                  * matched regardless, so can just be added to the
17997                  * unconditional list */
17998                 if (cp_list) {
17999                     _invlist_union(cp_list, nposixes, &cp_list);
18000                     SvREFCNT_dec_NN(nposixes);
18001                     nposixes = NULL;
18002                 }
18003                 else {
18004                     cp_list = nposixes;
18005                 }
18006
18007                 /* Likewise for 'posixes' */
18008                 _invlist_union(posixes, cp_list, &cp_list);
18009
18010                 /* Likewise for anything else in the range that matched only
18011                  * under UTF-8 */
18012                 if (upper_latin1_only_utf8_matches) {
18013                     _invlist_union(cp_list,
18014                                    upper_latin1_only_utf8_matches,
18015                                    &cp_list);
18016                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18017                     upper_latin1_only_utf8_matches = NULL;
18018                 }
18019
18020                 /* If we don't match all the upper Latin1 characters regardless
18021                  * of UTF-8ness, we have to set a flag to match the rest when
18022                  * not in UTF-8 */
18023                 _invlist_subtract(only_non_utf8_list, cp_list,
18024                                   &only_non_utf8_list);
18025                 if (_invlist_len(only_non_utf8_list) != 0) {
18026                     anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18027                 }
18028                 SvREFCNT_dec_NN(only_non_utf8_list);
18029             }
18030             else {
18031                 /* Here there were no complemented posix classes.  That means
18032                  * the upper Latin1 characters in 'posixes' match only when the
18033                  * target string is in UTF-8.  So we have to add them to the
18034                  * list of those types of code points, while adding the
18035                  * remainder to the unconditional list.
18036                  *
18037                  * First calculate what they are */
18038                 SV* nonascii_but_latin1_properties = NULL;
18039                 _invlist_intersection(posixes, PL_UpperLatin1,
18040                                       &nonascii_but_latin1_properties);
18041
18042                 /* And add them to the final list of such characters. */
18043                 _invlist_union(upper_latin1_only_utf8_matches,
18044                                nonascii_but_latin1_properties,
18045                                &upper_latin1_only_utf8_matches);
18046
18047                 /* Remove them from what now becomes the unconditional list */
18048                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18049                                   &posixes);
18050
18051                 /* And add those unconditional ones to the final list */
18052                 if (cp_list) {
18053                     _invlist_union(cp_list, posixes, &cp_list);
18054                     SvREFCNT_dec_NN(posixes);
18055                     posixes = NULL;
18056                 }
18057                 else {
18058                     cp_list = posixes;
18059                 }
18060
18061                 SvREFCNT_dec(nonascii_but_latin1_properties);
18062
18063                 /* Get rid of any characters from the conditional list that we
18064                  * now know are matched unconditionally, which may make that
18065                  * list empty */
18066                 _invlist_subtract(upper_latin1_only_utf8_matches,
18067                                   cp_list,
18068                                   &upper_latin1_only_utf8_matches);
18069                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18070                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18071                     upper_latin1_only_utf8_matches = NULL;
18072                 }
18073             }
18074         }
18075     }
18076
18077     /* And combine the result (if any) with any inversion list from properties.
18078      * The lists are kept separate up to now so that we can distinguish the two
18079      * in regards to matching above-Unicode.  A run-time warning is generated
18080      * if a Unicode property is matched against a non-Unicode code point. But,
18081      * we allow user-defined properties to match anything, without any warning,
18082      * and we also suppress the warning if there is a portion of the character
18083      * class that isn't a Unicode property, and which matches above Unicode, \W
18084      * or [\x{110000}] for example.
18085      * (Note that in this case, unlike the Posix one above, there is no
18086      * <upper_latin1_only_utf8_matches>, because having a Unicode property
18087      * forces Unicode semantics */
18088     if (properties) {
18089         if (cp_list) {
18090
18091             /* If it matters to the final outcome, see if a non-property
18092              * component of the class matches above Unicode.  If so, the
18093              * warning gets suppressed.  This is true even if just a single
18094              * such code point is specified, as, though not strictly correct if
18095              * another such code point is matched against, the fact that they
18096              * are using above-Unicode code points indicates they should know
18097              * the issues involved */
18098             if (warn_super) {
18099                 warn_super = ! (invert
18100                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18101             }
18102
18103             _invlist_union(properties, cp_list, &cp_list);
18104             SvREFCNT_dec_NN(properties);
18105         }
18106         else {
18107             cp_list = properties;
18108         }
18109
18110         if (warn_super) {
18111             anyof_flags
18112              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18113
18114             /* Because an ANYOF node is the only one that warns, this node
18115              * can't be optimized into something else */
18116             optimizable = FALSE;
18117         }
18118     }
18119
18120     /* Here, we have calculated what code points should be in the character
18121      * class.
18122      *
18123      * Now we can see about various optimizations.  Fold calculation (which we
18124      * did above) needs to take place before inversion.  Otherwise /[^k]/i
18125      * would invert to include K, which under /i would match k, which it
18126      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
18127      * folded until runtime */
18128
18129     /* If we didn't do folding, it's because some information isn't available
18130      * until runtime; set the run-time fold flag for these.  (We don't have to
18131      * worry about properties folding, as that is taken care of by the swash
18132      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
18133      * locales, or the class matches at least one 0-255 range code point */
18134     if (LOC && FOLD) {
18135
18136         /* Some things on the list might be unconditionally included because of
18137          * other components.  Remove them, and clean up the list if it goes to
18138          * 0 elements */
18139         if (only_utf8_locale_list && cp_list) {
18140             _invlist_subtract(only_utf8_locale_list, cp_list,
18141                               &only_utf8_locale_list);
18142
18143             if (_invlist_len(only_utf8_locale_list) == 0) {
18144                 SvREFCNT_dec_NN(only_utf8_locale_list);
18145                 only_utf8_locale_list = NULL;
18146             }
18147         }
18148         if (only_utf8_locale_list) {
18149             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18150             anyof_flags
18151                  |= ANYOFL_FOLD
18152                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18153         }
18154         else if (cp_list) { /* Look to see if a 0-255 code point is in list */
18155             UV start, end;
18156             invlist_iterinit(cp_list);
18157             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
18158                 anyof_flags |= ANYOFL_FOLD;
18159                 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18160             }
18161             invlist_iterfinish(cp_list);
18162         }
18163     }
18164     else if (   DEPENDS_SEMANTICS
18165              && (    upper_latin1_only_utf8_matches
18166                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18167     {
18168         RExC_seen_d_op = TRUE;
18169         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18170     }
18171
18172     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18173      * compile time. */
18174     if (     cp_list
18175         &&   invert
18176         && ! has_runtime_dependency)
18177     {
18178         _invlist_invert(cp_list);
18179
18180         /* Any swash can't be used as-is, because we've inverted things */
18181         if (swash) {
18182             SvREFCNT_dec_NN(swash);
18183             swash = NULL;
18184         }
18185
18186         invert = FALSE;
18187     }
18188
18189     if (ret_invlist) {
18190         *ret_invlist = cp_list;
18191         SvREFCNT_dec(swash);
18192
18193         return RExC_emit;
18194     }
18195
18196     /* All possible optimizations below still have these characteristics.
18197      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18198      * routine) */
18199     *flagp |= HASWIDTH|SIMPLE;
18200
18201     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18202         RExC_contains_locale = 1;
18203     }
18204
18205     /* Some character classes are equivalent to other nodes.  Such nodes take
18206      * up less room, and some nodes require fewer operations to execute, than
18207      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
18208      * improve efficiency. */
18209
18210     if (optimizable) {
18211         PERL_UINT_FAST8_T i;
18212         Size_t partial_cp_count = 0;
18213         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
18214         UV   end[MAX_FOLD_FROMS+1] = { 0 };
18215
18216         if (cp_list) { /* Count the code points in enough ranges that we would
18217                           see all the ones possible in any fold in this version
18218                           of Unicode */
18219
18220             invlist_iterinit(cp_list);
18221             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
18222                 if (invlist_iternext(cp_list, &start[i], &end[i])) {
18223                     partial_cp_count += end[i] - start[i] + 1;
18224                 }
18225             }
18226
18227             invlist_iterfinish(cp_list);
18228         }
18229
18230         /* If we know at compile time that this matches every possible code
18231          * point, any run-time dependencies don't matter */
18232         if (start[0] == 0 && end[0] == UV_MAX) {
18233             if (invert) {
18234                 ret = reganode(pRExC_state, OPFAIL, 0);
18235             }
18236             else {
18237                 ret = reg_node(pRExC_state, SANY);
18238                 MARK_NAUGHTY(1);
18239             }
18240             goto not_anyof;
18241         }
18242
18243         /* Similarly, for /l posix classes, if both a class and its
18244          * complement match, any run-time dependencies don't matter */
18245         if (posixl) {
18246             for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
18247                                                         namedclass += 2)
18248             {
18249                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
18250                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
18251                 {
18252                     if (invert) {
18253                         ret = reganode(pRExC_state, OPFAIL, 0);
18254                     }
18255                     else {
18256                         ret = reg_node(pRExC_state, SANY);
18257                         MARK_NAUGHTY(1);
18258                     }
18259                     goto not_anyof;
18260                 }
18261             }
18262             /* For well-behaved locales, some classes are subsets of others,
18263              * so complementing the subset and including the non-complemented
18264              * superset should match everything, like [\D[:alnum:]], and
18265              * [[:^alpha:][:alnum:]], but some implementations of locales are
18266              * buggy, and khw thinks its a bad idea to have optimization change
18267              * behavior, even if it avoids an OS bug in a given case */
18268
18269 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
18270
18271             /* If is a single posix /l class, can optimize to just that op.
18272              * Such a node will not match anything in the Latin1 range, as that
18273              * is not determinable until runtime, but will match whatever the
18274              * class does outside that range.  (Note that some classes won't
18275              * match anything outside the range, like [:ascii:]) */
18276             if (    isSINGLE_BIT_SET(posixl)
18277                 && (partial_cp_count == 0 || start[0] > 255))
18278             {
18279                 U8 classnum;
18280                 SV * class_above_latin1 = NULL;
18281                 bool already_inverted;
18282                 bool are_equivalent;
18283
18284                 /* Compute which bit is set, which is the same thing as, e.g.,
18285                  * ANYOF_CNTRL.  From
18286                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
18287                  * */
18288                 static const int MultiplyDeBruijnBitPosition2[32] =
18289                     {
18290                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
18291                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
18292                     };
18293
18294                 namedclass = MultiplyDeBruijnBitPosition2[(posixl
18295                                                           * 0x077CB531U) >> 27];
18296                 classnum = namedclass_to_classnum(namedclass);
18297
18298                 /* The named classes are such that the inverted number is one
18299                  * larger than the non-inverted one */
18300                 already_inverted = namedclass
18301                                  - classnum_to_namedclass(classnum);
18302
18303                 /* Create an inversion list of the official property, inverted
18304                  * if the constructed node list is inverted, and restricted to
18305                  * only the above latin1 code points, which are the only ones
18306                  * known at compile time */
18307                 _invlist_intersection_maybe_complement_2nd(
18308                                                     PL_AboveLatin1,
18309                                                     PL_XPosix_ptrs[classnum],
18310                                                     already_inverted,
18311                                                     &class_above_latin1);
18312                 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
18313                                                                         FALSE);
18314                 SvREFCNT_dec_NN(class_above_latin1);
18315
18316                 if (are_equivalent) {
18317
18318                     /* Resolve the run-time inversion flag with this possibly
18319                      * inverted class */
18320                     invert = invert ^ already_inverted;
18321
18322                     ret = reg_node(pRExC_state,
18323                                    POSIXL + invert * (NPOSIXL - POSIXL));
18324                     FLAGS(REGNODE_p(ret)) = classnum;
18325                     goto not_anyof;
18326                 }
18327             }
18328         }
18329
18330         /* khw can't think of any other possible transformation involving
18331          * these. */
18332         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
18333             goto is_anyof;
18334         }
18335
18336         if (! has_runtime_dependency) {
18337
18338             /* If the list is empty, nothing matches.  This happens, for
18339              * example, when a Unicode property that doesn't match anything is
18340              * the only element in the character class (perluniprops.pod notes
18341              * such properties). */
18342             if (partial_cp_count == 0) {
18343                 assert (! invert);
18344                 ret = reganode(pRExC_state, OPFAIL, 0);
18345                 goto not_anyof;
18346             }
18347
18348             /* If matches everything but \n */
18349             if (   start[0] == 0 && end[0] == '\n' - 1
18350                 && start[1] == '\n' + 1 && end[1] == UV_MAX)
18351             {
18352                 assert (! invert);
18353                 ret = reg_node(pRExC_state, REG_ANY);
18354                 MARK_NAUGHTY(1);
18355                 goto not_anyof;
18356             }
18357         }
18358
18359         /* Next see if can optimize classes that contain just a few code points
18360          * into an EXACTish node.  The reason to do this is to let the
18361          * optimizer join this node with adjacent EXACTish ones.
18362          *
18363          * An EXACTFish node can be generated even if not under /i, and vice
18364          * versa.  But care must be taken.  An EXACTFish node has to be such
18365          * that it only matches precisely the code points in the class, but we
18366          * want to generate the least restrictive one that does that, to
18367          * increase the odds of being able to join with an adjacent node.  For
18368          * example, if the class contains [kK], we have to make it an EXACTFAA
18369          * node to prevent the KELVIN SIGN from matching.  Whether we are under
18370          * /i or not is irrelevant in this case.  Less obvious is the pattern
18371          * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
18372          * supposed to match the single character U+0149 LATIN SMALL LETTER N
18373          * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
18374          * that includes \X{02BC}, there is a multi-char fold that does, and so
18375          * the node generated for it must be an EXACTFish one.  On the other
18376          * hand qr/:/i should generate a plain EXACT node since the colon
18377          * participates in no fold whatsoever, and having it EXACT tells the
18378          * optimizer the target string cannot match unless it has a colon in
18379          * it.
18380          *
18381          * We don't typically generate an EXACTish node if doing so would
18382          * require changing the pattern to UTF-8, as that affects /d and
18383          * otherwise is slower.  However, under /i, not changing to UTF-8 can
18384          * miss some potential multi-character folds.  We calculate the
18385          * EXACTish node, and then decide if something would be missed if we
18386          * don't upgrade */
18387         if (   ! posixl
18388             && ! invert
18389
18390                 /* Only try if there are no more code points in the class than
18391                  * in the max possible fold */
18392             &&   partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
18393
18394             && (start[0] < 256 || UTF || FOLD))
18395         {
18396             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
18397             {
18398                 /* We can always make a single code point class into an
18399                  * EXACTish node. */
18400
18401                 if (LOC) {
18402
18403                     /* Here is /l:  Use EXACTL, except /li indicates EXACTFL,
18404                      * as that means there is a fold not known until runtime so
18405                      * shows as only a single code point here. */
18406                     op = (FOLD) ? EXACTFL : EXACTL;
18407                 }
18408                 else if (! FOLD) { /* Not /l and not /i */
18409                     op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
18410                 }
18411                 else if (start[0] < 256) { /* /i, not /l, and the code point is
18412                                               small */
18413
18414                     /* Under /i, it gets a little tricky.  A code point that
18415                      * doesn't participate in a fold should be an EXACT node.
18416                      * We know this one isn't the result of a simple fold, or
18417                      * there'd be more than one code point in the list, but it
18418                      * could be part of a multi- character fold.  In that case
18419                      * we better not create an EXACT node, as we would wrongly
18420                      * be telling the optimizer that this code point must be in
18421                      * the target string, and that is wrong.  This is because
18422                      * if the sequence around this code point forms a
18423                      * multi-char fold, what needs to be in the string could be
18424                      * the code point that folds to the sequence.
18425                      *
18426                      * This handles the case of below-255 code points, as we
18427                      * have an easy look up for those.  The next clause handles
18428                      * the above-256 one */
18429                     op = IS_IN_SOME_FOLD_L1(start[0])
18430                          ? EXACTFU
18431                          : EXACT;
18432                 }
18433                 else {  /* /i, larger code point.  Since we are under /i, and
18434                            have just this code point, we know that it can't
18435                            fold to something else, so PL_InMultiCharFold
18436                            applies to it */
18437                     op = _invlist_contains_cp(PL_InMultiCharFold,
18438                                               start[0])
18439                          ? EXACTFU_ONLY8
18440                          : EXACT_ONLY8;
18441                 }
18442
18443                 value = start[0];
18444             }
18445             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
18446                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
18447             {
18448                 /* Here, the only runtime dependency, if any, is from /d, and
18449                  * the class matches more than one code point, and the lowest
18450                  * code point participates in some fold.  It might be that the
18451                  * other code points are /i equivalent to this one, and hence
18452                  * they would representable by an EXACTFish node.  Above, we
18453                  * eliminated classes that contain too many code points to be
18454                  * EXACTFish, with the test for MAX_FOLD_FROMS
18455                  *
18456                  * First, special case the ASCII fold pairs, like 'B' and 'b'.
18457                  * We do this because we have EXACTFAA at our disposal for the
18458                  * ASCII range */
18459                 if (partial_cp_count == 2 && isASCII(start[0])) {
18460
18461                     /* The only ASCII characters that participate in folds are
18462                      * alphabetics */
18463                     assert(isALPHA(start[0]));
18464                     if (   end[0] == start[0]   /* First range is a single
18465                                                    character, so 2nd exists */
18466                         && isALPHA_FOLD_EQ(start[0], start[1]))
18467                     {
18468
18469                         /* Here, is part of an ASCII fold pair */
18470
18471                         if (   ASCII_FOLD_RESTRICTED
18472                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
18473                         {
18474                             /* If the second clause just above was true, it
18475                              * means we can't be under /i, or else the list
18476                              * would have included more than this fold pair.
18477                              * Therefore we have to exclude the possibility of
18478                              * whatever else it is that folds to these, by
18479                              * using EXACTFAA */
18480                             op = EXACTFAA;
18481                         }
18482                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
18483
18484                             /* Here, there's no simple fold that start[0] is part
18485                              * of, but there is a multi-character one.  If we
18486                              * are not under /i, we want to exclude that
18487                              * possibility; if under /i, we want to include it
18488                              * */
18489                             op = (FOLD) ? EXACTFU : EXACTFAA;
18490                         }
18491                         else {
18492
18493                             /* Here, the only possible fold start[0] particpates in
18494                              * is with start[1].  /i or not isn't relevant */
18495                             op = EXACTFU;
18496                         }
18497
18498                         value = toFOLD(start[0]);
18499                     }
18500                 }
18501                 else if (  ! upper_latin1_only_utf8_matches
18502                          || (   _invlist_len(upper_latin1_only_utf8_matches)
18503                                                                           == 2
18504                              && PL_fold_latin1[
18505                                invlist_highest(upper_latin1_only_utf8_matches)]
18506                              == start[0]))
18507                 {
18508                     /* Here, the smallest character is non-ascii or there are
18509                      * more than 2 code points matched by this node.  Also, we
18510                      * either don't have /d UTF-8 dependent matches, or if we
18511                      * do, they look like they could be a single character that
18512                      * is the fold of the lowest one in the always-match list.
18513                      * This test quickly excludes most of the false positives
18514                      * when there are /d UTF-8 depdendent matches.  These are
18515                      * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
18516                      * SMALL LETTER A WITH GRAVE iff the target string is
18517                      * UTF-8.  (We don't have to worry above about exceeding
18518                      * the array bounds of PL_fold_latin1[] because any code
18519                      * point in 'upper_latin1_only_utf8_matches' is below 256.)
18520                      *
18521                      * EXACTFAA would apply only to pairs (hence exactly 2 code
18522                      * points) in the ASCII range, so we can't use it here to
18523                      * artificially restrict the fold domain, so we check if
18524                      * the class does or does not match some EXACTFish node.
18525                      * Further, if we aren't under /i, and and the folded-to
18526                      * character is part of a multi-character fold, we can't do
18527                      * this optimization, as the sequence around it could be
18528                      * that multi-character fold, and we don't here know the
18529                      * context, so we have to assume it is that multi-char
18530                      * fold, to prevent potential bugs.
18531                      *
18532                      * To do the general case, we first find the fold of the
18533                      * lowest code point (which may be higher than the lowest
18534                      * one), then find everything that folds to it.  (The data
18535                      * structure we have only maps from the folded code points,
18536                      * so we have to do the earlier step.) */
18537
18538                     Size_t foldlen;
18539                     U8 foldbuf[UTF8_MAXBYTES_CASE];
18540                     UV folded = _to_uni_fold_flags(start[0],
18541                                                         foldbuf, &foldlen, 0);
18542                     unsigned int first_fold;
18543                     const unsigned int * remaining_folds;
18544                     Size_t folds_to_this_cp_count = _inverse_folds(
18545                                                             folded,
18546                                                             &first_fold,
18547                                                             &remaining_folds);
18548                     Size_t folds_count = folds_to_this_cp_count + 1;
18549                     SV * fold_list = _new_invlist(folds_count);
18550                     unsigned int i;
18551
18552                     /* If there are UTF-8 dependent matches, create a temporary
18553                      * list of what this node matches, including them. */
18554                     SV * all_cp_list = NULL;
18555                     SV ** use_this_list = &cp_list;
18556
18557                     if (upper_latin1_only_utf8_matches) {
18558                         all_cp_list = _new_invlist(0);
18559                         use_this_list = &all_cp_list;
18560                         _invlist_union(cp_list,
18561                                        upper_latin1_only_utf8_matches,
18562                                        use_this_list);
18563                     }
18564
18565                     /* Having gotten everything that participates in the fold
18566                      * containing the lowest code point, we turn that into an
18567                      * inversion list, making sure everything is included. */
18568                     fold_list = add_cp_to_invlist(fold_list, start[0]);
18569                     fold_list = add_cp_to_invlist(fold_list, folded);
18570                     fold_list = add_cp_to_invlist(fold_list, first_fold);
18571                     for (i = 0; i < folds_to_this_cp_count - 1; i++) {
18572                         fold_list = add_cp_to_invlist(fold_list,
18573                                                         remaining_folds[i]);
18574                     }
18575
18576                     /* If the fold list is identical to what's in this ANYOF
18577                      * node, the node can be represented by an EXACTFish one
18578                      * instead */
18579                     if (_invlistEQ(*use_this_list, fold_list,
18580                                    0 /* Don't complement */ )
18581                     ) {
18582
18583                         /* But, we have to be careful, as mentioned above.
18584                          * Just the right sequence of characters could match
18585                          * this if it is part of a multi-character fold.  That
18586                          * IS what we want if we are under /i.  But it ISN'T
18587                          * what we want if not under /i, as it could match when
18588                          * it shouldn't.  So, when we aren't under /i and this
18589                          * character participates in a multi-char fold, we
18590                          * don't optimize into an EXACTFish node.  So, for each
18591                          * case below we have to check if we are folding
18592                          * and if not, if it is not part of a multi-char fold.
18593                          * */
18594                         if (start[0] > 255) {    /* Highish code point */
18595                             if (FOLD || ! _invlist_contains_cp(
18596                                             PL_InMultiCharFold, folded))
18597                             {
18598                                 op = (LOC)
18599                                      ? EXACTFLU8
18600                                      : (ASCII_FOLD_RESTRICTED)
18601                                        ? EXACTFAA
18602                                        : EXACTFU_ONLY8;
18603                                 value = folded;
18604                             }
18605                         }   /* Below, the lowest code point < 256 */
18606                         else if (    FOLD
18607                                  &&  folded == 's'
18608                                  &&  DEPENDS_SEMANTICS)
18609                         {   /* An EXACTF node containing a single character
18610                                 's', can be an EXACTFU if it doesn't get
18611                                 joined with an adjacent 's' */
18612                             op = EXACTFU_S_EDGE;
18613                             value = folded;
18614                         }
18615                         else if (    FOLD
18616                                 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
18617                         {
18618                             if (upper_latin1_only_utf8_matches) {
18619                                 op = EXACTF;
18620
18621                                 /* We can't use the fold, as that only matches
18622                                  * under UTF-8 */
18623                                 value = start[0];
18624                             }
18625                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
18626                                      && ! UTF)
18627                             {   /* EXACTFUP is a special node for this
18628                                    character */
18629                                 op = (ASCII_FOLD_RESTRICTED)
18630                                      ? EXACTFAA
18631                                      : EXACTFUP;
18632                                 value = MICRO_SIGN;
18633                             }
18634                             else if (     ASCII_FOLD_RESTRICTED
18635                                      && ! isASCII(start[0]))
18636                             {   /* For ASCII under /iaa, we can use EXACTFU
18637                                    below */
18638                                 op = EXACTFAA;
18639                                 value = folded;
18640                             }
18641                             else {
18642                                 op = EXACTFU;
18643                                 value = folded;
18644                             }
18645                         }
18646                     }
18647
18648                     SvREFCNT_dec_NN(fold_list);
18649                     SvREFCNT_dec(all_cp_list);
18650                 }
18651             }
18652
18653             if (op != END) {
18654
18655                 /* Here, we have calculated what EXACTish node we would use.
18656                  * But we don't use it if it would require converting the
18657                  * pattern to UTF-8, unless not using it could cause us to miss
18658                  * some folds (hence be buggy) */
18659
18660                 if (! UTF && value > 255) {
18661                     SV * in_multis = NULL;
18662
18663                     assert(FOLD);
18664
18665                     /* If there is no code point that is part of a multi-char
18666                      * fold, then there aren't any matches, so we don't do this
18667                      * optimization.  Otherwise, it could match depending on
18668                      * the context around us, so we do upgrade */
18669                     _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
18670                     if (UNLIKELY(_invlist_len(in_multis) != 0)) {
18671                         REQUIRE_UTF8(flagp);
18672                     }
18673                     else {
18674                         op = END;
18675                     }
18676                 }
18677
18678                 if (op != END) {
18679                     U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
18680
18681                     ret = regnode_guts(pRExC_state, op, len, "exact");
18682                     FILL_NODE(ret, op);
18683                     RExC_emit += 1 + STR_SZ(len);
18684                     STR_LEN(REGNODE_p(ret)) = len;
18685                     if (len == 1) {
18686                         *STRING(REGNODE_p(ret)) = value;
18687                     }
18688                     else {
18689                         uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
18690                     }
18691                     goto not_anyof;
18692                 }
18693             }
18694         }
18695
18696         if (! has_runtime_dependency) {
18697
18698             /* See if this can be turned into an ANYOFM node.  Think about the
18699              * bit patterns in two different bytes.  In some positions, the
18700              * bits in each will be 1; and in other positions both will be 0;
18701              * and in some positions the bit will be 1 in one byte, and 0 in
18702              * the other.  Let 'n' be the number of positions where the bits
18703              * differ.  We create a mask which has exactly 'n' 0 bits, each in
18704              * a position where the two bytes differ.  Now take the set of all
18705              * bytes that when ANDed with the mask yield the same result.  That
18706              * set has 2**n elements, and is representable by just two 8 bit
18707              * numbers: the result and the mask.  Importantly, matching the set
18708              * can be vectorized by creating a word full of the result bytes,
18709              * and a word full of the mask bytes, yielding a significant speed
18710              * up.  Here, see if this node matches such a set.  As a concrete
18711              * example consider [01], and the byte representing '0' which is
18712              * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
18713              * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
18714              * 0x30.  Any other bytes ANDed yield something else.  So [01],
18715              * which is a common usage, is optimizable into ANYOFM, and can
18716              * benefit from the speed up.  We can only do this on UTF-8
18717              * invariant bytes, because they have the same bit patterns under
18718              * UTF-8 as not. */
18719             PERL_UINT_FAST8_T inverted = 0;
18720 #ifdef EBCDIC
18721             const PERL_UINT_FAST8_T max_permissible = 0xFF;
18722 #else
18723             const PERL_UINT_FAST8_T max_permissible = 0x7F;
18724 #endif
18725             /* If doesn't fit the criteria for ANYOFM, invert and try again.
18726              * If that works we will instead later generate an NANYOFM, and
18727              * invert back when through */
18728             if (invlist_highest(cp_list) > max_permissible) {
18729                 _invlist_invert(cp_list);
18730                 inverted = 1;
18731             }
18732
18733             if (invlist_highest(cp_list) <= max_permissible) {
18734                 UV this_start, this_end;
18735                 UV lowest_cp = UV_MAX;  /* inited to suppress compiler warn */
18736                 U8 bits_differing = 0;
18737                 Size_t full_cp_count = 0;
18738                 bool first_time = TRUE;
18739
18740                 /* Go through the bytes and find the bit positions that differ
18741                  * */
18742                 invlist_iterinit(cp_list);
18743                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
18744                     unsigned int i = this_start;
18745
18746                     if (first_time) {
18747                         if (! UVCHR_IS_INVARIANT(i)) {
18748                             goto done_anyofm;
18749                         }
18750
18751                         first_time = FALSE;
18752                         lowest_cp = this_start;
18753
18754                         /* We have set up the code point to compare with.
18755                          * Don't compare it with itself */
18756                         i++;
18757                     }
18758
18759                     /* Find the bit positions that differ from the lowest code
18760                      * point in the node.  Keep track of all such positions by
18761                      * OR'ing */
18762                     for (; i <= this_end; i++) {
18763                         if (! UVCHR_IS_INVARIANT(i)) {
18764                             goto done_anyofm;
18765                         }
18766
18767                         bits_differing  |= i ^ lowest_cp;
18768                     }
18769
18770                     full_cp_count += this_end - this_start + 1;
18771                 }
18772                 invlist_iterfinish(cp_list);
18773
18774                 /* At the end of the loop, we count how many bits differ from
18775                  * the bits in lowest code point, call the count 'd'.  If the
18776                  * set we found contains 2**d elements, it is the closure of
18777                  * all code points that differ only in those bit positions.  To
18778                  * convince yourself of that, first note that the number in the
18779                  * closure must be a power of 2, which we test for.  The only
18780                  * way we could have that count and it be some differing set,
18781                  * is if we got some code points that don't differ from the
18782                  * lowest code point in any position, but do differ from each
18783                  * other in some other position.  That means one code point has
18784                  * a 1 in that position, and another has a 0.  But that would
18785                  * mean that one of them differs from the lowest code point in
18786                  * that position, which possibility we've already excluded.  */
18787                 if (  (inverted || full_cp_count > 1)
18788                     && full_cp_count == 1U << PL_bitcount[bits_differing])
18789                 {
18790                     U8 ANYOFM_mask;
18791
18792                     op = ANYOFM + inverted;;
18793
18794                     /* We need to make the bits that differ be 0's */
18795                     ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
18796
18797                     /* The argument is the lowest code point */
18798                     ret = reganode(pRExC_state, op, lowest_cp);
18799                     FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
18800                 }
18801             }
18802           done_anyofm:
18803
18804             if (inverted) {
18805                 _invlist_invert(cp_list);
18806             }
18807
18808             if (op != END) {
18809                 goto not_anyof;
18810             }
18811         }
18812
18813         if (! posixl) {
18814             PERL_UINT_FAST8_T type;
18815             SV * intersection = NULL;
18816             SV* d_invlist = NULL;
18817
18818             /* See if this matches any of the POSIX classes.  The POSIXA and
18819              * POSIXD ones are about the same speed as ANYOF ops, but take less
18820              * room; the ones that have above-Latin1 code point matches are
18821              * somewhat faster than ANYOF.  */
18822
18823             for (type = POSIXA; type >= POSIXD; type--) {
18824                 int posix_class;
18825
18826                 if (type == POSIXL) {   /* But not /l posix classes */
18827                     continue;
18828                 }
18829
18830                 for (posix_class = 0;
18831                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
18832                      posix_class++)
18833                 {
18834                     SV** our_code_points = &cp_list;
18835                     SV** official_code_points;
18836                     int try_inverted;
18837
18838                     if (type == POSIXA) {
18839                         official_code_points = &PL_Posix_ptrs[posix_class];
18840                     }
18841                     else {
18842                         official_code_points = &PL_XPosix_ptrs[posix_class];
18843                     }
18844
18845                     /* Skip non-existent classes of this type.  e.g. \v only
18846                      * has an entry in PL_XPosix_ptrs */
18847                     if (! *official_code_points) {
18848                         continue;
18849                     }
18850
18851                     /* Try both the regular class, and its inversion */
18852                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
18853                         bool this_inverted = invert ^ try_inverted;
18854
18855                         if (type != POSIXD) {
18856
18857                             /* This class that isn't /d can't match if we have
18858                              * /d dependencies */
18859                             if (has_runtime_dependency
18860                                                     & HAS_D_RUNTIME_DEPENDENCY)
18861                             {
18862                                 continue;
18863                             }
18864                         }
18865                         else /* is /d */ if (! this_inverted) {
18866
18867                             /* /d classes don't match anything non-ASCII below
18868                              * 256 unconditionally (which cp_list contains) */
18869                             _invlist_intersection(cp_list, PL_UpperLatin1,
18870                                                            &intersection);
18871                             if (_invlist_len(intersection) != 0) {
18872                                 continue;
18873                             }
18874
18875                             SvREFCNT_dec(d_invlist);
18876                             d_invlist = invlist_clone(cp_list, NULL);
18877
18878                             /* But under UTF-8 it turns into using /u rules.
18879                              * Add the things it matches under these conditions
18880                              * so that we check below that these are identical
18881                              * to what the tested class should match */
18882                             if (upper_latin1_only_utf8_matches) {
18883                                 _invlist_union(
18884                                             d_invlist,
18885                                             upper_latin1_only_utf8_matches,
18886                                             &d_invlist);
18887                             }
18888                             our_code_points = &d_invlist;
18889                         }
18890                         else {  /* POSIXD, inverted.  If this doesn't have this
18891                                    flag set, it isn't /d. */
18892                             if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
18893                             {
18894                                 continue;
18895                             }
18896                             our_code_points = &cp_list;
18897                         }
18898
18899                         /* Here, have weeded out some things.  We want to see
18900                          * if the list of characters this node contains
18901                          * ('*our_code_points') precisely matches those of the
18902                          * class we are currently checking against
18903                          * ('*official_code_points'). */
18904                         if (_invlistEQ(*our_code_points,
18905                                        *official_code_points,
18906                                        try_inverted))
18907                         {
18908                             /* Here, they precisely match.  Optimize this ANYOF
18909                              * node into its equivalent POSIX one of the
18910                              * correct type, possibly inverted */
18911                             ret = reg_node(pRExC_state, (try_inverted)
18912                                                         ? type + NPOSIXA
18913                                                                 - POSIXA
18914                                                         : type);
18915                             FLAGS(REGNODE_p(ret)) = posix_class;
18916                             SvREFCNT_dec(d_invlist);
18917                             SvREFCNT_dec(intersection);
18918                             goto not_anyof;
18919                         }
18920                     }
18921                 }
18922             }
18923             SvREFCNT_dec(d_invlist);
18924             SvREFCNT_dec(intersection);
18925         }
18926
18927         /* If didn't find an optimization and there is no need for a
18928         * bitmap, optimize to indicate that */
18929         if (     start[0] >= NUM_ANYOF_CODE_POINTS
18930             && ! LOC
18931             && ! upper_latin1_only_utf8_matches)
18932         {
18933             op = ANYOFH;
18934         }
18935     }   /* End of seeing if can optimize it into a different node */
18936
18937   is_anyof: /* It's going to be an ANYOF node. */
18938     if (op != ANYOFH) {
18939         op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
18940              ? ANYOFD
18941              : ((posixl)
18942                 ? ANYOFPOSIXL
18943                 : ((LOC)
18944                    ? ANYOFL
18945                    : ANYOF));
18946     }
18947
18948     ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
18949     FILL_NODE(ret, op);        /* We set the argument later */
18950     RExC_emit += 1 + regarglen[op];
18951     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
18952
18953     /* Here, <cp_list> contains all the code points we can determine at
18954      * compile time that match under all conditions.  Go through it, and
18955      * for things that belong in the bitmap, put them there, and delete from
18956      * <cp_list>.  While we are at it, see if everything above 255 is in the
18957      * list, and if so, set a flag to speed up execution */
18958
18959     populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
18960
18961     if (posixl) {
18962         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
18963     }
18964
18965     if (invert) {
18966         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
18967     }
18968
18969     /* Here, the bitmap has been populated with all the Latin1 code points that
18970      * always match.  Can now add to the overall list those that match only
18971      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
18972      * */
18973     if (upper_latin1_only_utf8_matches) {
18974         if (cp_list) {
18975             _invlist_union(cp_list,
18976                            upper_latin1_only_utf8_matches,
18977                            &cp_list);
18978             SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18979         }
18980         else {
18981             cp_list = upper_latin1_only_utf8_matches;
18982         }
18983         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
18984     }
18985
18986     /* If there is a swash and more than one element, we can't use the swash in
18987      * the optimization below. */
18988     if (swash && element_count > 1) {
18989         SvREFCNT_dec_NN(swash);
18990         swash = NULL;
18991     }
18992
18993     /* Note that the optimization of using 'swash' if it is the only thing in
18994      * the class doesn't have us change swash at all, so it can include things
18995      * that are also in the bitmap; otherwise we have purposely deleted that
18996      * duplicate information */
18997     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
18998                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
18999                    ? listsv : NULL,
19000                   only_utf8_locale_list,
19001                   swash, cBOOL(has_runtime_dependency
19002                                                 & HAS_USER_DEFINED_PROPERTY));
19003     return ret;
19004
19005   not_anyof:
19006
19007     /* Here, the node is getting optimized into something that's not an ANYOF
19008      * one.  Finish up. */
19009
19010     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19011                                            RExC_parse - orig_parse);;
19012     SvREFCNT_dec(cp_list);;
19013     return ret;
19014 }
19015
19016 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19017
19018 STATIC void
19019 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19020                 regnode* const node,
19021                 SV* const cp_list,
19022                 SV* const runtime_defns,
19023                 SV* const only_utf8_locale_list,
19024                 SV* const swash,
19025                 const bool has_user_defined_property)
19026 {
19027     /* Sets the arg field of an ANYOF-type node 'node', using information about
19028      * the node passed-in.  If there is nothing outside the node's bitmap, the
19029      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
19030      * the count returned by add_data(), having allocated and stored an array,
19031      * av, that that count references, as follows:
19032      *  av[0] stores the character class description in its textual form.
19033      *        This is used later (regexec.c:Perl_regclass_swash()) to
19034      *        initialize the appropriate swash, and is also useful for dumping
19035      *        the regnode.  This is set to &PL_sv_undef if the textual
19036      *        description is not needed at run-time (as happens if the other
19037      *        elements completely define the class)
19038      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
19039      *        computed from av[0].  But if no further computation need be done,
19040      *        the swash is stored here now (and av[0] is &PL_sv_undef).
19041      *  av[2] stores the inversion list of code points that match only if the
19042      *        current locale is UTF-8
19043      *  av[3] stores the cp_list inversion list for use in addition or instead
19044      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
19045      *        (Otherwise everything needed is already in av[0] and av[1])
19046      *  av[4] is set if any component of the class is from a user-defined
19047      *        property; used only if av[3] exists */
19048
19049     UV n;
19050
19051     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19052
19053     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19054         assert(! (ANYOF_FLAGS(node)
19055                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19056         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19057     }
19058     else {
19059         AV * const av = newAV();
19060         SV *rv;
19061
19062         av_store(av, 0, (runtime_defns)
19063                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
19064         if (swash) {
19065             assert(cp_list);
19066             av_store(av, 1, swash);
19067             SvREFCNT_dec_NN(cp_list);
19068         }
19069         else {
19070             av_store(av, 1, &PL_sv_undef);
19071             if (cp_list) {
19072                 av_store(av, 3, cp_list);
19073                 av_store(av, 4, newSVuv(has_user_defined_property));
19074             }
19075         }
19076
19077         if (only_utf8_locale_list) {
19078             av_store(av, 2, only_utf8_locale_list);
19079         }
19080         else {
19081             av_store(av, 2, &PL_sv_undef);
19082         }
19083
19084         rv = newRV_noinc(MUTABLE_SV(av));
19085         n = add_data(pRExC_state, STR_WITH_LEN("s"));
19086         RExC_rxi->data->data[n] = (void*)rv;
19087         ARG_SET(node, n);
19088     }
19089 }
19090
19091 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
19092 SV *
19093 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
19094                                         const regnode* node,
19095                                         bool doinit,
19096                                         SV** listsvp,
19097                                         SV** only_utf8_locale_ptr,
19098                                         SV** output_invlist)
19099
19100 {
19101     /* For internal core use only.
19102      * Returns the swash for the input 'node' in the regex 'prog'.
19103      * If <doinit> is 'true', will attempt to create the swash if not already
19104      *    done.
19105      * If <listsvp> is non-null, will return the printable contents of the
19106      *    swash.  This can be used to get debugging information even before the
19107      *    swash exists, by calling this function with 'doinit' set to false, in
19108      *    which case the components that will be used to eventually create the
19109      *    swash are returned  (in a printable form).
19110      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
19111      *    store an inversion list of code points that should match only if the
19112      *    execution-time locale is a UTF-8 one.
19113      * If <output_invlist> is not NULL, it is where this routine is to store an
19114      *    inversion list of the code points that would be instead returned in
19115      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
19116      *    when this parameter is used, is just the non-code point data that
19117      *    will go into creating the swash.  This currently should be just
19118      *    user-defined properties whose definitions were not known at compile
19119      *    time.  Using this parameter allows for easier manipulation of the
19120      *    swash's data by the caller.  It is illegal to call this function with
19121      *    this parameter set, but not <listsvp>
19122      *
19123      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
19124      * that, in spite of this function's name, the swash it returns may include
19125      * the bitmap data as well */
19126
19127     SV *sw  = NULL;
19128     SV *si  = NULL;         /* Input swash initialization string */
19129     SV* invlist = NULL;
19130
19131     RXi_GET_DECL(prog, progi);
19132     const struct reg_data * const data = prog ? progi->data : NULL;
19133
19134     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
19135     assert(! output_invlist || listsvp);
19136
19137     if (data && data->count) {
19138         const U32 n = ARG(node);
19139
19140         if (data->what[n] == 's') {
19141             SV * const rv = MUTABLE_SV(data->data[n]);
19142             AV * const av = MUTABLE_AV(SvRV(rv));
19143             SV **const ary = AvARRAY(av);
19144             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
19145
19146             si = *ary;  /* ary[0] = the string to initialize the swash with */
19147
19148             if (av_tindex_skip_len_mg(av) >= 2) {
19149                 if (only_utf8_locale_ptr
19150                     && ary[2]
19151                     && ary[2] != &PL_sv_undef)
19152                 {
19153                     *only_utf8_locale_ptr = ary[2];
19154                 }
19155                 else {
19156                     assert(only_utf8_locale_ptr);
19157                     *only_utf8_locale_ptr = NULL;
19158                 }
19159
19160                 /* Elements 3 and 4 are either both present or both absent. [3]
19161                  * is any inversion list generated at compile time; [4]
19162                  * indicates if that inversion list has any user-defined
19163                  * properties in it. */
19164                 if (av_tindex_skip_len_mg(av) >= 3) {
19165                     invlist = ary[3];
19166                     if (SvUV(ary[4])) {
19167                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
19168                     }
19169                 }
19170                 else {
19171                     invlist = NULL;
19172                 }
19173             }
19174
19175             /* Element [1] is reserved for the set-up swash.  If already there,
19176              * return it; if not, create it and store it there */
19177             if (ary[1] && SvROK(ary[1])) {
19178                 sw = ary[1];
19179             }
19180             else if (doinit && ((si && si != &PL_sv_undef)
19181                                  || (invlist && invlist != &PL_sv_undef))) {
19182                 assert(si);
19183                 sw = _core_swash_init("utf8", /* the utf8 package */
19184                                       "", /* nameless */
19185                                       si,
19186                                       1, /* binary */
19187                                       0, /* not from tr/// */
19188                                       invlist,
19189                                       &swash_init_flags);
19190                 (void)av_store(av, 1, sw);
19191             }
19192         }
19193     }
19194
19195     /* If requested, return a printable version of what this swash matches */
19196     if (listsvp) {
19197         SV* matches_string = NULL;
19198
19199         /* The swash should be used, if possible, to get the data, as it
19200          * contains the resolved data.  But this function can be called at
19201          * compile-time, before everything gets resolved, in which case we
19202          * return the currently best available information, which is the string
19203          * that will eventually be used to do that resolving, 'si' */
19204         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
19205             && (si && si != &PL_sv_undef))
19206         {
19207             /* Here, we only have 'si' (and possibly some passed-in data in
19208              * 'invlist', which is handled below)  If the caller only wants
19209              * 'si', use that.  */
19210             if (! output_invlist) {
19211                 matches_string = newSVsv(si);
19212             }
19213             else {
19214                 /* But if the caller wants an inversion list of the node, we
19215                  * need to parse 'si' and place as much as possible in the
19216                  * desired output inversion list, making 'matches_string' only
19217                  * contain the currently unresolvable things */
19218                 const char *si_string = SvPVX(si);
19219                 STRLEN remaining = SvCUR(si);
19220                 UV prev_cp = 0;
19221                 U8 count = 0;
19222
19223                 /* Ignore everything before the first new-line */
19224                 while (*si_string != '\n' && remaining > 0) {
19225                     si_string++;
19226                     remaining--;
19227                 }
19228                 assert(remaining > 0);
19229
19230                 si_string++;
19231                 remaining--;
19232
19233                 while (remaining > 0) {
19234
19235                     /* The data consists of just strings defining user-defined
19236                      * property names, but in prior incarnations, and perhaps
19237                      * somehow from pluggable regex engines, it could still
19238                      * hold hex code point definitions.  Each component of a
19239                      * range would be separated by a tab, and each range by a
19240                      * new-line.  If these are found, instead add them to the
19241                      * inversion list */
19242                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
19243                                      |PERL_SCAN_SILENT_NON_PORTABLE;
19244                     STRLEN len = remaining;
19245                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
19246
19247                     /* If the hex decode routine found something, it should go
19248                      * up to the next \n */
19249                     if (   *(si_string + len) == '\n') {
19250                         if (count) {    /* 2nd code point on line */
19251                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
19252                         }
19253                         else {
19254                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
19255                         }
19256                         count = 0;
19257                         goto prepare_for_next_iteration;
19258                     }
19259
19260                     /* If the hex decode was instead for the lower range limit,
19261                      * save it, and go parse the upper range limit */
19262                     if (*(si_string + len) == '\t') {
19263                         assert(count == 0);
19264
19265                         prev_cp = cp;
19266                         count = 1;
19267                       prepare_for_next_iteration:
19268                         si_string += len + 1;
19269                         remaining -= len + 1;
19270                         continue;
19271                     }
19272
19273                     /* Here, didn't find a legal hex number.  Just add it from
19274                      * here to the next \n */
19275
19276                     remaining -= len;
19277                     while (*(si_string + len) != '\n' && remaining > 0) {
19278                         remaining--;
19279                         len++;
19280                     }
19281                     if (*(si_string + len) == '\n') {
19282                         len++;
19283                         remaining--;
19284                     }
19285                     if (matches_string) {
19286                         sv_catpvn(matches_string, si_string, len - 1);
19287                     }
19288                     else {
19289                         matches_string = newSVpvn(si_string, len - 1);
19290                     }
19291                     si_string += len;
19292                     sv_catpvs(matches_string, " ");
19293                 } /* end of loop through the text */
19294
19295                 assert(matches_string);
19296                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
19297                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
19298                 }
19299             } /* end of has an 'si' but no swash */
19300         }
19301
19302         /* If we have a swash in place, its equivalent inversion list was above
19303          * placed into 'invlist'.  If not, this variable may contain a stored
19304          * inversion list which is information beyond what is in 'si' */
19305         if (invlist) {
19306
19307             /* Again, if the caller doesn't want the output inversion list, put
19308              * everything in 'matches-string' */
19309             if (! output_invlist) {
19310                 if ( ! matches_string) {
19311                     matches_string = newSVpvs("\n");
19312                 }
19313                 sv_catsv(matches_string, invlist_contents(invlist,
19314                                                   TRUE /* traditional style */
19315                                                   ));
19316             }
19317             else if (! *output_invlist) {
19318                 *output_invlist = invlist_clone(invlist, NULL);
19319             }
19320             else {
19321                 _invlist_union(*output_invlist, invlist, output_invlist);
19322             }
19323         }
19324
19325         *listsvp = matches_string;
19326     }
19327
19328     return sw;
19329 }
19330 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
19331
19332 /* reg_skipcomment()
19333
19334    Absorbs an /x style # comment from the input stream,
19335    returning a pointer to the first character beyond the comment, or if the
19336    comment terminates the pattern without anything following it, this returns
19337    one past the final character of the pattern (in other words, RExC_end) and
19338    sets the REG_RUN_ON_COMMENT_SEEN flag.
19339
19340    Note it's the callers responsibility to ensure that we are
19341    actually in /x mode
19342
19343 */
19344
19345 PERL_STATIC_INLINE char*
19346 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
19347 {
19348     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
19349
19350     assert(*p == '#');
19351
19352     while (p < RExC_end) {
19353         if (*(++p) == '\n') {
19354             return p+1;
19355         }
19356     }
19357
19358     /* we ran off the end of the pattern without ending the comment, so we have
19359      * to add an \n when wrapping */
19360     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
19361     return p;
19362 }
19363
19364 STATIC void
19365 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
19366                                 char ** p,
19367                                 const bool force_to_xmod
19368                          )
19369 {
19370     /* If the text at the current parse position '*p' is a '(?#...)' comment,
19371      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
19372      * is /x whitespace, advance '*p' so that on exit it points to the first
19373      * byte past all such white space and comments */
19374
19375     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
19376
19377     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
19378
19379     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
19380
19381     for (;;) {
19382         if (RExC_end - (*p) >= 3
19383             && *(*p)     == '('
19384             && *(*p + 1) == '?'
19385             && *(*p + 2) == '#')
19386         {
19387             while (*(*p) != ')') {
19388                 if ((*p) == RExC_end)
19389                     FAIL("Sequence (?#... not terminated");
19390                 (*p)++;
19391             }
19392             (*p)++;
19393             continue;
19394         }
19395
19396         if (use_xmod) {
19397             const char * save_p = *p;
19398             while ((*p) < RExC_end) {
19399                 STRLEN len;
19400                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
19401                     (*p) += len;
19402                 }
19403                 else if (*(*p) == '#') {
19404                     (*p) = reg_skipcomment(pRExC_state, (*p));
19405                 }
19406                 else {
19407                     break;
19408                 }
19409             }
19410             if (*p != save_p) {
19411                 continue;
19412             }
19413         }
19414
19415         break;
19416     }
19417
19418     return;
19419 }
19420
19421 /* nextchar()
19422
19423    Advances the parse position by one byte, unless that byte is the beginning
19424    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
19425    those two cases, the parse position is advanced beyond all such comments and
19426    white space.
19427
19428    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
19429 */
19430
19431 STATIC void
19432 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
19433 {
19434     PERL_ARGS_ASSERT_NEXTCHAR;
19435
19436     if (RExC_parse < RExC_end) {
19437         assert(   ! UTF
19438                || UTF8_IS_INVARIANT(*RExC_parse)
19439                || UTF8_IS_START(*RExC_parse));
19440
19441         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
19442
19443         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
19444                                 FALSE /* Don't force /x */ );
19445     }
19446 }
19447
19448 STATIC void
19449 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
19450 {
19451     /* 'size' is the delta to add or subtract from the current memory allocated
19452      * to the regex engine being constructed */
19453
19454     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
19455
19456     RExC_size += size;
19457
19458     Renewc(RExC_rxi,
19459            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
19460                                                 /* +1 for REG_MAGIC */
19461            char,
19462            regexp_internal);
19463     if ( RExC_rxi == NULL )
19464         FAIL("Regexp out of space");
19465     RXi_SET(RExC_rx, RExC_rxi);
19466
19467     RExC_emit_start = RExC_rxi->program;
19468     if (size > 0) {
19469         Zero(REGNODE_p(RExC_emit), size, regnode);
19470     }
19471
19472 #ifdef RE_TRACK_PATTERN_OFFSETS
19473     Renew(RExC_offsets, 2*RExC_size+1, U32);
19474     if (size > 0) {
19475         Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
19476     }
19477     RExC_offsets[0] = RExC_size;
19478 #endif
19479 }
19480
19481 STATIC regnode_offset
19482 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
19483 {
19484     /* Allocate a regnode for 'op', with 'extra_size' extra space.  It aligns
19485      * and increments RExC_size and RExC_emit
19486      *
19487      * It returns the regnode's offset into the regex engine program */
19488
19489     const regnode_offset ret = RExC_emit;
19490
19491     GET_RE_DEBUG_FLAGS_DECL;
19492
19493     PERL_ARGS_ASSERT_REGNODE_GUTS;
19494
19495     SIZE_ALIGN(RExC_size);
19496     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
19497     NODE_ALIGN_FILL(REGNODE_p(ret));
19498 #ifndef RE_TRACK_PATTERN_OFFSETS
19499     PERL_UNUSED_ARG(name);
19500     PERL_UNUSED_ARG(op);
19501 #else
19502     assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
19503
19504     if (RExC_offsets) {         /* MJD */
19505         MJD_OFFSET_DEBUG(
19506               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
19507               name, __LINE__,
19508               PL_reg_name[op],
19509               (UV)(RExC_emit) > RExC_offsets[0]
19510                 ? "Overwriting end of array!\n" : "OK",
19511               (UV)(RExC_emit),
19512               (UV)(RExC_parse - RExC_start),
19513               (UV)RExC_offsets[0]));
19514         Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
19515     }
19516 #endif
19517     return(ret);
19518 }
19519
19520 /*
19521 - reg_node - emit a node
19522 */
19523 STATIC regnode_offset /* Location. */
19524 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
19525 {
19526     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
19527     regnode_offset ptr = ret;
19528
19529     PERL_ARGS_ASSERT_REG_NODE;
19530
19531     assert(regarglen[op] == 0);
19532
19533     FILL_ADVANCE_NODE(ptr, op);
19534     RExC_emit = ptr;
19535     return(ret);
19536 }
19537
19538 /*
19539 - reganode - emit a node with an argument
19540 */
19541 STATIC regnode_offset /* Location. */
19542 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
19543 {
19544     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
19545     regnode_offset ptr = ret;
19546
19547     PERL_ARGS_ASSERT_REGANODE;
19548
19549     /* ANYOF are special cased to allow non-length 1 args */
19550     assert(regarglen[op] == 1);
19551
19552     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
19553     RExC_emit = ptr;
19554     return(ret);
19555 }
19556
19557 STATIC regnode_offset
19558 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
19559 {
19560     /* emit a node with U32 and I32 arguments */
19561
19562     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
19563     regnode_offset ptr = ret;
19564
19565     PERL_ARGS_ASSERT_REG2LANODE;
19566
19567     assert(regarglen[op] == 2);
19568
19569     FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
19570     RExC_emit = ptr;
19571     return(ret);
19572 }
19573
19574 /*
19575 - reginsert - insert an operator in front of already-emitted operand
19576 *
19577 * That means that on exit 'operand' is the offset of the newly inserted
19578 * operator, and the original operand has been relocated.
19579 *
19580 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
19581 * set up NEXT_OFF() of the inserted node if needed. Something like this:
19582 *
19583 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
19584 *   NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
19585 *
19586 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
19587 */
19588 STATIC void
19589 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
19590                   const regnode_offset operand, const U32 depth)
19591 {
19592     regnode *src;
19593     regnode *dst;
19594     regnode *place;
19595     const int offset = regarglen[(U8)op];
19596     const int size = NODE_STEP_REGNODE + offset;
19597     GET_RE_DEBUG_FLAGS_DECL;
19598
19599     PERL_ARGS_ASSERT_REGINSERT;
19600     PERL_UNUSED_CONTEXT;
19601     PERL_UNUSED_ARG(depth);
19602 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
19603     DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
19604     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
19605                                     studying. If this is wrong then we need to adjust RExC_recurse
19606                                     below like we do with RExC_open_parens/RExC_close_parens. */
19607     change_engine_size(pRExC_state, (Ptrdiff_t) size);
19608     src = REGNODE_p(RExC_emit);
19609     RExC_emit += size;
19610     dst = REGNODE_p(RExC_emit);
19611     if (RExC_open_parens) {
19612         int paren;
19613         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
19614         /* remember that RExC_npar is rex->nparens + 1,
19615          * iow it is 1 more than the number of parens seen in
19616          * the pattern so far. */
19617         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
19618             /* note, RExC_open_parens[0] is the start of the
19619              * regex, it can't move. RExC_close_parens[0] is the end
19620              * of the regex, it *can* move. */
19621             if ( paren && RExC_open_parens[paren] >= operand ) {
19622                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
19623                 RExC_open_parens[paren] += size;
19624             } else {
19625                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
19626             }
19627             if ( RExC_close_parens[paren] >= operand ) {
19628                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
19629                 RExC_close_parens[paren] += size;
19630             } else {
19631                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
19632             }
19633         }
19634     }
19635     if (RExC_end_op)
19636         RExC_end_op += size;
19637
19638     while (src > REGNODE_p(operand)) {
19639         StructCopy(--src, --dst, regnode);
19640 #ifdef RE_TRACK_PATTERN_OFFSETS
19641         if (RExC_offsets) {     /* MJD 20010112 */
19642             MJD_OFFSET_DEBUG(
19643                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
19644                   "reginsert",
19645                   __LINE__,
19646                   PL_reg_name[op],
19647                   (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
19648                     ? "Overwriting end of array!\n" : "OK",
19649                   (UV)REGNODE_OFFSET(src),
19650                   (UV)REGNODE_OFFSET(dst),
19651                   (UV)RExC_offsets[0]));
19652             Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
19653             Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
19654         }
19655 #endif
19656     }
19657
19658     place = REGNODE_p(operand); /* Op node, where operand used to be. */
19659 #ifdef RE_TRACK_PATTERN_OFFSETS
19660     if (RExC_offsets) {         /* MJD */
19661         MJD_OFFSET_DEBUG(
19662               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
19663               "reginsert",
19664               __LINE__,
19665               PL_reg_name[op],
19666               (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
19667               ? "Overwriting end of array!\n" : "OK",
19668               (UV)REGNODE_OFFSET(place),
19669               (UV)(RExC_parse - RExC_start),
19670               (UV)RExC_offsets[0]));
19671         Set_Node_Offset(place, RExC_parse);
19672         Set_Node_Length(place, 1);
19673     }
19674 #endif
19675     src = NEXTOPER(place);
19676     FLAGS(place) = 0;
19677     FILL_NODE(operand, op);
19678
19679     /* Zero out any arguments in the new node */
19680     Zero(src, offset, regnode);
19681 }
19682
19683 /*
19684 - regtail - set the next-pointer at the end of a node chain of p to val.
19685 - SEE ALSO: regtail_study
19686 */
19687 STATIC void
19688 S_regtail(pTHX_ RExC_state_t * pRExC_state,
19689                 const regnode_offset p,
19690                 const regnode_offset val,
19691                 const U32 depth)
19692 {
19693     regnode_offset scan;
19694     GET_RE_DEBUG_FLAGS_DECL;
19695
19696     PERL_ARGS_ASSERT_REGTAIL;
19697 #ifndef DEBUGGING
19698     PERL_UNUSED_ARG(depth);
19699 #endif
19700
19701     /* Find last node. */
19702     scan = (regnode_offset) p;
19703     for (;;) {
19704         regnode * const temp = regnext(REGNODE_p(scan));
19705         DEBUG_PARSE_r({
19706             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
19707             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19708             Perl_re_printf( aTHX_  "~ %s (%d) %s %s\n",
19709                 SvPV_nolen_const(RExC_mysv), scan,
19710                     (temp == NULL ? "->" : ""),
19711                     (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
19712             );
19713         });
19714         if (temp == NULL)
19715             break;
19716         scan = REGNODE_OFFSET(temp);
19717     }
19718
19719     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19720         ARG_SET(REGNODE_p(scan), val - scan);
19721     }
19722     else {
19723         NEXT_OFF(REGNODE_p(scan)) = val - scan;
19724     }
19725 }
19726
19727 #ifdef DEBUGGING
19728 /*
19729 - regtail_study - set the next-pointer at the end of a node chain of p to val.
19730 - Look for optimizable sequences at the same time.
19731 - currently only looks for EXACT chains.
19732
19733 This is experimental code. The idea is to use this routine to perform
19734 in place optimizations on branches and groups as they are constructed,
19735 with the long term intention of removing optimization from study_chunk so
19736 that it is purely analytical.
19737
19738 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
19739 to control which is which.
19740
19741 */
19742 /* TODO: All four parms should be const */
19743
19744 STATIC U8
19745 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
19746                       const regnode_offset val, U32 depth)
19747 {
19748     regnode_offset scan;
19749     U8 exact = PSEUDO;
19750 #ifdef EXPERIMENTAL_INPLACESCAN
19751     I32 min = 0;
19752 #endif
19753     GET_RE_DEBUG_FLAGS_DECL;
19754
19755     PERL_ARGS_ASSERT_REGTAIL_STUDY;
19756
19757
19758     /* Find last node. */
19759
19760     scan = p;
19761     for (;;) {
19762         regnode * const temp = regnext(REGNODE_p(scan));
19763 #ifdef EXPERIMENTAL_INPLACESCAN
19764         if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
19765             bool unfolded_multi_char;   /* Unexamined in this routine */
19766             if (join_exact(pRExC_state, scan, &min,
19767                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
19768                 return EXACT;
19769         }
19770 #endif
19771         if ( exact ) {
19772             switch (OP(REGNODE_p(scan))) {
19773                 case EXACT:
19774                 case EXACT_ONLY8:
19775                 case EXACTL:
19776                 case EXACTF:
19777                 case EXACTFU_S_EDGE:
19778                 case EXACTFAA_NO_TRIE:
19779                 case EXACTFAA:
19780                 case EXACTFU:
19781                 case EXACTFU_ONLY8:
19782                 case EXACTFLU8:
19783                 case EXACTFUP:
19784                 case EXACTFL:
19785                         if( exact == PSEUDO )
19786                             exact= OP(REGNODE_p(scan));
19787                         else if ( exact != OP(REGNODE_p(scan)) )
19788                             exact= 0;
19789                 case NOTHING:
19790                     break;
19791                 default:
19792                     exact= 0;
19793             }
19794         }
19795         DEBUG_PARSE_r({
19796             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
19797             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19798             Perl_re_printf( aTHX_  "~ %s (%d) -> %s\n",
19799                 SvPV_nolen_const(RExC_mysv),
19800                 scan,
19801                 PL_reg_name[exact]);
19802         });
19803         if (temp == NULL)
19804             break;
19805         scan = REGNODE_OFFSET(temp);
19806     }
19807     DEBUG_PARSE_r({
19808         DEBUG_PARSE_MSG("");
19809         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
19810         Perl_re_printf( aTHX_
19811                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
19812                       SvPV_nolen_const(RExC_mysv),
19813                       (IV)val,
19814                       (IV)(val - scan)
19815         );
19816     });
19817     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19818         ARG_SET(REGNODE_p(scan), val - scan);
19819     }
19820     else {
19821         NEXT_OFF(REGNODE_p(scan)) = val - scan;
19822     }
19823
19824     return exact;
19825 }
19826 #endif
19827
19828 STATIC SV*
19829 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
19830
19831     /* Returns an inversion list of all the code points matched by the
19832      * ANYOFM/NANYOFM node 'n' */
19833
19834     SV * cp_list = _new_invlist(-1);
19835     const U8 lowest = (U8) ARG(n);
19836     unsigned int i;
19837     U8 count = 0;
19838     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
19839
19840     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
19841
19842     /* Starting with the lowest code point, any code point that ANDed with the
19843      * mask yields the lowest code point is in the set */
19844     for (i = lowest; i <= 0xFF; i++) {
19845         if ((i & FLAGS(n)) == ARG(n)) {
19846             cp_list = add_cp_to_invlist(cp_list, i);
19847             count++;
19848
19849             /* We know how many code points (a power of two) that are in the
19850              * set.  No use looking once we've got that number */
19851             if (count >= needed) break;
19852         }
19853     }
19854
19855     if (OP(n) == NANYOFM) {
19856         _invlist_invert(cp_list);
19857     }
19858     return cp_list;
19859 }
19860
19861 /*
19862  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
19863  */
19864 #ifdef DEBUGGING
19865
19866 static void
19867 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
19868 {
19869     int bit;
19870     int set=0;
19871
19872     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19873
19874     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
19875         if (flags & (1<<bit)) {
19876             if (!set++ && lead)
19877                 Perl_re_printf( aTHX_  "%s", lead);
19878             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
19879         }
19880     }
19881     if (lead)  {
19882         if (set)
19883             Perl_re_printf( aTHX_  "\n");
19884         else
19885             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
19886     }
19887 }
19888
19889 static void
19890 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
19891 {
19892     int bit;
19893     int set=0;
19894     regex_charset cs;
19895
19896     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19897
19898     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
19899         if (flags & (1<<bit)) {
19900             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
19901                 continue;
19902             }
19903             if (!set++ && lead)
19904                 Perl_re_printf( aTHX_  "%s", lead);
19905             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
19906         }
19907     }
19908     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
19909             if (!set++ && lead) {
19910                 Perl_re_printf( aTHX_  "%s", lead);
19911             }
19912             switch (cs) {
19913                 case REGEX_UNICODE_CHARSET:
19914                     Perl_re_printf( aTHX_  "UNICODE");
19915                     break;
19916                 case REGEX_LOCALE_CHARSET:
19917                     Perl_re_printf( aTHX_  "LOCALE");
19918                     break;
19919                 case REGEX_ASCII_RESTRICTED_CHARSET:
19920                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
19921                     break;
19922                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
19923                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
19924                     break;
19925                 default:
19926                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
19927                     break;
19928             }
19929     }
19930     if (lead)  {
19931         if (set)
19932             Perl_re_printf( aTHX_  "\n");
19933         else
19934             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
19935     }
19936 }
19937 #endif
19938
19939 void
19940 Perl_regdump(pTHX_ const regexp *r)
19941 {
19942 #ifdef DEBUGGING
19943     int i;
19944     SV * const sv = sv_newmortal();
19945     SV *dsv= sv_newmortal();
19946     RXi_GET_DECL(r, ri);
19947     GET_RE_DEBUG_FLAGS_DECL;
19948
19949     PERL_ARGS_ASSERT_REGDUMP;
19950
19951     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
19952
19953     /* Header fields of interest. */
19954     for (i = 0; i < 2; i++) {
19955         if (r->substrs->data[i].substr) {
19956             RE_PV_QUOTED_DECL(s, 0, dsv,
19957                             SvPVX_const(r->substrs->data[i].substr),
19958                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
19959                             PL_dump_re_max_len);
19960             Perl_re_printf( aTHX_
19961                           "%s %s%s at %" IVdf "..%" UVuf " ",
19962                           i ? "floating" : "anchored",
19963                           s,
19964                           RE_SV_TAIL(r->substrs->data[i].substr),
19965                           (IV)r->substrs->data[i].min_offset,
19966                           (UV)r->substrs->data[i].max_offset);
19967         }
19968         else if (r->substrs->data[i].utf8_substr) {
19969             RE_PV_QUOTED_DECL(s, 1, dsv,
19970                             SvPVX_const(r->substrs->data[i].utf8_substr),
19971                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
19972                             30);
19973             Perl_re_printf( aTHX_
19974                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
19975                           i ? "floating" : "anchored",
19976                           s,
19977                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
19978                           (IV)r->substrs->data[i].min_offset,
19979                           (UV)r->substrs->data[i].max_offset);
19980         }
19981     }
19982
19983     if (r->check_substr || r->check_utf8)
19984         Perl_re_printf( aTHX_
19985                       (const char *)
19986                       (   r->check_substr == r->substrs->data[1].substr
19987                        && r->check_utf8   == r->substrs->data[1].utf8_substr
19988                        ? "(checking floating" : "(checking anchored"));
19989     if (r->intflags & PREGf_NOSCAN)
19990         Perl_re_printf( aTHX_  " noscan");
19991     if (r->extflags & RXf_CHECK_ALL)
19992         Perl_re_printf( aTHX_  " isall");
19993     if (r->check_substr || r->check_utf8)
19994         Perl_re_printf( aTHX_  ") ");
19995
19996     if (ri->regstclass) {
19997         regprop(r, sv, ri->regstclass, NULL, NULL);
19998         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
19999     }
20000     if (r->intflags & PREGf_ANCH) {
20001         Perl_re_printf( aTHX_  "anchored");
20002         if (r->intflags & PREGf_ANCH_MBOL)
20003             Perl_re_printf( aTHX_  "(MBOL)");
20004         if (r->intflags & PREGf_ANCH_SBOL)
20005             Perl_re_printf( aTHX_  "(SBOL)");
20006         if (r->intflags & PREGf_ANCH_GPOS)
20007             Perl_re_printf( aTHX_  "(GPOS)");
20008         Perl_re_printf( aTHX_ " ");
20009     }
20010     if (r->intflags & PREGf_GPOS_SEEN)
20011         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
20012     if (r->intflags & PREGf_SKIP)
20013         Perl_re_printf( aTHX_  "plus ");
20014     if (r->intflags & PREGf_IMPLICIT)
20015         Perl_re_printf( aTHX_  "implicit ");
20016     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
20017     if (r->extflags & RXf_EVAL_SEEN)
20018         Perl_re_printf( aTHX_  "with eval ");
20019     Perl_re_printf( aTHX_  "\n");
20020     DEBUG_FLAGS_r({
20021         regdump_extflags("r->extflags: ", r->extflags);
20022         regdump_intflags("r->intflags: ", r->intflags);
20023     });
20024 #else
20025     PERL_ARGS_ASSERT_REGDUMP;
20026     PERL_UNUSED_CONTEXT;
20027     PERL_UNUSED_ARG(r);
20028 #endif  /* DEBUGGING */
20029 }
20030
20031 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
20032 #ifdef DEBUGGING
20033
20034 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
20035      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
20036      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
20037      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
20038      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
20039      || _CC_VERTSPACE != 15
20040 #   error Need to adjust order of anyofs[]
20041 #  endif
20042 static const char * const anyofs[] = {
20043     "\\w",
20044     "\\W",
20045     "\\d",
20046     "\\D",
20047     "[:alpha:]",
20048     "[:^alpha:]",
20049     "[:lower:]",
20050     "[:^lower:]",
20051     "[:upper:]",
20052     "[:^upper:]",
20053     "[:punct:]",
20054     "[:^punct:]",
20055     "[:print:]",
20056     "[:^print:]",
20057     "[:alnum:]",
20058     "[:^alnum:]",
20059     "[:graph:]",
20060     "[:^graph:]",
20061     "[:cased:]",
20062     "[:^cased:]",
20063     "\\s",
20064     "\\S",
20065     "[:blank:]",
20066     "[:^blank:]",
20067     "[:xdigit:]",
20068     "[:^xdigit:]",
20069     "[:cntrl:]",
20070     "[:^cntrl:]",
20071     "[:ascii:]",
20072     "[:^ascii:]",
20073     "\\v",
20074     "\\V"
20075 };
20076 #endif
20077
20078 /*
20079 - regprop - printable representation of opcode, with run time support
20080 */
20081
20082 void
20083 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
20084 {
20085 #ifdef DEBUGGING
20086     int k;
20087     RXi_GET_DECL(prog, progi);
20088     GET_RE_DEBUG_FLAGS_DECL;
20089
20090     PERL_ARGS_ASSERT_REGPROP;
20091
20092     SvPVCLEAR(sv);
20093
20094     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
20095         /* It would be nice to FAIL() here, but this may be called from
20096            regexec.c, and it would be hard to supply pRExC_state. */
20097         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
20098                                               (int)OP(o), (int)REGNODE_MAX);
20099     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
20100
20101     k = PL_regkind[OP(o)];
20102
20103     if (k == EXACT) {
20104         sv_catpvs(sv, " ");
20105         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
20106          * is a crude hack but it may be the best for now since
20107          * we have no flag "this EXACTish node was UTF-8"
20108          * --jhi */
20109         pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
20110                   PL_colors[0], PL_colors[1],
20111                   PERL_PV_ESCAPE_UNI_DETECT |
20112                   PERL_PV_ESCAPE_NONASCII   |
20113                   PERL_PV_PRETTY_ELLIPSES   |
20114                   PERL_PV_PRETTY_LTGT       |
20115                   PERL_PV_PRETTY_NOCLEAR
20116                   );
20117     } else if (k == TRIE) {
20118         /* print the details of the trie in dumpuntil instead, as
20119          * progi->data isn't available here */
20120         const char op = OP(o);
20121         const U32 n = ARG(o);
20122         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
20123                (reg_ac_data *)progi->data->data[n] :
20124                NULL;
20125         const reg_trie_data * const trie
20126             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
20127
20128         Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
20129         DEBUG_TRIE_COMPILE_r({
20130           if (trie->jump)
20131             sv_catpvs(sv, "(JUMP)");
20132           Perl_sv_catpvf(aTHX_ sv,
20133             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
20134             (UV)trie->startstate,
20135             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
20136             (UV)trie->wordcount,
20137             (UV)trie->minlen,
20138             (UV)trie->maxlen,
20139             (UV)TRIE_CHARCOUNT(trie),
20140             (UV)trie->uniquecharcount
20141           );
20142         });
20143         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
20144             sv_catpvs(sv, "[");
20145             (void) put_charclass_bitmap_innards(sv,
20146                                                 ((IS_ANYOF_TRIE(op))
20147                                                  ? ANYOF_BITMAP(o)
20148                                                  : TRIE_BITMAP(trie)),
20149                                                 NULL,
20150                                                 NULL,
20151                                                 NULL,
20152                                                 FALSE
20153                                                );
20154             sv_catpvs(sv, "]");
20155         }
20156     } else if (k == CURLY) {
20157         U32 lo = ARG1(o), hi = ARG2(o);
20158         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
20159             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
20160         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
20161         if (hi == REG_INFTY)
20162             sv_catpvs(sv, "INFTY");
20163         else
20164             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
20165         sv_catpvs(sv, "}");
20166     }
20167     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
20168         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
20169     else if (k == REF || k == OPEN || k == CLOSE
20170              || k == GROUPP || OP(o)==ACCEPT)
20171     {
20172         AV *name_list= NULL;
20173         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
20174         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
20175         if ( RXp_PAREN_NAMES(prog) ) {
20176             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20177         } else if ( pRExC_state ) {
20178             name_list= RExC_paren_name_list;
20179         }
20180         if (name_list) {
20181             if ( k != REF || (OP(o) < NREF)) {
20182                 SV **name= av_fetch(name_list, parno, 0 );
20183                 if (name)
20184                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20185             }
20186             else {
20187                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
20188                 I32 *nums=(I32*)SvPVX(sv_dat);
20189                 SV **name= av_fetch(name_list, nums[0], 0 );
20190                 I32 n;
20191                 if (name) {
20192                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
20193                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
20194                                     (n ? "," : ""), (IV)nums[n]);
20195                     }
20196                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20197                 }
20198             }
20199         }
20200         if ( k == REF && reginfo) {
20201             U32 n = ARG(o);  /* which paren pair */
20202             I32 ln = prog->offs[n].start;
20203             if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
20204                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
20205             else if (ln == prog->offs[n].end)
20206                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
20207             else {
20208                 const char *s = reginfo->strbeg + ln;
20209                 Perl_sv_catpvf(aTHX_ sv, ": ");
20210                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
20211                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
20212             }
20213         }
20214     } else if (k == GOSUB) {
20215         AV *name_list= NULL;
20216         if ( RXp_PAREN_NAMES(prog) ) {
20217             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20218         } else if ( pRExC_state ) {
20219             name_list= RExC_paren_name_list;
20220         }
20221
20222         /* Paren and offset */
20223         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
20224                 (int)((o + (int)ARG2L(o)) - progi->program) );
20225         if (name_list) {
20226             SV **name= av_fetch(name_list, ARG(o), 0 );
20227             if (name)
20228                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20229         }
20230     }
20231     else if (k == LOGICAL)
20232         /* 2: embedded, otherwise 1 */
20233         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
20234     else if (k == ANYOF) {
20235         const U8 flags = ANYOF_FLAGS(o);
20236         bool do_sep = FALSE;    /* Do we need to separate various components of
20237                                    the output? */
20238         /* Set if there is still an unresolved user-defined property */
20239         SV *unresolved                = NULL;
20240
20241         /* Things that are ignored except when the runtime locale is UTF-8 */
20242         SV *only_utf8_locale_invlist = NULL;
20243
20244         /* Code points that don't fit in the bitmap */
20245         SV *nonbitmap_invlist = NULL;
20246
20247         /* And things that aren't in the bitmap, but are small enough to be */
20248         SV* bitmap_range_not_in_bitmap = NULL;
20249
20250         const bool inverted = flags & ANYOF_INVERT;
20251
20252         if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
20253             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
20254                 sv_catpvs(sv, "{utf8-locale-reqd}");
20255             }
20256             if (flags & ANYOFL_FOLD) {
20257                 sv_catpvs(sv, "{i}");
20258             }
20259         }
20260
20261         /* If there is stuff outside the bitmap, get it */
20262         if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
20263             (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
20264                                                 &unresolved,
20265                                                 &only_utf8_locale_invlist,
20266                                                 &nonbitmap_invlist);
20267             /* The non-bitmap data may contain stuff that could fit in the
20268              * bitmap.  This could come from a user-defined property being
20269              * finally resolved when this call was done; or much more likely
20270              * because there are matches that require UTF-8 to be valid, and so
20271              * aren't in the bitmap.  This is teased apart later */
20272             _invlist_intersection(nonbitmap_invlist,
20273                                   PL_InBitmap,
20274                                   &bitmap_range_not_in_bitmap);
20275             /* Leave just the things that don't fit into the bitmap */
20276             _invlist_subtract(nonbitmap_invlist,
20277                               PL_InBitmap,
20278                               &nonbitmap_invlist);
20279         }
20280
20281         /* Obey this flag to add all above-the-bitmap code points */
20282         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
20283             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20284                                                       NUM_ANYOF_CODE_POINTS,
20285                                                       UV_MAX);
20286         }
20287
20288         /* Ready to start outputting.  First, the initial left bracket */
20289         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20290
20291         if (OP(o) != ANYOFH) {
20292             /* Then all the things that could fit in the bitmap */
20293             do_sep = put_charclass_bitmap_innards(sv,
20294                                                   ANYOF_BITMAP(o),
20295                                                   bitmap_range_not_in_bitmap,
20296                                                   only_utf8_locale_invlist,
20297                                                   o,
20298
20299                                                   /* Can't try inverting for a
20300                                                    * better display if there
20301                                                    * are things that haven't
20302                                                    * been resolved */
20303                                                   unresolved != NULL);
20304             SvREFCNT_dec(bitmap_range_not_in_bitmap);
20305
20306             /* If there are user-defined properties which haven't been defined
20307              * yet, output them.  If the result is not to be inverted, it is
20308              * clearest to output them in a separate [] from the bitmap range
20309              * stuff.  If the result is to be complemented, we have to show
20310              * everything in one [], as the inversion applies to the whole
20311              * thing.  Use {braces} to separate them from anything in the
20312              * bitmap and anything above the bitmap. */
20313             if (unresolved) {
20314                 if (inverted) {
20315                     if (! do_sep) { /* If didn't output anything in the bitmap
20316                                      */
20317                         sv_catpvs(sv, "^");
20318                     }
20319                     sv_catpvs(sv, "{");
20320                 }
20321                 else if (do_sep) {
20322                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
20323                                                       PL_colors[0]);
20324                 }
20325                 sv_catsv(sv, unresolved);
20326                 if (inverted) {
20327                     sv_catpvs(sv, "}");
20328                 }
20329                 do_sep = ! inverted;
20330             }
20331         }
20332
20333         /* And, finally, add the above-the-bitmap stuff */
20334         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
20335             SV* contents;
20336
20337             /* See if truncation size is overridden */
20338             const STRLEN dump_len = (PL_dump_re_max_len > 256)
20339                                     ? PL_dump_re_max_len
20340                                     : 256;
20341
20342             /* This is output in a separate [] */
20343             if (do_sep) {
20344                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
20345             }
20346
20347             /* And, for easy of understanding, it is shown in the
20348              * uncomplemented form if possible.  The one exception being if
20349              * there are unresolved items, where the inversion has to be
20350              * delayed until runtime */
20351             if (inverted && ! unresolved) {
20352                 _invlist_invert(nonbitmap_invlist);
20353                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
20354             }
20355
20356             contents = invlist_contents(nonbitmap_invlist,
20357                                         FALSE /* output suitable for catsv */
20358                                        );
20359
20360             /* If the output is shorter than the permissible maximum, just do it. */
20361             if (SvCUR(contents) <= dump_len) {
20362                 sv_catsv(sv, contents);
20363             }
20364             else {
20365                 const char * contents_string = SvPVX(contents);
20366                 STRLEN i = dump_len;
20367
20368                 /* Otherwise, start at the permissible max and work back to the
20369                  * first break possibility */
20370                 while (i > 0 && contents_string[i] != ' ') {
20371                     i--;
20372                 }
20373                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
20374                                        find a legal break */
20375                     i = dump_len;
20376                 }
20377
20378                 sv_catpvn(sv, contents_string, i);
20379                 sv_catpvs(sv, "...");
20380             }
20381
20382             SvREFCNT_dec_NN(contents);
20383             SvREFCNT_dec_NN(nonbitmap_invlist);
20384         }
20385
20386         /* And finally the matching, closing ']' */
20387         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20388
20389         SvREFCNT_dec(unresolved);
20390     }
20391     else if (k == ANYOFM) {
20392         SV * cp_list = get_ANYOFM_contents(o);
20393
20394         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20395         if (OP(o) == NANYOFM) {
20396             _invlist_invert(cp_list);
20397         }
20398
20399         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
20400         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20401
20402         SvREFCNT_dec(cp_list);
20403     }
20404     else if (k == POSIXD || k == NPOSIXD) {
20405         U8 index = FLAGS(o) * 2;
20406         if (index < C_ARRAY_LENGTH(anyofs)) {
20407             if (*anyofs[index] != '[')  {
20408                 sv_catpvs(sv, "[");
20409             }
20410             sv_catpv(sv, anyofs[index]);
20411             if (*anyofs[index] != '[')  {
20412                 sv_catpvs(sv, "]");
20413             }
20414         }
20415         else {
20416             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
20417         }
20418     }
20419     else if (k == BOUND || k == NBOUND) {
20420         /* Must be synced with order of 'bound_type' in regcomp.h */
20421         const char * const bounds[] = {
20422             "",      /* Traditional */
20423             "{gcb}",
20424             "{lb}",
20425             "{sb}",
20426             "{wb}"
20427         };
20428         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
20429         sv_catpv(sv, bounds[FLAGS(o)]);
20430     }
20431     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
20432         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
20433     else if (OP(o) == SBOL)
20434         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
20435
20436     /* add on the verb argument if there is one */
20437     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
20438         if ( ARG(o) )
20439             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
20440                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
20441         else
20442             sv_catpvs(sv, ":NULL");
20443     }
20444 #else
20445     PERL_UNUSED_CONTEXT;
20446     PERL_UNUSED_ARG(sv);
20447     PERL_UNUSED_ARG(o);
20448     PERL_UNUSED_ARG(prog);
20449     PERL_UNUSED_ARG(reginfo);
20450     PERL_UNUSED_ARG(pRExC_state);
20451 #endif  /* DEBUGGING */
20452 }
20453
20454
20455
20456 SV *
20457 Perl_re_intuit_string(pTHX_ REGEXP * const r)
20458 {                               /* Assume that RE_INTUIT is set */
20459     struct regexp *const prog = ReANY(r);
20460     GET_RE_DEBUG_FLAGS_DECL;
20461
20462     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
20463     PERL_UNUSED_CONTEXT;
20464
20465     DEBUG_COMPILE_r(
20466         {
20467             const char * const s = SvPV_nolen_const(RX_UTF8(r)
20468                       ? prog->check_utf8 : prog->check_substr);
20469
20470             if (!PL_colorset) reginitcolors();
20471             Perl_re_printf( aTHX_
20472                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
20473                       PL_colors[4],
20474                       RX_UTF8(r) ? "utf8 " : "",
20475                       PL_colors[5], PL_colors[0],
20476                       s,
20477                       PL_colors[1],
20478                       (strlen(s) > PL_dump_re_max_len ? "..." : ""));
20479         } );
20480
20481     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
20482     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
20483 }
20484
20485 /*
20486    pregfree()
20487
20488    handles refcounting and freeing the perl core regexp structure. When
20489    it is necessary to actually free the structure the first thing it
20490    does is call the 'free' method of the regexp_engine associated to
20491    the regexp, allowing the handling of the void *pprivate; member
20492    first. (This routine is not overridable by extensions, which is why
20493    the extensions free is called first.)
20494
20495    See regdupe and regdupe_internal if you change anything here.
20496 */
20497 #ifndef PERL_IN_XSUB_RE
20498 void
20499 Perl_pregfree(pTHX_ REGEXP *r)
20500 {
20501     SvREFCNT_dec(r);
20502 }
20503
20504 void
20505 Perl_pregfree2(pTHX_ REGEXP *rx)
20506 {
20507     struct regexp *const r = ReANY(rx);
20508     GET_RE_DEBUG_FLAGS_DECL;
20509
20510     PERL_ARGS_ASSERT_PREGFREE2;
20511
20512     if (! r)
20513         return;
20514
20515     if (r->mother_re) {
20516         ReREFCNT_dec(r->mother_re);
20517     } else {
20518         CALLREGFREE_PVT(rx); /* free the private data */
20519         SvREFCNT_dec(RXp_PAREN_NAMES(r));
20520     }
20521     if (r->substrs) {
20522         int i;
20523         for (i = 0; i < 2; i++) {
20524             SvREFCNT_dec(r->substrs->data[i].substr);
20525             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
20526         }
20527         Safefree(r->substrs);
20528     }
20529     RX_MATCH_COPY_FREE(rx);
20530 #ifdef PERL_ANY_COW
20531     SvREFCNT_dec(r->saved_copy);
20532 #endif
20533     Safefree(r->offs);
20534     SvREFCNT_dec(r->qr_anoncv);
20535     if (r->recurse_locinput)
20536         Safefree(r->recurse_locinput);
20537 }
20538
20539
20540 /*  reg_temp_copy()
20541
20542     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
20543     except that dsv will be created if NULL.
20544
20545     This function is used in two main ways. First to implement
20546         $r = qr/....; $s = $$r;
20547
20548     Secondly, it is used as a hacky workaround to the structural issue of
20549     match results
20550     being stored in the regexp structure which is in turn stored in
20551     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
20552     could be PL_curpm in multiple contexts, and could require multiple
20553     result sets being associated with the pattern simultaneously, such
20554     as when doing a recursive match with (??{$qr})
20555
20556     The solution is to make a lightweight copy of the regexp structure
20557     when a qr// is returned from the code executed by (??{$qr}) this
20558     lightweight copy doesn't actually own any of its data except for
20559     the starp/end and the actual regexp structure itself.
20560
20561 */
20562
20563
20564 REGEXP *
20565 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
20566 {
20567     struct regexp *drx;
20568     struct regexp *const srx = ReANY(ssv);
20569     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
20570
20571     PERL_ARGS_ASSERT_REG_TEMP_COPY;
20572
20573     if (!dsv)
20574         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
20575     else {
20576         SvOK_off((SV *)dsv);
20577         if (islv) {
20578             /* For PVLVs, the head (sv_any) points to an XPVLV, while
20579              * the LV's xpvlenu_rx will point to a regexp body, which
20580              * we allocate here */
20581             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
20582             assert(!SvPVX(dsv));
20583             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
20584             temp->sv_any = NULL;
20585             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
20586             SvREFCNT_dec_NN(temp);
20587             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
20588                ing below will not set it. */
20589             SvCUR_set(dsv, SvCUR(ssv));
20590         }
20591     }
20592     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
20593        sv_force_normal(sv) is called.  */
20594     SvFAKE_on(dsv);
20595     drx = ReANY(dsv);
20596
20597     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
20598     SvPV_set(dsv, RX_WRAPPED(ssv));
20599     /* We share the same string buffer as the original regexp, on which we
20600        hold a reference count, incremented when mother_re is set below.
20601        The string pointer is copied here, being part of the regexp struct.
20602      */
20603     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
20604            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
20605     if (!islv)
20606         SvLEN_set(dsv, 0);
20607     if (srx->offs) {
20608         const I32 npar = srx->nparens+1;
20609         Newx(drx->offs, npar, regexp_paren_pair);
20610         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
20611     }
20612     if (srx->substrs) {
20613         int i;
20614         Newx(drx->substrs, 1, struct reg_substr_data);
20615         StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
20616
20617         for (i = 0; i < 2; i++) {
20618             SvREFCNT_inc_void(drx->substrs->data[i].substr);
20619             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
20620         }
20621
20622         /* check_substr and check_utf8, if non-NULL, point to either their
20623            anchored or float namesakes, and don't hold a second reference.  */
20624     }
20625     RX_MATCH_COPIED_off(dsv);
20626 #ifdef PERL_ANY_COW
20627     drx->saved_copy = NULL;
20628 #endif
20629     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
20630     SvREFCNT_inc_void(drx->qr_anoncv);
20631     if (srx->recurse_locinput)
20632         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
20633
20634     return dsv;
20635 }
20636 #endif
20637
20638
20639 /* regfree_internal()
20640
20641    Free the private data in a regexp. This is overloadable by
20642    extensions. Perl takes care of the regexp structure in pregfree(),
20643    this covers the *pprivate pointer which technically perl doesn't
20644    know about, however of course we have to handle the
20645    regexp_internal structure when no extension is in use.
20646
20647    Note this is called before freeing anything in the regexp
20648    structure.
20649  */
20650
20651 void
20652 Perl_regfree_internal(pTHX_ REGEXP * const rx)
20653 {
20654     struct regexp *const r = ReANY(rx);
20655     RXi_GET_DECL(r, ri);
20656     GET_RE_DEBUG_FLAGS_DECL;
20657
20658     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
20659
20660     if (! ri) {
20661         return;
20662     }
20663
20664     DEBUG_COMPILE_r({
20665         if (!PL_colorset)
20666             reginitcolors();
20667         {
20668             SV *dsv= sv_newmortal();
20669             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
20670                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
20671             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
20672                 PL_colors[4], PL_colors[5], s);
20673         }
20674     });
20675
20676 #ifdef RE_TRACK_PATTERN_OFFSETS
20677     if (ri->u.offsets)
20678         Safefree(ri->u.offsets);             /* 20010421 MJD */
20679 #endif
20680     if (ri->code_blocks)
20681         S_free_codeblocks(aTHX_ ri->code_blocks);
20682
20683     if (ri->data) {
20684         int n = ri->data->count;
20685
20686         while (--n >= 0) {
20687           /* If you add a ->what type here, update the comment in regcomp.h */
20688             switch (ri->data->what[n]) {
20689             case 'a':
20690             case 'r':
20691             case 's':
20692             case 'S':
20693             case 'u':
20694                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
20695                 break;
20696             case 'f':
20697                 Safefree(ri->data->data[n]);
20698                 break;
20699             case 'l':
20700             case 'L':
20701                 break;
20702             case 'T':
20703                 { /* Aho Corasick add-on structure for a trie node.
20704                      Used in stclass optimization only */
20705                     U32 refcount;
20706                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
20707 #ifdef USE_ITHREADS
20708                     dVAR;
20709 #endif
20710                     OP_REFCNT_LOCK;
20711                     refcount = --aho->refcount;
20712                     OP_REFCNT_UNLOCK;
20713                     if ( !refcount ) {
20714                         PerlMemShared_free(aho->states);
20715                         PerlMemShared_free(aho->fail);
20716                          /* do this last!!!! */
20717                         PerlMemShared_free(ri->data->data[n]);
20718                         /* we should only ever get called once, so
20719                          * assert as much, and also guard the free
20720                          * which /might/ happen twice. At the least
20721                          * it will make code anlyzers happy and it
20722                          * doesn't cost much. - Yves */
20723                         assert(ri->regstclass);
20724                         if (ri->regstclass) {
20725                             PerlMemShared_free(ri->regstclass);
20726                             ri->regstclass = 0;
20727                         }
20728                     }
20729                 }
20730                 break;
20731             case 't':
20732                 {
20733                     /* trie structure. */
20734                     U32 refcount;
20735                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
20736 #ifdef USE_ITHREADS
20737                     dVAR;
20738 #endif
20739                     OP_REFCNT_LOCK;
20740                     refcount = --trie->refcount;
20741                     OP_REFCNT_UNLOCK;
20742                     if ( !refcount ) {
20743                         PerlMemShared_free(trie->charmap);
20744                         PerlMemShared_free(trie->states);
20745                         PerlMemShared_free(trie->trans);
20746                         if (trie->bitmap)
20747                             PerlMemShared_free(trie->bitmap);
20748                         if (trie->jump)
20749                             PerlMemShared_free(trie->jump);
20750                         PerlMemShared_free(trie->wordinfo);
20751                         /* do this last!!!! */
20752                         PerlMemShared_free(ri->data->data[n]);
20753                     }
20754                 }
20755                 break;
20756             default:
20757                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
20758                                                     ri->data->what[n]);
20759             }
20760         }
20761         Safefree(ri->data->what);
20762         Safefree(ri->data);
20763     }
20764
20765     Safefree(ri);
20766 }
20767
20768 #define av_dup_inc(s, t)        MUTABLE_AV(sv_dup_inc((const SV *)s, t))
20769 #define hv_dup_inc(s, t)        MUTABLE_HV(sv_dup_inc((const SV *)s, t))
20770 #define SAVEPVN(p, n)   ((p) ? savepvn(p, n) : NULL)
20771
20772 /*
20773    re_dup_guts - duplicate a regexp.
20774
20775    This routine is expected to clone a given regexp structure. It is only
20776    compiled under USE_ITHREADS.
20777
20778    After all of the core data stored in struct regexp is duplicated
20779    the regexp_engine.dupe method is used to copy any private data
20780    stored in the *pprivate pointer. This allows extensions to handle
20781    any duplication it needs to do.
20782
20783    See pregfree() and regfree_internal() if you change anything here.
20784 */
20785 #if defined(USE_ITHREADS)
20786 #ifndef PERL_IN_XSUB_RE
20787 void
20788 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
20789 {
20790     dVAR;
20791     I32 npar;
20792     const struct regexp *r = ReANY(sstr);
20793     struct regexp *ret = ReANY(dstr);
20794
20795     PERL_ARGS_ASSERT_RE_DUP_GUTS;
20796
20797     npar = r->nparens+1;
20798     Newx(ret->offs, npar, regexp_paren_pair);
20799     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
20800
20801     if (ret->substrs) {
20802         /* Do it this way to avoid reading from *r after the StructCopy().
20803            That way, if any of the sv_dup_inc()s dislodge *r from the L1
20804            cache, it doesn't matter.  */
20805         int i;
20806         const bool anchored = r->check_substr
20807             ? r->check_substr == r->substrs->data[0].substr
20808             : r->check_utf8   == r->substrs->data[0].utf8_substr;
20809         Newx(ret->substrs, 1, struct reg_substr_data);
20810         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
20811
20812         for (i = 0; i < 2; i++) {
20813             ret->substrs->data[i].substr =
20814                         sv_dup_inc(ret->substrs->data[i].substr, param);
20815             ret->substrs->data[i].utf8_substr =
20816                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
20817         }
20818
20819         /* check_substr and check_utf8, if non-NULL, point to either their
20820            anchored or float namesakes, and don't hold a second reference.  */
20821
20822         if (ret->check_substr) {
20823             if (anchored) {
20824                 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
20825
20826                 ret->check_substr = ret->substrs->data[0].substr;
20827                 ret->check_utf8   = ret->substrs->data[0].utf8_substr;
20828             } else {
20829                 assert(r->check_substr == r->substrs->data[1].substr);
20830                 assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
20831
20832                 ret->check_substr = ret->substrs->data[1].substr;
20833                 ret->check_utf8   = ret->substrs->data[1].utf8_substr;
20834             }
20835         } else if (ret->check_utf8) {
20836             if (anchored) {
20837                 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
20838             } else {
20839                 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
20840             }
20841         }
20842     }
20843
20844     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
20845     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
20846     if (r->recurse_locinput)
20847         Newx(ret->recurse_locinput, r->nparens + 1, char *);
20848
20849     if (ret->pprivate)
20850         RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
20851
20852     if (RX_MATCH_COPIED(dstr))
20853         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
20854     else
20855         ret->subbeg = NULL;
20856 #ifdef PERL_ANY_COW
20857     ret->saved_copy = NULL;
20858 #endif
20859
20860     /* Whether mother_re be set or no, we need to copy the string.  We
20861        cannot refrain from copying it when the storage points directly to
20862        our mother regexp, because that's
20863                1: a buffer in a different thread
20864                2: something we no longer hold a reference on
20865                so we need to copy it locally.  */
20866     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
20867     ret->mother_re   = NULL;
20868 }
20869 #endif /* PERL_IN_XSUB_RE */
20870
20871 /*
20872    regdupe_internal()
20873
20874    This is the internal complement to regdupe() which is used to copy
20875    the structure pointed to by the *pprivate pointer in the regexp.
20876    This is the core version of the extension overridable cloning hook.
20877    The regexp structure being duplicated will be copied by perl prior
20878    to this and will be provided as the regexp *r argument, however
20879    with the /old/ structures pprivate pointer value. Thus this routine
20880    may override any copying normally done by perl.
20881
20882    It returns a pointer to the new regexp_internal structure.
20883 */
20884
20885 void *
20886 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
20887 {
20888     dVAR;
20889     struct regexp *const r = ReANY(rx);
20890     regexp_internal *reti;
20891     int len;
20892     RXi_GET_DECL(r, ri);
20893
20894     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
20895
20896     len = ProgLen(ri);
20897
20898     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
20899           char, regexp_internal);
20900     Copy(ri->program, reti->program, len+1, regnode);
20901
20902
20903     if (ri->code_blocks) {
20904         int n;
20905         Newx(reti->code_blocks, 1, struct reg_code_blocks);
20906         Newx(reti->code_blocks->cb, ri->code_blocks->count,
20907                     struct reg_code_block);
20908         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
20909              ri->code_blocks->count, struct reg_code_block);
20910         for (n = 0; n < ri->code_blocks->count; n++)
20911              reti->code_blocks->cb[n].src_regex = (REGEXP*)
20912                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
20913         reti->code_blocks->count = ri->code_blocks->count;
20914         reti->code_blocks->refcnt = 1;
20915     }
20916     else
20917         reti->code_blocks = NULL;
20918
20919     reti->regstclass = NULL;
20920
20921     if (ri->data) {
20922         struct reg_data *d;
20923         const int count = ri->data->count;
20924         int i;
20925
20926         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
20927                 char, struct reg_data);
20928         Newx(d->what, count, U8);
20929
20930         d->count = count;
20931         for (i = 0; i < count; i++) {
20932             d->what[i] = ri->data->what[i];
20933             switch (d->what[i]) {
20934                 /* see also regcomp.h and regfree_internal() */
20935             case 'a': /* actually an AV, but the dup function is identical.
20936                          values seem to be "plain sv's" generally. */
20937             case 'r': /* a compiled regex (but still just another SV) */
20938             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
20939                          this use case should go away, the code could have used
20940                          'a' instead - see S_set_ANYOF_arg() for array contents. */
20941             case 'S': /* actually an SV, but the dup function is identical.  */
20942             case 'u': /* actually an HV, but the dup function is identical.
20943                          values are "plain sv's" */
20944                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
20945                 break;
20946             case 'f':
20947                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
20948                  * patterns which could start with several different things. Pre-TRIE
20949                  * this was more important than it is now, however this still helps
20950                  * in some places, for instance /x?a+/ might produce a SSC equivalent
20951                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
20952                  * in regexec.c
20953                  */
20954                 /* This is cheating. */
20955                 Newx(d->data[i], 1, regnode_ssc);
20956                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
20957                 reti->regstclass = (regnode*)d->data[i];
20958                 break;
20959             case 'T':
20960                 /* AHO-CORASICK fail table */
20961                 /* Trie stclasses are readonly and can thus be shared
20962                  * without duplication. We free the stclass in pregfree
20963                  * when the corresponding reg_ac_data struct is freed.
20964                  */
20965                 reti->regstclass= ri->regstclass;
20966                 /* FALLTHROUGH */
20967             case 't':
20968                 /* TRIE transition table */
20969                 OP_REFCNT_LOCK;
20970                 ((reg_trie_data*)ri->data->data[i])->refcount++;
20971                 OP_REFCNT_UNLOCK;
20972                 /* FALLTHROUGH */
20973             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
20974             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
20975                          is not from another regexp */
20976                 d->data[i] = ri->data->data[i];
20977                 break;
20978             default:
20979                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
20980                                                            ri->data->what[i]);
20981             }
20982         }
20983
20984         reti->data = d;
20985     }
20986     else
20987         reti->data = NULL;
20988
20989     reti->name_list_idx = ri->name_list_idx;
20990
20991 #ifdef RE_TRACK_PATTERN_OFFSETS
20992     if (ri->u.offsets) {
20993         Newx(reti->u.offsets, 2*len+1, U32);
20994         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
20995     }
20996 #else
20997     SetProgLen(reti, len);
20998 #endif
20999
21000     return (void*)reti;
21001 }
21002
21003 #endif    /* USE_ITHREADS */
21004
21005 #ifndef PERL_IN_XSUB_RE
21006
21007 /*
21008  - regnext - dig the "next" pointer out of a node
21009  */
21010 regnode *
21011 Perl_regnext(pTHX_ regnode *p)
21012 {
21013     I32 offset;
21014
21015     if (!p)
21016         return(NULL);
21017
21018     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
21019         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
21020                                                 (int)OP(p), (int)REGNODE_MAX);
21021     }
21022
21023     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
21024     if (offset == 0)
21025         return(NULL);
21026
21027     return(p+offset);
21028 }
21029
21030 #endif
21031
21032 STATIC void
21033 S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
21034 {
21035     va_list args;
21036     STRLEN l1 = strlen(pat1);
21037     STRLEN l2 = strlen(pat2);
21038     char buf[512];
21039     SV *msv;
21040     const char *message;
21041
21042     PERL_ARGS_ASSERT_RE_CROAK2;
21043
21044     if (l1 > 510)
21045         l1 = 510;
21046     if (l1 + l2 > 510)
21047         l2 = 510 - l1;
21048     Copy(pat1, buf, l1 , char);
21049     Copy(pat2, buf + l1, l2 , char);
21050     buf[l1 + l2] = '\n';
21051     buf[l1 + l2 + 1] = '\0';
21052     va_start(args, pat2);
21053     msv = vmess(buf, &args);
21054     va_end(args);
21055     message = SvPV_const(msv, l1);
21056     if (l1 > 512)
21057         l1 = 512;
21058     Copy(message, buf, l1 , char);
21059     /* l1-1 to avoid \n */
21060     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
21061 }
21062
21063 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
21064
21065 #ifndef PERL_IN_XSUB_RE
21066 void
21067 Perl_save_re_context(pTHX)
21068 {
21069     I32 nparens = -1;
21070     I32 i;
21071
21072     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
21073
21074     if (PL_curpm) {
21075         const REGEXP * const rx = PM_GETRE(PL_curpm);
21076         if (rx)
21077             nparens = RX_NPARENS(rx);
21078     }
21079
21080     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
21081      * that PL_curpm will be null, but that utf8.pm and the modules it
21082      * loads will only use $1..$3.
21083      * The t/porting/re_context.t test file checks this assumption.
21084      */
21085     if (nparens == -1)
21086         nparens = 3;
21087
21088     for (i = 1; i <= nparens; i++) {
21089         char digits[TYPE_CHARS(long)];
21090         const STRLEN len = my_snprintf(digits, sizeof(digits),
21091                                        "%lu", (long)i);
21092         GV *const *const gvp
21093             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
21094
21095         if (gvp) {
21096             GV * const gv = *gvp;
21097             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
21098                 save_scalar(gv);
21099         }
21100     }
21101 }
21102 #endif
21103
21104 #ifdef DEBUGGING
21105
21106 STATIC void
21107 S_put_code_point(pTHX_ SV *sv, UV c)
21108 {
21109     PERL_ARGS_ASSERT_PUT_CODE_POINT;
21110
21111     if (c > 255) {
21112         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
21113     }
21114     else if (isPRINT(c)) {
21115         const char string = (char) c;
21116
21117         /* We use {phrase} as metanotation in the class, so also escape literal
21118          * braces */
21119         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
21120             sv_catpvs(sv, "\\");
21121         sv_catpvn(sv, &string, 1);
21122     }
21123     else if (isMNEMONIC_CNTRL(c)) {
21124         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
21125     }
21126     else {
21127         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
21128     }
21129 }
21130
21131 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
21132
21133 STATIC void
21134 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
21135 {
21136     /* Appends to 'sv' a displayable version of the range of code points from
21137      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
21138      * that have them, when they occur at the beginning or end of the range.
21139      * It uses hex to output the remaining code points, unless 'allow_literals'
21140      * is true, in which case the printable ASCII ones are output as-is (though
21141      * some of these will be escaped by put_code_point()).
21142      *
21143      * NOTE:  This is designed only for printing ranges of code points that fit
21144      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
21145      */
21146
21147     const unsigned int min_range_count = 3;
21148
21149     assert(start <= end);
21150
21151     PERL_ARGS_ASSERT_PUT_RANGE;
21152
21153     while (start <= end) {
21154         UV this_end;
21155         const char * format;
21156
21157         if (end - start < min_range_count) {
21158
21159             /* Output chars individually when they occur in short ranges */
21160             for (; start <= end; start++) {
21161                 put_code_point(sv, start);
21162             }
21163             break;
21164         }
21165
21166         /* If permitted by the input options, and there is a possibility that
21167          * this range contains a printable literal, look to see if there is
21168          * one. */
21169         if (allow_literals && start <= MAX_PRINT_A) {
21170
21171             /* If the character at the beginning of the range isn't an ASCII
21172              * printable, effectively split the range into two parts:
21173              *  1) the portion before the first such printable,
21174              *  2) the rest
21175              * and output them separately. */
21176             if (! isPRINT_A(start)) {
21177                 UV temp_end = start + 1;
21178
21179                 /* There is no point looking beyond the final possible
21180                  * printable, in MAX_PRINT_A */
21181                 UV max = MIN(end, MAX_PRINT_A);
21182
21183                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
21184                     temp_end++;
21185                 }
21186
21187                 /* Here, temp_end points to one beyond the first printable if
21188                  * found, or to one beyond 'max' if not.  If none found, make
21189                  * sure that we use the entire range */
21190                 if (temp_end > MAX_PRINT_A) {
21191                     temp_end = end + 1;
21192                 }
21193
21194                 /* Output the first part of the split range: the part that
21195                  * doesn't have printables, with the parameter set to not look
21196                  * for literals (otherwise we would infinitely recurse) */
21197                 put_range(sv, start, temp_end - 1, FALSE);
21198
21199                 /* The 2nd part of the range (if any) starts here. */
21200                 start = temp_end;
21201
21202                 /* We do a continue, instead of dropping down, because even if
21203                  * the 2nd part is non-empty, it could be so short that we want
21204                  * to output it as individual characters, as tested for at the
21205                  * top of this loop.  */
21206                 continue;
21207             }
21208
21209             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
21210              * output a sub-range of just the digits or letters, then process
21211              * the remaining portion as usual. */
21212             if (isALPHANUMERIC_A(start)) {
21213                 UV mask = (isDIGIT_A(start))
21214                            ? _CC_DIGIT
21215                              : isUPPER_A(start)
21216                                ? _CC_UPPER
21217                                : _CC_LOWER;
21218                 UV temp_end = start + 1;
21219
21220                 /* Find the end of the sub-range that includes just the
21221                  * characters in the same class as the first character in it */
21222                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
21223                     temp_end++;
21224                 }
21225                 temp_end--;
21226
21227                 /* For short ranges, don't duplicate the code above to output
21228                  * them; just call recursively */
21229                 if (temp_end - start < min_range_count) {
21230                     put_range(sv, start, temp_end, FALSE);
21231                 }
21232                 else {  /* Output as a range */
21233                     put_code_point(sv, start);
21234                     sv_catpvs(sv, "-");
21235                     put_code_point(sv, temp_end);
21236                 }
21237                 start = temp_end + 1;
21238                 continue;
21239             }
21240
21241             /* We output any other printables as individual characters */
21242             if (isPUNCT_A(start) || isSPACE_A(start)) {
21243                 while (start <= end && (isPUNCT_A(start)
21244                                         || isSPACE_A(start)))
21245                 {
21246                     put_code_point(sv, start);
21247                     start++;
21248                 }
21249                 continue;
21250             }
21251         } /* End of looking for literals */
21252
21253         /* Here is not to output as a literal.  Some control characters have
21254          * mnemonic names.  Split off any of those at the beginning and end of
21255          * the range to print mnemonically.  It isn't possible for many of
21256          * these to be in a row, so this won't overwhelm with output */
21257         if (   start <= end
21258             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
21259         {
21260             while (isMNEMONIC_CNTRL(start) && start <= end) {
21261                 put_code_point(sv, start);
21262                 start++;
21263             }
21264
21265             /* If this didn't take care of the whole range ... */
21266             if (start <= end) {
21267
21268                 /* Look backwards from the end to find the final non-mnemonic
21269                  * */
21270                 UV temp_end = end;
21271                 while (isMNEMONIC_CNTRL(temp_end)) {
21272                     temp_end--;
21273                 }
21274
21275                 /* And separately output the interior range that doesn't start
21276                  * or end with mnemonics */
21277                 put_range(sv, start, temp_end, FALSE);
21278
21279                 /* Then output the mnemonic trailing controls */
21280                 start = temp_end + 1;
21281                 while (start <= end) {
21282                     put_code_point(sv, start);
21283                     start++;
21284                 }
21285                 break;
21286             }
21287         }
21288
21289         /* As a final resort, output the range or subrange as hex. */
21290
21291         this_end = (end < NUM_ANYOF_CODE_POINTS)
21292                     ? end
21293                     : NUM_ANYOF_CODE_POINTS - 1;
21294 #if NUM_ANYOF_CODE_POINTS > 256
21295         format = (this_end < 256)
21296                  ? "\\x%02" UVXf "-\\x%02" UVXf
21297                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
21298 #else
21299         format = "\\x%02" UVXf "-\\x%02" UVXf;
21300 #endif
21301         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
21302         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
21303         GCC_DIAG_RESTORE_STMT;
21304         break;
21305     }
21306 }
21307
21308 STATIC void
21309 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
21310 {
21311     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
21312      * 'invlist' */
21313
21314     UV start, end;
21315     bool allow_literals = TRUE;
21316
21317     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
21318
21319     /* Generally, it is more readable if printable characters are output as
21320      * literals, but if a range (nearly) spans all of them, it's best to output
21321      * it as a single range.  This code will use a single range if all but 2
21322      * ASCII printables are in it */
21323     invlist_iterinit(invlist);
21324     while (invlist_iternext(invlist, &start, &end)) {
21325
21326         /* If the range starts beyond the final printable, it doesn't have any
21327          * in it */
21328         if (start > MAX_PRINT_A) {
21329             break;
21330         }
21331
21332         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
21333          * all but two, the range must start and end no later than 2 from
21334          * either end */
21335         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
21336             if (end > MAX_PRINT_A) {
21337                 end = MAX_PRINT_A;
21338             }
21339             if (start < ' ') {
21340                 start = ' ';
21341             }
21342             if (end - start >= MAX_PRINT_A - ' ' - 2) {
21343                 allow_literals = FALSE;
21344             }
21345             break;
21346         }
21347     }
21348     invlist_iterfinish(invlist);
21349
21350     /* Here we have figured things out.  Output each range */
21351     invlist_iterinit(invlist);
21352     while (invlist_iternext(invlist, &start, &end)) {
21353         if (start >= NUM_ANYOF_CODE_POINTS) {
21354             break;
21355         }
21356         put_range(sv, start, end, allow_literals);
21357     }
21358     invlist_iterfinish(invlist);
21359
21360     return;
21361 }
21362
21363 STATIC SV*
21364 S_put_charclass_bitmap_innards_common(pTHX_
21365         SV* invlist,            /* The bitmap */
21366         SV* posixes,            /* Under /l, things like [:word:], \S */
21367         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
21368         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
21369         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
21370         const bool invert       /* Is the result to be inverted? */
21371 )
21372 {
21373     /* Create and return an SV containing a displayable version of the bitmap
21374      * and associated information determined by the input parameters.  If the
21375      * output would have been only the inversion indicator '^', NULL is instead
21376      * returned. */
21377
21378     SV * output;
21379
21380     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
21381
21382     if (invert) {
21383         output = newSVpvs("^");
21384     }
21385     else {
21386         output = newSVpvs("");
21387     }
21388
21389     /* First, the code points in the bitmap that are unconditionally there */
21390     put_charclass_bitmap_innards_invlist(output, invlist);
21391
21392     /* Traditionally, these have been placed after the main code points */
21393     if (posixes) {
21394         sv_catsv(output, posixes);
21395     }
21396
21397     if (only_utf8 && _invlist_len(only_utf8)) {
21398         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
21399         put_charclass_bitmap_innards_invlist(output, only_utf8);
21400     }
21401
21402     if (not_utf8 && _invlist_len(not_utf8)) {
21403         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
21404         put_charclass_bitmap_innards_invlist(output, not_utf8);
21405     }
21406
21407     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
21408         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
21409         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
21410
21411         /* This is the only list in this routine that can legally contain code
21412          * points outside the bitmap range.  The call just above to
21413          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
21414          * output them here.  There's about a half-dozen possible, and none in
21415          * contiguous ranges longer than 2 */
21416         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21417             UV start, end;
21418             SV* above_bitmap = NULL;
21419
21420             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
21421
21422             invlist_iterinit(above_bitmap);
21423             while (invlist_iternext(above_bitmap, &start, &end)) {
21424                 UV i;
21425
21426                 for (i = start; i <= end; i++) {
21427                     put_code_point(output, i);
21428                 }
21429             }
21430             invlist_iterfinish(above_bitmap);
21431             SvREFCNT_dec_NN(above_bitmap);
21432         }
21433     }
21434
21435     if (invert && SvCUR(output) == 1) {
21436         return NULL;
21437     }
21438
21439     return output;
21440 }
21441
21442 STATIC bool
21443 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
21444                                      char *bitmap,
21445                                      SV *nonbitmap_invlist,
21446                                      SV *only_utf8_locale_invlist,
21447                                      const regnode * const node,
21448                                      const bool force_as_is_display)
21449 {
21450     /* Appends to 'sv' a displayable version of the innards of the bracketed
21451      * character class defined by the other arguments:
21452      *  'bitmap' points to the bitmap, or NULL if to ignore that.
21453      *  'nonbitmap_invlist' is an inversion list of the code points that are in
21454      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
21455      *      none.  The reasons for this could be that they require some
21456      *      condition such as the target string being or not being in UTF-8
21457      *      (under /d), or because they came from a user-defined property that
21458      *      was not resolved at the time of the regex compilation (under /u)
21459      *  'only_utf8_locale_invlist' is an inversion list of the code points that
21460      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
21461      *  'node' is the regex pattern ANYOF node.  It is needed only when the
21462      *      above two parameters are not null, and is passed so that this
21463      *      routine can tease apart the various reasons for them.
21464      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
21465      *      to invert things to see if that leads to a cleaner display.  If
21466      *      FALSE, this routine is free to use its judgment about doing this.
21467      *
21468      * It returns TRUE if there was actually something output.  (It may be that
21469      * the bitmap, etc is empty.)
21470      *
21471      * When called for outputting the bitmap of a non-ANYOF node, just pass the
21472      * bitmap, with the succeeding parameters set to NULL, and the final one to
21473      * FALSE.
21474      */
21475
21476     /* In general, it tries to display the 'cleanest' representation of the
21477      * innards, choosing whether to display them inverted or not, regardless of
21478      * whether the class itself is to be inverted.  However,  there are some
21479      * cases where it can't try inverting, as what actually matches isn't known
21480      * until runtime, and hence the inversion isn't either. */
21481     bool inverting_allowed = ! force_as_is_display;
21482
21483     int i;
21484     STRLEN orig_sv_cur = SvCUR(sv);
21485
21486     SV* invlist;            /* Inversion list we accumulate of code points that
21487                                are unconditionally matched */
21488     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
21489                                UTF-8 */
21490     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
21491                              */
21492     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
21493     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
21494                                        is UTF-8 */
21495
21496     SV* as_is_display;      /* The output string when we take the inputs
21497                                literally */
21498     SV* inverted_display;   /* The output string when we invert the inputs */
21499
21500     U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
21501
21502     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
21503                                                    to match? */
21504     /* We are biased in favor of displaying things without them being inverted,
21505      * as that is generally easier to understand */
21506     const int bias = 5;
21507
21508     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
21509
21510     /* Start off with whatever code points are passed in.  (We clone, so we
21511      * don't change the caller's list) */
21512     if (nonbitmap_invlist) {
21513         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
21514         invlist = invlist_clone(nonbitmap_invlist, NULL);
21515     }
21516     else {  /* Worst case size is every other code point is matched */
21517         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
21518     }
21519
21520     if (flags) {
21521         if (OP(node) == ANYOFD) {
21522
21523             /* This flag indicates that the code points below 0x100 in the
21524              * nonbitmap list are precisely the ones that match only when the
21525              * target is UTF-8 (they should all be non-ASCII). */
21526             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
21527             {
21528                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
21529                 _invlist_subtract(invlist, only_utf8, &invlist);
21530             }
21531
21532             /* And this flag for matching all non-ASCII 0xFF and below */
21533             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
21534             {
21535                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
21536             }
21537         }
21538         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
21539
21540             /* If either of these flags are set, what matches isn't
21541              * determinable except during execution, so don't know enough here
21542              * to invert */
21543             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
21544                 inverting_allowed = FALSE;
21545             }
21546
21547             /* What the posix classes match also varies at runtime, so these
21548              * will be output symbolically. */
21549             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
21550                 int i;
21551
21552                 posixes = newSVpvs("");
21553                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
21554                     if (ANYOF_POSIXL_TEST(node, i)) {
21555                         sv_catpv(posixes, anyofs[i]);
21556                     }
21557                 }
21558             }
21559         }
21560     }
21561
21562     /* Accumulate the bit map into the unconditional match list */
21563     if (bitmap) {
21564         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
21565             if (BITMAP_TEST(bitmap, i)) {
21566                 int start = i++;
21567                 for (;
21568                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
21569                      i++)
21570                 { /* empty */ }
21571                 invlist = _add_range_to_invlist(invlist, start, i-1);
21572             }
21573         }
21574     }
21575
21576     /* Make sure that the conditional match lists don't have anything in them
21577      * that match unconditionally; otherwise the output is quite confusing.
21578      * This could happen if the code that populates these misses some
21579      * duplication. */
21580     if (only_utf8) {
21581         _invlist_subtract(only_utf8, invlist, &only_utf8);
21582     }
21583     if (not_utf8) {
21584         _invlist_subtract(not_utf8, invlist, &not_utf8);
21585     }
21586
21587     if (only_utf8_locale_invlist) {
21588
21589         /* Since this list is passed in, we have to make a copy before
21590          * modifying it */
21591         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
21592
21593         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
21594
21595         /* And, it can get really weird for us to try outputting an inverted
21596          * form of this list when it has things above the bitmap, so don't even
21597          * try */
21598         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21599             inverting_allowed = FALSE;
21600         }
21601     }
21602
21603     /* Calculate what the output would be if we take the input as-is */
21604     as_is_display = put_charclass_bitmap_innards_common(invlist,
21605                                                     posixes,
21606                                                     only_utf8,
21607                                                     not_utf8,
21608                                                     only_utf8_locale,
21609                                                     invert);
21610
21611     /* If have to take the output as-is, just do that */
21612     if (! inverting_allowed) {
21613         if (as_is_display) {
21614             sv_catsv(sv, as_is_display);
21615             SvREFCNT_dec_NN(as_is_display);
21616         }
21617     }
21618     else { /* But otherwise, create the output again on the inverted input, and
21619               use whichever version is shorter */
21620
21621         int inverted_bias, as_is_bias;
21622
21623         /* We will apply our bias to whichever of the the results doesn't have
21624          * the '^' */
21625         if (invert) {
21626             invert = FALSE;
21627             as_is_bias = bias;
21628             inverted_bias = 0;
21629         }
21630         else {
21631             invert = TRUE;
21632             as_is_bias = 0;
21633             inverted_bias = bias;
21634         }
21635
21636         /* Now invert each of the lists that contribute to the output,
21637          * excluding from the result things outside the possible range */
21638
21639         /* For the unconditional inversion list, we have to add in all the
21640          * conditional code points, so that when inverted, they will be gone
21641          * from it */
21642         _invlist_union(only_utf8, invlist, &invlist);
21643         _invlist_union(not_utf8, invlist, &invlist);
21644         _invlist_union(only_utf8_locale, invlist, &invlist);
21645         _invlist_invert(invlist);
21646         _invlist_intersection(invlist, PL_InBitmap, &invlist);
21647
21648         if (only_utf8) {
21649             _invlist_invert(only_utf8);
21650             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
21651         }
21652         else if (not_utf8) {
21653
21654             /* If a code point matches iff the target string is not in UTF-8,
21655              * then complementing the result has it not match iff not in UTF-8,
21656              * which is the same thing as matching iff it is UTF-8. */
21657             only_utf8 = not_utf8;
21658             not_utf8 = NULL;
21659         }
21660
21661         if (only_utf8_locale) {
21662             _invlist_invert(only_utf8_locale);
21663             _invlist_intersection(only_utf8_locale,
21664                                   PL_InBitmap,
21665                                   &only_utf8_locale);
21666         }
21667
21668         inverted_display = put_charclass_bitmap_innards_common(
21669                                             invlist,
21670                                             posixes,
21671                                             only_utf8,
21672                                             not_utf8,
21673                                             only_utf8_locale, invert);
21674
21675         /* Use the shortest representation, taking into account our bias
21676          * against showing it inverted */
21677         if (   inverted_display
21678             && (   ! as_is_display
21679                 || (  SvCUR(inverted_display) + inverted_bias
21680                     < SvCUR(as_is_display)    + as_is_bias)))
21681         {
21682             sv_catsv(sv, inverted_display);
21683         }
21684         else if (as_is_display) {
21685             sv_catsv(sv, as_is_display);
21686         }
21687
21688         SvREFCNT_dec(as_is_display);
21689         SvREFCNT_dec(inverted_display);
21690     }
21691
21692     SvREFCNT_dec_NN(invlist);
21693     SvREFCNT_dec(only_utf8);
21694     SvREFCNT_dec(not_utf8);
21695     SvREFCNT_dec(posixes);
21696     SvREFCNT_dec(only_utf8_locale);
21697
21698     return SvCUR(sv) > orig_sv_cur;
21699 }
21700
21701 #define CLEAR_OPTSTART                                                       \
21702     if (optstart) STMT_START {                                               \
21703         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
21704                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
21705         optstart=NULL;                                                       \
21706     } STMT_END
21707
21708 #define DUMPUNTIL(b,e)                                                       \
21709                     CLEAR_OPTSTART;                                          \
21710                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
21711
21712 STATIC const regnode *
21713 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
21714             const regnode *last, const regnode *plast,
21715             SV* sv, I32 indent, U32 depth)
21716 {
21717     U8 op = PSEUDO;     /* Arbitrary non-END op. */
21718     const regnode *next;
21719     const regnode *optstart= NULL;
21720
21721     RXi_GET_DECL(r, ri);
21722     GET_RE_DEBUG_FLAGS_DECL;
21723
21724     PERL_ARGS_ASSERT_DUMPUNTIL;
21725
21726 #ifdef DEBUG_DUMPUNTIL
21727     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
21728         last ? last-start : 0, plast ? plast-start : 0);
21729 #endif
21730
21731     if (plast && plast < last)
21732         last= plast;
21733
21734     while (PL_regkind[op] != END && (!last || node < last)) {
21735         assert(node);
21736         /* While that wasn't END last time... */
21737         NODE_ALIGN(node);
21738         op = OP(node);
21739         if (op == CLOSE || op == SRCLOSE || op == WHILEM)
21740             indent--;
21741         next = regnext((regnode *)node);
21742
21743         /* Where, what. */
21744         if (OP(node) == OPTIMIZED) {
21745             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
21746                 optstart = node;
21747             else
21748                 goto after_print;
21749         } else
21750             CLEAR_OPTSTART;
21751
21752         regprop(r, sv, node, NULL, NULL);
21753         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
21754                       (int)(2*indent + 1), "", SvPVX_const(sv));
21755
21756         if (OP(node) != OPTIMIZED) {
21757             if (next == NULL)           /* Next ptr. */
21758                 Perl_re_printf( aTHX_  " (0)");
21759             else if (PL_regkind[(U8)op] == BRANCH
21760                      && PL_regkind[OP(next)] != BRANCH )
21761                 Perl_re_printf( aTHX_  " (FAIL)");
21762             else
21763                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
21764             Perl_re_printf( aTHX_ "\n");
21765         }
21766
21767       after_print:
21768         if (PL_regkind[(U8)op] == BRANCHJ) {
21769             assert(next);
21770             {
21771                 const regnode *nnode = (OP(next) == LONGJMP
21772                                        ? regnext((regnode *)next)
21773                                        : next);
21774                 if (last && nnode > last)
21775                     nnode = last;
21776                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
21777             }
21778         }
21779         else if (PL_regkind[(U8)op] == BRANCH) {
21780             assert(next);
21781             DUMPUNTIL(NEXTOPER(node), next);
21782         }
21783         else if ( PL_regkind[(U8)op]  == TRIE ) {
21784             const regnode *this_trie = node;
21785             const char op = OP(node);
21786             const U32 n = ARG(node);
21787             const reg_ac_data * const ac = op>=AHOCORASICK ?
21788                (reg_ac_data *)ri->data->data[n] :
21789                NULL;
21790             const reg_trie_data * const trie =
21791                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
21792 #ifdef DEBUGGING
21793             AV *const trie_words
21794                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
21795 #endif
21796             const regnode *nextbranch= NULL;
21797             I32 word_idx;
21798             SvPVCLEAR(sv);
21799             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
21800                 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
21801
21802                 Perl_re_indentf( aTHX_  "%s ",
21803                     indent+3,
21804                     elem_ptr
21805                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
21806                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
21807                                 PL_colors[0], PL_colors[1],
21808                                 (SvUTF8(*elem_ptr)
21809                                  ? PERL_PV_ESCAPE_UNI
21810                                  : 0)
21811                                 | PERL_PV_PRETTY_ELLIPSES
21812                                 | PERL_PV_PRETTY_LTGT
21813                             )
21814                     : "???"
21815                 );
21816                 if (trie->jump) {
21817                     U16 dist= trie->jump[word_idx+1];
21818                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
21819                                (UV)((dist ? this_trie + dist : next) - start));
21820                     if (dist) {
21821                         if (!nextbranch)
21822                             nextbranch= this_trie + trie->jump[0];
21823                         DUMPUNTIL(this_trie + dist, nextbranch);
21824                     }
21825                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
21826                         nextbranch= regnext((regnode *)nextbranch);
21827                 } else {
21828                     Perl_re_printf( aTHX_  "\n");
21829                 }
21830             }
21831             if (last && next > last)
21832                 node= last;
21833             else
21834                 node= next;
21835         }
21836         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
21837             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
21838                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
21839         }
21840         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
21841             assert(next);
21842             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
21843         }
21844         else if ( op == PLUS || op == STAR) {
21845             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
21846         }
21847         else if (PL_regkind[(U8)op] == EXACT) {
21848             /* Literal string, where present. */
21849             node += NODE_SZ_STR(node) - 1;
21850             node = NEXTOPER(node);
21851         }
21852         else {
21853             node = NEXTOPER(node);
21854             node += regarglen[(U8)op];
21855         }
21856         if (op == CURLYX || op == OPEN || op == SROPEN)
21857             indent++;
21858     }
21859     CLEAR_OPTSTART;
21860 #ifdef DEBUG_DUMPUNTIL
21861     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
21862 #endif
21863     return node;
21864 }
21865
21866 #endif  /* DEBUGGING */
21867
21868 #ifndef PERL_IN_XSUB_RE
21869
21870 #include "uni_keywords.h"
21871
21872 void
21873 Perl_init_uniprops(pTHX)
21874 {
21875     /* Set up the inversion list global variables */
21876
21877     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
21878     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
21879     PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
21880     PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
21881     PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
21882     PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
21883     PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
21884     PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
21885     PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
21886     PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
21887     PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
21888     PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
21889     PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
21890     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
21891     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
21892     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
21893
21894     PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
21895     PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
21896     PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
21897     PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
21898     PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
21899     PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
21900     PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
21901     PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
21902     PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
21903     PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
21904     PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
21905     PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
21906     PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
21907     PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
21908     PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
21909     PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
21910
21911     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
21912     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
21913     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
21914     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
21915     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
21916
21917     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
21918     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
21919     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
21920
21921     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
21922
21923     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
21924     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
21925
21926     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
21927     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
21928
21929     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
21930     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
21931                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
21932     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
21933                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
21934     PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[
21935                                             UNI__PERL_NON_FINAL_FOLDS]);
21936
21937     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
21938     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
21939     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
21940     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
21941     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
21942     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
21943     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
21944
21945 #ifdef UNI_XIDC
21946     /* The below are used only by deprecated functions.  They could be removed */
21947     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
21948     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
21949     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
21950 #endif
21951 }
21952
21953 SV *
21954 Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len,
21955                                 const bool to_fold, bool * invert)
21956 {
21957     /* Parse the interior meat of \p{} passed to this in 'name' with length
21958      * 'name_len', and return an inversion list if a property with 'name' is
21959      * found, or NULL if not.  'name' point to the input with leading and
21960      * trailing space trimmed.  'to_fold' indicates if /i is in effect.
21961      *
21962      * When the return is an inversion list, '*invert' will be set to a boolean
21963      * indicating if it should be inverted or not
21964      *
21965      * This currently doesn't handle all cases.  A NULL return indicates the
21966      * caller should try a different approach
21967      */
21968
21969     char* lookup_name;
21970     bool stricter = FALSE;
21971     bool is_nv_type = FALSE;         /* nv= or numeric_value=, or possibly one
21972                                         of the cjk numeric properties (though
21973                                         it requires extra effort to compile
21974                                         them) */
21975     unsigned int i;
21976     unsigned int j = 0, lookup_len;
21977     int equals_pos = -1;        /* Where the '=' is found, or negative if none */
21978     int slash_pos = -1;        /* Where the '/' is found, or negative if none */
21979     int table_index = 0;
21980     bool starts_with_In_or_Is = FALSE;
21981     Size_t lookup_offset = 0;
21982
21983     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
21984
21985     /* The input will be modified into 'lookup_name' */
21986     Newx(lookup_name, name_len, char);
21987     SAVEFREEPV(lookup_name);
21988
21989     /* Parse the input. */
21990     for (i = 0; i < name_len; i++) {
21991         char cur = name[i];
21992
21993         /* These characters can be freely ignored in most situations.  Later it
21994          * may turn out we shouldn't have ignored them, and we have to reparse,
21995          * but we don't have enough information yet to make that decision */
21996         if (cur == '-' || cur == '_' || isSPACE_A(cur)) {
21997             continue;
21998         }
21999
22000         /* Case differences are also ignored.  Our lookup routine assumes
22001          * everything is lowercase */
22002         if (isUPPER_A(cur)) {
22003             lookup_name[j++] = toLOWER(cur);
22004             continue;
22005         }
22006
22007         /* A double colon is either an error, or a package qualifier to a
22008          * subroutine user-defined property; neither of which do we currently
22009          * handle
22010          *
22011          * But a single colon is a synonym for '=' */
22012         if (cur == ':') {
22013             if (i < name_len - 1 && name[i+1] == ':') {
22014                 return NULL;
22015             }
22016             cur = '=';
22017         }
22018
22019         /* Otherwise, this character is part of the name. */
22020         lookup_name[j++] = cur;
22021
22022         /* Only the equals sign needs further processing */
22023         if (cur == '=') {
22024             equals_pos = j; /* Note where it occurred in the input */
22025             break;
22026         }
22027     }
22028
22029     /* Here, we are either done with the whole property name, if it was simple;
22030      * or are positioned just after the '=' if it is compound. */
22031
22032     if (equals_pos >= 0) {
22033         assert(! stricter); /* We shouldn't have set this yet */
22034
22035         /* Space immediately after the '=' is ignored */
22036         i++;
22037         for (; i < name_len; i++) {
22038             if (! isSPACE_A(name[i])) {
22039                 break;
22040             }
22041         }
22042
22043         /* Certain properties need special handling.  They may optionally be
22044          * prefixed by 'is'.  Ignore that prefix for the purposes of checking
22045          * if this is one of those properties */
22046         if (memBEGINPs(lookup_name, name_len, "is")) {
22047             lookup_offset = 2;
22048         }
22049
22050         /* Then check if it is one of these properties.  This is hard-coded
22051          * because easier this way, and the list is unlikely to change.  There
22052          * are several properties like this in the Unihan DB, which is unlikely
22053          * to be compiled, and they all end with 'numeric'.  The interiors
22054          * aren't checked for the precise property.  This would stop working if
22055          * a cjk property were to be created that ended with 'numeric' and
22056          * wasn't a numeric type */
22057         is_nv_type = memEQs(lookup_name + lookup_offset,
22058                        j - 1 - lookup_offset, "numericvalue")
22059                   || memEQs(lookup_name + lookup_offset,
22060                       j - 1 - lookup_offset, "nv")
22061                   || (   memENDPs(lookup_name + lookup_offset,
22062                             j - 1 - lookup_offset, "numeric")
22063                       && (   memBEGINPs(lookup_name + lookup_offset,
22064                                       j - 1 - lookup_offset, "cjk")
22065                           || memBEGINPs(lookup_name + lookup_offset,
22066                                       j - 1 - lookup_offset, "k")));
22067         if (   is_nv_type
22068             || memEQs(lookup_name + lookup_offset,
22069                       j - 1 - lookup_offset, "canonicalcombiningclass")
22070             || memEQs(lookup_name + lookup_offset,
22071                       j - 1 - lookup_offset, "ccc")
22072             || memEQs(lookup_name + lookup_offset,
22073                       j - 1 - lookup_offset, "age")
22074             || memEQs(lookup_name + lookup_offset,
22075                       j - 1 - lookup_offset, "in")
22076             || memEQs(lookup_name + lookup_offset,
22077                       j - 1 - lookup_offset, "presentin"))
22078         {
22079             unsigned int k;
22080
22081             /* What makes these properties special is that the stuff after the
22082              * '=' is a number.  Therefore, we can't throw away '-'
22083              * willy-nilly, as those could be a minus sign.  Other stricter
22084              * rules also apply.  However, these properties all can have the
22085              * rhs not be a number, in which case they contain at least one
22086              * alphabetic.  In those cases, the stricter rules don't apply.
22087              * But the numeric type properties can have the alphas [Ee] to
22088              * signify an exponent, and it is still a number with stricter
22089              * rules.  So look for an alpha that signifys not-strict */
22090             stricter = TRUE;
22091             for (k = i; k < name_len; k++) {
22092                 if (   isALPHA_A(name[k])
22093                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
22094                 {
22095                     stricter = FALSE;
22096                     break;
22097                 }
22098             }
22099         }
22100
22101         if (stricter) {
22102
22103             /* A number may have a leading '+' or '-'.  The latter is retained
22104              * */
22105             if (name[i] == '+') {
22106                 i++;
22107             }
22108             else if (name[i] == '-') {
22109                 lookup_name[j++] = '-';
22110                 i++;
22111             }
22112
22113             /* Skip leading zeros including single underscores separating the
22114              * zeros, or between the final leading zero and the first other
22115              * digit */
22116             for (; i < name_len - 1; i++) {
22117                 if (   name[i] != '0'
22118                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
22119                 {
22120                     break;
22121                 }
22122             }
22123         }
22124     }
22125     else {  /* No '=' */
22126
22127        /* We are now in a position to determine if this property should have
22128         * been parsed using stricter rules.  Only a few are like that, and
22129         * unlikely to change. */
22130         if (   memBEGINPs(lookup_name, j, "perl")
22131             && memNEs(lookup_name + 4, j - 4, "space")
22132             && memNEs(lookup_name + 4, j - 4, "word"))
22133         {
22134             stricter = TRUE;
22135
22136             /* We set the inputs back to 0 and the code below will reparse,
22137              * using strict */
22138             i = j = 0;
22139         }
22140     }
22141
22142     /* Here, we have either finished the property, or are positioned to parse
22143      * the remainder, and we know if stricter rules apply.  Finish out, if not
22144      * already done */
22145     for (; i < name_len; i++) {
22146         char cur = name[i];
22147
22148         /* In all instances, case differences are ignored, and we normalize to
22149          * lowercase */
22150         if (isUPPER_A(cur)) {
22151             lookup_name[j++] = toLOWER(cur);
22152             continue;
22153         }
22154
22155         /* An underscore is skipped, but not under strict rules unless it
22156          * separates two digits */
22157         if (cur == '_') {
22158             if (    stricter
22159                 && (     i == 0 || (int) i == equals_pos || i == name_len- 1
22160                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
22161             {
22162                 lookup_name[j++] = '_';
22163             }
22164             continue;
22165         }
22166
22167         /* Hyphens are skipped except under strict */
22168         if (cur == '-' && ! stricter) {
22169             continue;
22170         }
22171
22172         /* XXX Bug in documentation.  It says white space skipped adjacent to
22173          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
22174          * in a number */
22175         if (isSPACE_A(cur) && ! stricter) {
22176             continue;
22177         }
22178
22179         lookup_name[j++] = cur;
22180
22181         /* Unless this is a non-trailing slash, we are done with it */
22182         if (i >= name_len - 1 || cur != '/') {
22183             continue;
22184         }
22185
22186         slash_pos = j;
22187
22188         /* A slash in the 'numeric value' property indicates that what follows
22189          * is a denominator.  It can have a leading '+' and '0's that should be
22190          * skipped.  But we have never allowed a negative denominator, so treat
22191          * a minus like every other character.  (No need to rule out a second
22192          * '/', as that won't match anything anyway */
22193         if (is_nv_type) {
22194             i++;
22195             if (i < name_len && name[i] == '+') {
22196                 i++;
22197             }
22198
22199             /* Skip leading zeros including underscores separating digits */
22200             for (; i < name_len - 1; i++) {
22201                 if (   name[i] != '0'
22202                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
22203                 {
22204                     break;
22205                 }
22206             }
22207
22208             /* Store the first real character in the denominator */
22209             lookup_name[j++] = name[i];
22210         }
22211     }
22212
22213     /* Here are completely done parsing the input 'name', and 'lookup_name'
22214      * contains a copy, normalized.
22215      *
22216      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
22217      * different from without the underscores.  */
22218     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
22219            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
22220         && UNLIKELY(name[name_len-1] == '_'))
22221     {
22222         lookup_name[j++] = '&';
22223     }
22224     else if (name_len > 2 && name[0] == 'I' && (   name[1] == 'n'
22225                                                 || name[1] == 's'))
22226     {
22227
22228         /* Also, if the original input began with 'In' or 'Is', it could be a
22229          * subroutine call instead of a property names, which currently isn't
22230          * handled by this function.  Subroutine calls can't happen if there is
22231          * an '=' in the name */
22232         if (equals_pos < 0 && get_cvn_flags(name, name_len, GV_NOTQUAL) != NULL)
22233         {
22234             return NULL;
22235         }
22236
22237         starts_with_In_or_Is = TRUE;
22238     }
22239
22240     lookup_len = j;     /* Use a more mnemonic name starting here */
22241
22242     /* Get the index into our pointer table of the inversion list corresponding
22243      * to the property */
22244     table_index = match_uniprop((U8 *) lookup_name, lookup_len);
22245
22246     /* If it didn't find the property */
22247     if (table_index == 0) {
22248
22249         /* If didn't find the property, we try again stripping off any initial
22250          * 'In' or 'Is' */
22251         if (starts_with_In_or_Is) {
22252             lookup_name += 2;
22253             lookup_len -= 2;
22254             equals_pos -= 2;
22255             slash_pos -= 2;
22256
22257             table_index = match_uniprop((U8 *) lookup_name, lookup_len);
22258         }
22259
22260         if (table_index == 0) {
22261             char * canonical;
22262
22263             /* If not found, and not a numeric type property, isn't a legal
22264              * property */
22265             if (! is_nv_type) {
22266                 return NULL;
22267             }
22268
22269             /* But the numeric type properties need more work to decide.  What
22270              * we do is make sure we have the number in canonical form and look
22271              * that up. */
22272
22273             if (slash_pos < 0) {    /* No slash */
22274
22275                 /* When it isn't a rational, take the input, convert it to a
22276                  * NV, then create a canonical string representation of that
22277                  * NV. */
22278
22279                 NV value;
22280
22281                 /* Get the value */
22282                 if (my_atof3(lookup_name + equals_pos, &value,
22283                              lookup_len - equals_pos)
22284                           != lookup_name + lookup_len)
22285                 {
22286                     return NULL;
22287                 }
22288
22289                 /* If the value is an integer, the canonical value is integral */
22290                 if (Perl_ceil(value) == value) {
22291                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
22292                                                 equals_pos, lookup_name, value);
22293                 }
22294                 else {  /* Otherwise, it is %e with a known precision */
22295                     char * exp_ptr;
22296
22297                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
22298                                                 equals_pos, lookup_name,
22299                                                 PL_E_FORMAT_PRECISION, value);
22300
22301                     /* The exponent generated is expecting two digits, whereas
22302                      * %e on some systems will generate three.  Remove leading
22303                      * zeros in excess of 2 from the exponent.  We start
22304                      * looking for them after the '=' */
22305                     exp_ptr = strchr(canonical + equals_pos, 'e');
22306                     if (exp_ptr) {
22307                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
22308                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
22309
22310                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
22311
22312                         if (excess_exponent_len > 0) {
22313                             SSize_t leading_zeros = strspn(cur_ptr, "0");
22314                             SSize_t excess_leading_zeros
22315                                     = MIN(leading_zeros, excess_exponent_len);
22316                             if (excess_leading_zeros > 0) {
22317                                 Move(cur_ptr + excess_leading_zeros,
22318                                      cur_ptr,
22319                                      strlen(cur_ptr) - excess_leading_zeros
22320                                        + 1,  /* Copy the NUL as well */
22321                                      char);
22322                             }
22323                         }
22324                     }
22325                 }
22326             }
22327             else {  /* Has a slash.  Create a rational in canonical form  */
22328                 UV numerator, denominator, gcd, trial;
22329                 const char * end_ptr;
22330                 const char * sign = "";
22331
22332                 /* We can't just find the numerator, denominator, and do the
22333                  * division, then use the method above, because that is
22334                  * inexact.  And the input could be a rational that is within
22335                  * epsilon (given our precision) of a valid rational, and would
22336                  * then incorrectly compare valid.
22337                  *
22338                  * We're only interested in the part after the '=' */
22339                 const char * this_lookup_name = lookup_name + equals_pos;
22340                 lookup_len -= equals_pos;
22341                 slash_pos -= equals_pos;
22342
22343                 /* Handle any leading minus */
22344                 if (this_lookup_name[0] == '-') {
22345                     sign = "-";
22346                     this_lookup_name++;
22347                     lookup_len--;
22348                     slash_pos--;
22349                 }
22350
22351                 /* Convert the numerator to numeric */
22352                 end_ptr = this_lookup_name + slash_pos;
22353                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
22354                     return NULL;
22355                 }
22356
22357                 /* It better have included all characters before the slash */
22358                 if (*end_ptr != '/') {
22359                     return NULL;
22360                 }
22361
22362                 /* Set to look at just the denominator */
22363                 this_lookup_name += slash_pos;
22364                 lookup_len -= slash_pos;
22365                 end_ptr = this_lookup_name + lookup_len;
22366
22367                 /* Convert the denominator to numeric */
22368                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
22369                     return NULL;
22370                 }
22371
22372                 /* It better be the rest of the characters, and don't divide by
22373                  * 0 */
22374                 if (   end_ptr != this_lookup_name + lookup_len
22375                     || denominator == 0)
22376                 {
22377                     return NULL;
22378                 }
22379
22380                 /* Get the greatest common denominator using
22381                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
22382                 gcd = numerator;
22383                 trial = denominator;
22384                 while (trial != 0) {
22385                     UV temp = trial;
22386                     trial = gcd % trial;
22387                     gcd = temp;
22388                 }
22389
22390                 /* If already in lowest possible terms, we have already tried
22391                  * looking this up */
22392                 if (gcd == 1) {
22393                     return NULL;
22394                 }
22395
22396                 /* Reduce the rational, which should put it in canonical form.
22397                  * Then look it up */
22398                 numerator /= gcd;
22399                 denominator /= gcd;
22400
22401                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
22402                         equals_pos, lookup_name, sign, numerator, denominator);
22403             }
22404
22405             /* Here, we have the number in canonical form.  Try that */
22406             table_index = match_uniprop((U8 *) canonical, strlen(canonical));
22407             if (table_index == 0) {
22408                 return NULL;
22409             }
22410         }
22411     }
22412
22413     /* The return is an index into a table of ptrs.  A negative return
22414      * signifies that the real index is the absolute value, but the result
22415      * needs to be inverted */
22416     if (table_index < 0) {
22417         *invert = TRUE;
22418         table_index = -table_index;
22419     }
22420     else {
22421         *invert = FALSE;
22422     }
22423
22424     /* Out-of band indices indicate a deprecated property.  The proper index is
22425      * modulo it with the table size.  And dividing by the table size yields
22426      * an offset into a table constructed to contain the corresponding warning
22427      * message */
22428     if (table_index > MAX_UNI_KEYWORD_INDEX) {
22429         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
22430         table_index %= MAX_UNI_KEYWORD_INDEX;
22431         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
22432                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
22433                 (int) name_len, name, deprecated_property_msgs[warning_offset]);
22434     }
22435
22436     /* In a few properties, a different property is used under /i.  These are
22437      * unlikely to change, so are hard-coded here. */
22438     if (to_fold) {
22439         if (   table_index == UNI_XPOSIXUPPER
22440             || table_index == UNI_XPOSIXLOWER
22441             || table_index == UNI_TITLE)
22442         {
22443             table_index = UNI_CASED;
22444         }
22445         else if (   table_index == UNI_UPPERCASELETTER
22446                  || table_index == UNI_LOWERCASELETTER
22447 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
22448                  || table_index == UNI_TITLECASELETTER
22449 #  endif
22450         ) {
22451             table_index = UNI_CASEDLETTER;
22452         }
22453         else if (  table_index == UNI_POSIXUPPER
22454                 || table_index == UNI_POSIXLOWER)
22455         {
22456             table_index = UNI_POSIXALPHA;
22457         }
22458     }
22459
22460     /* Create and return the inversion list */
22461     return _new_invlist_C_array(uni_prop_ptrs[table_index]);
22462 }
22463
22464 #endif
22465
22466 /*
22467  * ex: set ts=8 sts=4 sw=4 et:
22468  */