regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_inline.h"
  90 #include "invlist_inline.h"
  91 #include "unicode_constants.h"
  92
  93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  94  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  96  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99
 100 #ifndef STATIC
 101 #define STATIC  static
 102 #endif
 103
 104 /* this is a chain of data about sub patterns we are processing that
 105    need to be handled separately/specially in study_chunk. Its so
 106    we can simulate recursion without losing state.  */
 107 struct scan_frame;
 108 typedef struct scan_frame {
 109     regnode *last_regnode;      /* last node to process in this frame */
 110     regnode *next_regnode;      /* next node to process when last is reached */
 111     U32 prev_recursed_depth;
 112     I32 stopparen;              /* what stopparen do we use */
 113
 114     struct scan_frame *this_prev_frame; /* this previous frame */
 115     struct scan_frame *prev_frame;      /* previous frame */
 116     struct scan_frame *next_frame;      /* next frame */
 117 } scan_frame;
 118
 119 /* Certain characters are output as a sequence with the first being a
 120  * backslash. */
 121 #define isBACKSLASHED_PUNCT(c)  strchr("-[]\\^", c)
 122
 123
 124 struct RExC_state_t {
 125     U32         flags;                  /* RXf_* are we folding, multilining? */
 126     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 127     char        *precomp;               /* uncompiled string. */
 128     char        *precomp_end;           /* pointer to end of uncompiled string. */
 129     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 130     regexp      *rx;                    /* perl core regexp structure */
 131     regexp_internal     *rxi;           /* internal data for regexp object
 132                                            pprivate field */
 133     char        *start;                 /* Start of input for compile */
 134     char        *end;                   /* End of input for compile */
 135     char        *parse;                 /* Input-scan pointer. */
 136     char        *copy_start;            /* start of copy of input within
 137                                            constructed parse string */
 138     char        *copy_start_in_input;   /* Position in input string
 139                                            corresponding to copy_start */
 140     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 141     regnode     *emit_start;            /* Start of emitted-code area */
 142     regnode_offset emit;                /* Code-emit pointer */
 143     I32         naughty;                /* How bad is this pattern? */
 144     I32         sawback;                /* Did we see \1, ...? */
 145     U32         seen;
 146     SSize_t     size;                   /* Number of regnode equivalents in
 147                                            pattern */
 148
 149     /* position beyond 'precomp' of the warning message furthest away from
 150      * 'precomp'.  During the parse, no warnings are raised for any problems
 151      * earlier in the parse than this position.  This works if warnings are
 152      * raised the first time a given spot is parsed, and if only one
 153      * independent warning is raised for any given spot */
 154     Size_t      latest_warn_offset;
 155
 156     I32         npar;                   /* Capture buffer count so far in the
 157                                            parse, (OPEN) plus one. ("par" 0 is
 158                                            the whole pattern)*/
 159     I32         total_par;              /* During initial parse, is either 0,
 160                                            or -1; the latter indicating a
 161                                            reparse is needed.  After that pass,
 162                                            it is what 'npar' became after the
 163                                            pass.  Hence, it being > 0 indicates
 164                                            we are in a reparse situation */
 165     I32         nestroot;               /* root parens we are in - used by
 166                                            accept */
 167     I32         seen_zerolen;
 168     regnode_offset *open_parens;        /* offsets to open parens */
 169     regnode_offset *close_parens;       /* offsets to close parens */
 170     regnode     *end_op;                /* END node in program */
 171     I32         utf8;           /* whether the pattern is utf8 or not */
 172     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 173                                 /* XXX use this for future optimisation of case
 174                                  * where pattern must be upgraded to utf8. */
 175     I32         uni_semantics;  /* If a d charset modifier should use unicode
 176                                    rules, even if the pattern is not in
 177                                    utf8 */
 178     HV          *paren_names;           /* Paren names */
 179
 180     regnode     **recurse;              /* Recurse regops */
 181     I32         recurse_count;          /* Number of recurse regops we have generated */
 182     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 183                                            through */
 184     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 185     I32         in_lookbehind;
 186     I32         contains_locale;
 187     I32         override_recoding;
 188 #ifdef EBCDIC
 189     I32         recode_x_to_native;
 190 #endif
 191     I32         in_multi_char_class;
 192     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
 193                                             within pattern */
 194     int         code_index;             /* next code_blocks[] slot */
 195     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 196     scan_frame *frame_head;
 197     scan_frame *frame_last;
 198     U32         frame_count;
 199     AV         *warn_text;
 200 #ifdef ADD_TO_REGEXEC
 201     char        *starttry;              /* -Dr: where regtry was called. */
 202 #define RExC_starttry   (pRExC_state->starttry)
 203 #endif
 204     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 205 #ifdef DEBUGGING
 206     const char  *lastparse;
 207     I32         lastnum;
 208     AV          *paren_name_list;       /* idx -> name */
 209     U32         study_chunk_recursed_count;
 210     SV          *mysv1;
 211     SV          *mysv2;
 212
 213 #define RExC_lastparse  (pRExC_state->lastparse)
 214 #define RExC_lastnum    (pRExC_state->lastnum)
 215 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 216 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 217 #define RExC_mysv       (pRExC_state->mysv1)
 218 #define RExC_mysv1      (pRExC_state->mysv1)
 219 #define RExC_mysv2      (pRExC_state->mysv2)
 220
 221 #endif
 222     bool        seen_d_op;
 223     bool        strict;
 224     bool        study_started;
 225     bool        in_script_run;
 226     bool        use_BRANCHJ;
 227 };
 228
 229 #define RExC_flags      (pRExC_state->flags)
 230 #define RExC_pm_flags   (pRExC_state->pm_flags)
 231 #define RExC_precomp    (pRExC_state->precomp)
 232 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
 233 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
 234 #define RExC_precomp_end (pRExC_state->precomp_end)
 235 #define RExC_rx_sv      (pRExC_state->rx_sv)
 236 #define RExC_rx         (pRExC_state->rx)
 237 #define RExC_rxi        (pRExC_state->rxi)
 238 #define RExC_start      (pRExC_state->start)
 239 #define RExC_end        (pRExC_state->end)
 240 #define RExC_parse      (pRExC_state->parse)
 241 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
 242 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 243 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
 244                                                    under /d from /u ? */
 245
 246
 247 #ifdef RE_TRACK_PATTERN_OFFSETS
 248 #  define RExC_offsets  (RExC_rxi->u.offsets) /* I am not like the
 249                                                          others */
 250 #endif
 251 #define RExC_emit       (pRExC_state->emit)
 252 #define RExC_emit_start (pRExC_state->emit_start)
 253 #define RExC_sawback    (pRExC_state->sawback)
 254 #define RExC_seen       (pRExC_state->seen)
 255 #define RExC_size       (pRExC_state->size)
 256 #define RExC_maxlen        (pRExC_state->maxlen)
 257 #define RExC_npar       (pRExC_state->npar)
 258 #define RExC_total_parens       (pRExC_state->total_par)
 259 #define RExC_nestroot   (pRExC_state->nestroot)
 260 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 261 #define RExC_utf8       (pRExC_state->utf8)
 262 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 263 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 264 #define RExC_open_parens        (pRExC_state->open_parens)
 265 #define RExC_close_parens       (pRExC_state->close_parens)
 266 #define RExC_end_op     (pRExC_state->end_op)
 267 #define RExC_paren_names        (pRExC_state->paren_names)
 268 #define RExC_recurse    (pRExC_state->recurse)
 269 #define RExC_recurse_count      (pRExC_state->recurse_count)
 270 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 271 #define RExC_study_chunk_recursed_bytes  \
 272                                    (pRExC_state->study_chunk_recursed_bytes)
 273 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 274 #define RExC_contains_locale    (pRExC_state->contains_locale)
 275 #ifdef EBCDIC
 276 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 277 #endif
 278 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 279 #define RExC_frame_head (pRExC_state->frame_head)
 280 #define RExC_frame_last (pRExC_state->frame_last)
 281 #define RExC_frame_count (pRExC_state->frame_count)
 282 #define RExC_strict (pRExC_state->strict)
 283 #define RExC_study_started      (pRExC_state->study_started)
 284 #define RExC_warn_text (pRExC_state->warn_text)
 285 #define RExC_in_script_run      (pRExC_state->in_script_run)
 286 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
 287
 288 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 289  * a flag to disable back-off on the fixed/floating substrings - if it's
 290  * a high complexity pattern we assume the benefit of avoiding a full match
 291  * is worth the cost of checking for the substrings even if they rarely help.
 292  */
 293 #define RExC_naughty    (pRExC_state->naughty)
 294 #define TOO_NAUGHTY (10)
 295 #define MARK_NAUGHTY(add) \
 296     if (RExC_naughty < TOO_NAUGHTY) \
 297         RExC_naughty += (add)
 298 #define MARK_NAUGHTY_EXP(exp, add) \
 299     if (RExC_naughty < TOO_NAUGHTY) \
 300         RExC_naughty += RExC_naughty / (exp) + (add)
 301
 302 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 303 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 304         ((*s) == '{' && regcurly(s)))
 305
 306 /*
 307  * Flags to be passed up and down.
 308  */
 309 #define WORST           0       /* Worst case. */
 310 #define HASWIDTH        0x01    /* Known to not match null strings, could match
 311                                    non-null ones. */
 312
 313 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 314  * character.  (There needs to be a case: in the switch statement in regexec.c
 315  * for any node marked SIMPLE.)  Note that this is not the same thing as
 316  * REGNODE_SIMPLE */
 317 #define SIMPLE          0x02
 318 #define SPSTART         0x04    /* Starts with * or + */
 319 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 320 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 321 #define RESTART_PARSE   0x20    /* Need to redo the parse */
 322 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
 323                                    calcuate sizes as UTF-8 */
 324
 325 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 326
 327 /* whether trie related optimizations are enabled */
 328 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 329 #define TRIE_STUDY_OPT
 330 #define FULL_TRIE_STUDY
 331 #define TRIE_STCLASS
 332 #endif
 333
 334
 335
 336 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 337 #define PBITVAL(paren) (1 << ((paren) & 7))
 338 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 339 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 340 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 341
 342 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 343                                      if (!UTF) {                           \
 344                                          *flagp = RESTART_PARSE|NEED_UTF8; \
 345                                          return 0;                         \
 346                                      }                                     \
 347                              } STMT_END
 348
 349 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
 350  * a flag that indicates we need to override /d with /u as a result of
 351  * something in the pattern.  It should only be used in regards to calling
 352  * set_regex_charset() or get_regex_charse() */
 353 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 354     STMT_START {                                                            \
 355             if (DEPENDS_SEMANTICS) {                                        \
 356                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 357                 RExC_uni_semantics = 1;                                     \
 358                 if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) {     \
 359                     /* No need to restart the parse if we haven't seen      \
 360                      * anything that differs between /u and /d, and no need \
 361                      * to restart immediately if we're going to reparse     \
 362                      * anyway to count parens */                            \
 363                     *flagp |= RESTART_PARSE;                                \
 364                     return restart_retval;                                  \
 365                 }                                                           \
 366             }                                                               \
 367     } STMT_END
 368
 369 #define BRANCH_MAX_OFFSET   U16_MAX
 370 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
 371     STMT_START {                                                            \
 372                 RExC_use_BRANCHJ = 1;                                       \
 373                 if (LIKELY(RExC_total_parens >= 0)) {                       \
 374                     /* No need to restart the parse immediately if we're    \
 375                      * going to reparse anyway to count parens */           \
 376                     *flagp |= RESTART_PARSE;                                \
 377                     return restart_retval;                                  \
 378                 }                                                           \
 379     } STMT_END
 380
 381 #define REQUIRE_PARENS_PASS                                                 \
 382     STMT_START {                                                            \
 383                     if (RExC_total_parens == 0) RExC_total_parens = -1;     \
 384     } STMT_END
 385
 386 /* This is used to return failure (zero) early from the calling function if
 387  * various flags in 'flags' are set.  Two flags always cause a return:
 388  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
 389  * additional flags that should cause a return; 0 if none.  If the return will
 390  * be done, '*flagp' is first set to be all of the flags that caused the
 391  * return. */
 392 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
 393     STMT_START {                                                            \
 394             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
 395                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
 396                 return 0;                                                   \
 397             }                                                               \
 398     } STMT_END
 399
 400 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
 401
 402 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
 403                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
 404 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
 405                                     if (MUST_RESTART(*(flagp))) return 0
 406
 407 /* This converts the named class defined in regcomp.h to its equivalent class
 408  * number defined in handy.h. */
 409 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 410 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 411
 412 #define _invlist_union_complement_2nd(a, b, output) \
 413                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 414 #define _invlist_intersection_complement_2nd(a, b, output) \
 415                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 416
 417 /* About scan_data_t.
 418
 419   During optimisation we recurse through the regexp program performing
 420   various inplace (keyhole style) optimisations. In addition study_chunk
 421   and scan_commit populate this data structure with information about
 422   what strings MUST appear in the pattern. We look for the longest
 423   string that must appear at a fixed location, and we look for the
 424   longest string that may appear at a floating location. So for instance
 425   in the pattern:
 426
 427     /FOO[xX]A.*B[xX]BAR/
 428
 429   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 430   strings (because they follow a .* construct). study_chunk will identify
 431   both FOO and BAR as being the longest fixed and floating strings respectively.
 432
 433   The strings can be composites, for instance
 434
 435      /(f)(o)(o)/
 436
 437   will result in a composite fixed substring 'foo'.
 438
 439   For each string some basic information is maintained:
 440
 441   - min_offset
 442     This is the position the string must appear at, or not before.
 443     It also implicitly (when combined with minlenp) tells us how many
 444     characters must match before the string we are searching for.
 445     Likewise when combined with minlenp and the length of the string it
 446     tells us how many characters must appear after the string we have
 447     found.
 448
 449   - max_offset
 450     Only used for floating strings. This is the rightmost point that
 451     the string can appear at. If set to SSize_t_MAX it indicates that the
 452     string can occur infinitely far to the right.
 453     For fixed strings, it is equal to min_offset.
 454
 455   - minlenp
 456     A pointer to the minimum number of characters of the pattern that the
 457     string was found inside. This is important as in the case of positive
 458     lookahead or positive lookbehind we can have multiple patterns
 459     involved. Consider
 460
 461     /(?=FOO).*F/
 462
 463     The minimum length of the pattern overall is 3, the minimum length
 464     of the lookahead part is 3, but the minimum length of the part that
 465     will actually match is 1. So 'FOO's minimum length is 3, but the
 466     minimum length for the F is 1. This is important as the minimum length
 467     is used to determine offsets in front of and behind the string being
 468     looked for.  Since strings can be composites this is the length of the
 469     pattern at the time it was committed with a scan_commit. Note that
 470     the length is calculated by study_chunk, so that the minimum lengths
 471     are not known until the full pattern has been compiled, thus the
 472     pointer to the value.
 473
 474   - lookbehind
 475
 476     In the case of lookbehind the string being searched for can be
 477     offset past the start point of the final matching string.
 478     If this value was just blithely removed from the min_offset it would
 479     invalidate some of the calculations for how many chars must match
 480     before or after (as they are derived from min_offset and minlen and
 481     the length of the string being searched for).
 482     When the final pattern is compiled and the data is moved from the
 483     scan_data_t structure into the regexp structure the information
 484     about lookbehind is factored in, with the information that would
 485     have been lost precalculated in the end_shift field for the
 486     associated string.
 487
 488   The fields pos_min and pos_delta are used to store the minimum offset
 489   and the delta to the maximum offset at the current point in the pattern.
 490
 491 */
 492
 493 struct scan_data_substrs {
 494     SV      *str;       /* longest substring found in pattern */
 495     SSize_t min_offset; /* earliest point in string it can appear */
 496     SSize_t max_offset; /* latest point in string it can appear */
 497     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
 498     SSize_t lookbehind; /* is the pos of the string modified by LB */
 499     I32 flags;          /* per substring SF_* and SCF_* flags */
 500 };
 501
 502 typedef struct scan_data_t {
 503     /*I32 len_min;      unused */
 504     /*I32 len_delta;    unused */
 505     SSize_t pos_min;
 506     SSize_t pos_delta;
 507     SV *last_found;
 508     SSize_t last_end;       /* min value, <0 unless valid. */
 509     SSize_t last_start_min;
 510     SSize_t last_start_max;
 511     U8      cur_is_floating; /* whether the last_* values should be set as
 512                               * the next fixed (0) or floating (1)
 513                               * substring */
 514
 515     /* [0] is longest fixed substring so far, [1] is longest float so far */
 516     struct scan_data_substrs  substrs[2];
 517
 518     I32 flags;             /* common SF_* and SCF_* flags */
 519     I32 whilem_c;
 520     SSize_t *last_closep;
 521     regnode_ssc *start_class;
 522 } scan_data_t;
 523
 524 /*
 525  * Forward declarations for pregcomp()'s friends.
 526  */
 527
 528 static const scan_data_t zero_scan_data = {
 529     0, 0, NULL, 0, 0, 0, 0,
 530     {
 531         { NULL, 0, 0, 0, 0, 0 },
 532         { NULL, 0, 0, 0, 0, 0 },
 533     },
 534     0, 0, NULL, NULL
 535 };
 536
 537 /* study flags */
 538
 539 #define SF_BEFORE_SEOL          0x0001
 540 #define SF_BEFORE_MEOL          0x0002
 541 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 542
 543 #define SF_IS_INF               0x0040
 544 #define SF_HAS_PAR              0x0080
 545 #define SF_IN_PAR               0x0100
 546 #define SF_HAS_EVAL             0x0200
 547
 548
 549 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 550  * longest substring in the pattern. When it is not set the optimiser keeps
 551  * track of position, but does not keep track of the actual strings seen,
 552  *
 553  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 554  * /foo/i will not.
 555  *
 556  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 557  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 558  * turned off because of the alternation (BRANCH). */
 559 #define SCF_DO_SUBSTR           0x0400
 560
 561 #define SCF_DO_STCLASS_AND      0x0800
 562 #define SCF_DO_STCLASS_OR       0x1000
 563 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 564 #define SCF_WHILEM_VISITED_POS  0x2000
 565
 566 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 567 #define SCF_SEEN_ACCEPT         0x8000
 568 #define SCF_TRIE_DOING_RESTUDY 0x10000
 569 #define SCF_IN_DEFINE          0x20000
 570
 571
 572
 573
 574 #define UTF cBOOL(RExC_utf8)
 575
 576 /* The enums for all these are ordered so things work out correctly */
 577 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 578 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 579                                                      == REGEX_DEPENDS_CHARSET)
 580 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 581 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 582                                                      >= REGEX_UNICODE_CHARSET)
 583 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 584                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 585 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 586                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 587 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 588                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 589
 590 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 591
 592 /* For programs that want to be strictly Unicode compatible by dying if any
 593  * attempt is made to match a non-Unicode code point against a Unicode
 594  * property.  */
 595 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 596
 597 #define OOB_NAMEDCLASS          -1
 598
 599 /* There is no code point that is out-of-bounds, so this is problematic.  But
 600  * its only current use is to initialize a variable that is always set before
 601  * looked at. */
 602 #define OOB_UNICODE             0xDEADBEEF
 603
 604 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 605
 606
 607 /* length of regex to show in messages that don't mark a position within */
 608 #define RegexLengthToShowInErrorMessages 127
 609
 610 /*
 611  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 612  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 613  * op/pragma/warn/regcomp.
 614  */
 615 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 616 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 617
 618 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 619                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
 620
 621 /* The code in this file in places uses one level of recursion with parsing
 622  * rebased to an alternate string constructed by us in memory.  This can take
 623  * the form of something that is completely different from the input, or
 624  * something that uses the input as part of the alternate.  In the first case,
 625  * there should be no possibility of an error, as we are in complete control of
 626  * the alternate string.  But in the second case we don't completely control
 627  * the input portion, so there may be errors in that.  Here's an example:
 628  *      /[abc\x{DF}def]/ui
 629  * is handled specially because \x{df} folds to a sequence of more than one
 630  * character: 'ss'.  What is done is to create and parse an alternate string,
 631  * which looks like this:
 632  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 633  * where it uses the input unchanged in the middle of something it constructs,
 634  * which is a branch for the DF outside the character class, and clustering
 635  * parens around the whole thing. (It knows enough to skip the DF inside the
 636  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 637  * need to be reported.  The general situation looks like this:
 638  *
 639  *                                       |<------- identical ------>|
 640  *              sI                       tI               xI       eI
 641  * Input:       ---------------------------------------------------------------
 642  * Constructed:         ---------------------------------------------------
 643  *                      sC               tC               xC       eC     EC
 644  *                                       |<------- identical ------>|
 645  *
 646  * sI..eI   is the portion of the input pattern we are concerned with here.
 647  * sC..EC   is the constructed substitute parse string.
 648  *  sC..tC  is constructed by us
 649  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
 650  *          In the diagram, these are vertically aligned.
 651  *  eC..EC  is also constructed by us.
 652  * xC       is the position in the substitute parse string where we found a
 653  *          problem.
 654  * xI       is the position in the original pattern corresponding to xC.
 655  *
 656  * We want to display a message showing the real input string.  Thus we need to
 657  * translate from xC to xI.  We know that xC >= tC, since the portion of the
 658  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
 659  * get:
 660  *      xI = tI + (xC - tC)
 661  *
 662  * When the substitute parse is constructed, the code needs to set:
 663  *      RExC_start (sC)
 664  *      RExC_end (eC)
 665  *      RExC_copy_start_in_input  (tI)
 666  *      RExC_copy_start_in_constructed (tC)
 667  * and restore them when done.
 668  *
 669  * During normal processing of the input pattern, both
 670  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
 671  * sI, so that xC equals xI.
 672  */
 673
 674 #define sI              RExC_precomp
 675 #define eI              RExC_precomp_end
 676 #define sC              RExC_start
 677 #define eC              RExC_end
 678 #define tI              RExC_copy_start_in_input
 679 #define tC              RExC_copy_start_in_constructed
 680 #define xI(xC)          (tI + (xC - tC))
 681 #define xI_offset(xC)   (xI(xC) - sI)
 682
 683 #define REPORT_LOCATION_ARGS(xC)                                            \
 684     UTF8fARG(UTF,                                                           \
 685              (xI(xC) > eI) /* Don't run off end */                          \
 686               ? eI - sI   /* Length before the <--HERE */                   \
 687               : ((xI_offset(xC) >= 0)                                       \
 688                  ? xI_offset(xC)                                            \
 689                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
 690                                     IVdf " trying to output message for "   \
 691                                     " pattern %.*s",                        \
 692                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
 693                                     ((int) (eC - sC)), sC), 0)),            \
 694              sI),         /* The input pattern printed up to the <--HERE */ \
 695     UTF8fARG(UTF,                                                           \
 696              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
 697              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
 698
 699 /* Used to point after bad bytes for an error message, but avoid skipping
 700  * past a nul byte. */
 701 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
 702
 703 /* Set up to clean up after our imminent demise */
 704 #define PREPARE_TO_DIE                                                      \
 705     STMT_START {                                                            \
 706         if (RExC_rx_sv)                                                     \
 707             SAVEFREESV(RExC_rx_sv);                                         \
 708         if (RExC_open_parens)                                               \
 709             SAVEFREEPV(RExC_open_parens);                                   \
 710         if (RExC_close_parens)                                              \
 711             SAVEFREEPV(RExC_close_parens);                                  \
 712     } STMT_END
 713
 714 /*
 715  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 716  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 717  * "...".
 718  */
 719 #define _FAIL(code) STMT_START {                                        \
 720     const char *ellipses = "";                                          \
 721     IV len = RExC_precomp_end - RExC_precomp;                           \
 722                                                                         \
 723     PREPARE_TO_DIE;                                                     \
 724     if (len > RegexLengthToShowInErrorMessages) {                       \
 725         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 726         len = RegexLengthToShowInErrorMessages - 10;                    \
 727         ellipses = "...";                                               \
 728     }                                                                   \
 729     code;                                                               \
 730 } STMT_END
 731
 732 #define FAIL(msg) _FAIL(                            \
 733     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
 734             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 735
 736 #define FAIL2(msg,arg) _FAIL(                       \
 737     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 738             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 739
 740 /*
 741  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 742  */
 743 #define Simple_vFAIL(m) STMT_START {                                    \
 744     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 745             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 746 } STMT_END
 747
 748 /*
 749  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 750  */
 751 #define vFAIL(m) STMT_START {                           \
 752     PREPARE_TO_DIE;                                     \
 753     Simple_vFAIL(m);                                    \
 754 } STMT_END
 755
 756 /*
 757  * Like Simple_vFAIL(), but accepts two arguments.
 758  */
 759 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 760     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,              \
 761                       REPORT_LOCATION_ARGS(RExC_parse));        \
 762 } STMT_END
 763
 764 /*
 765  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 766  */
 767 #define vFAIL2(m,a1) STMT_START {                       \
 768     PREPARE_TO_DIE;                                     \
 769     Simple_vFAIL2(m, a1);                               \
 770 } STMT_END
 771
 772
 773 /*
 774  * Like Simple_vFAIL(), but accepts three arguments.
 775  */
 776 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 777     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 778             REPORT_LOCATION_ARGS(RExC_parse));                  \
 779 } STMT_END
 780
 781 /*
 782  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 783  */
 784 #define vFAIL3(m,a1,a2) STMT_START {                    \
 785     PREPARE_TO_DIE;                                     \
 786     Simple_vFAIL3(m, a1, a2);                           \
 787 } STMT_END
 788
 789 /*
 790  * Like Simple_vFAIL(), but accepts four arguments.
 791  */
 792 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 793     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,      \
 794             REPORT_LOCATION_ARGS(RExC_parse));                  \
 795 } STMT_END
 796
 797 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 798     PREPARE_TO_DIE;                                     \
 799     Simple_vFAIL4(m, a1, a2, a3);                       \
 800 } STMT_END
 801
 802 /* A specialized version of vFAIL2 that works with UTF8f */
 803 #define vFAIL2utf8f(m, a1) STMT_START {             \
 804     PREPARE_TO_DIE;                                 \
 805     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
 806             REPORT_LOCATION_ARGS(RExC_parse));      \
 807 } STMT_END
 808
 809 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 810     PREPARE_TO_DIE;                                     \
 811     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 812             REPORT_LOCATION_ARGS(RExC_parse));          \
 813 } STMT_END
 814
 815 /* Setting this to NULL is a signal to not output warnings */
 816 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
 817 #define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
 818
 819 /* Since a warning can be generated multiple times as the input is reparsed, we
 820  * output it the first time we come to that point in the parse, but suppress it
 821  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
 822  * generate any warnings */
 823 #define TO_OUTPUT_WARNINGS(loc)                                         \
 824   (   RExC_copy_start_in_constructed                                    \
 825    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
 826
 827 /* After we've emitted a warning, we save the position in the input so we don't
 828  * output it again */
 829 #define UPDATE_WARNINGS_LOC(loc)                                        \
 830     STMT_START {                                                        \
 831         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 832             RExC_latest_warn_offset = (xI(loc)) - RExC_precomp;         \
 833         }                                                               \
 834     } STMT_END
 835
 836 /* 'warns' is the output of the packWARNx macro used in 'code' */
 837 #define _WARN_HELPER(loc, warns, code)                                  \
 838     STMT_START {                                                        \
 839         if (! RExC_copy_start_in_constructed) {                         \
 840             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
 841                               " expected at '%s'",                      \
 842                               __FILE__, __LINE__, loc);                 \
 843         }                                                               \
 844         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 845             if (ckDEAD(warns))                                          \
 846                 PREPARE_TO_DIE;                                         \
 847             code;                                                       \
 848             UPDATE_WARNINGS_LOC(loc);                                   \
 849         }                                                               \
 850     } STMT_END
 851
 852 /* m is not necessarily a "literal string", in this macro */
 853 #define reg_warn_non_literal_string(loc, m)                             \
 854     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 855                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 856                                        "%s" REPORT_LOCATION,            \
 857                                   m, REPORT_LOCATION_ARGS(loc)))
 858
 859 #define ckWARNreg(loc,m)                                                \
 860     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 861                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 862                                           m REPORT_LOCATION,            \
 863                                           REPORT_LOCATION_ARGS(loc)))
 864
 865 #define vWARN(loc, m)                                                   \
 866     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 867                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 868                                        m REPORT_LOCATION,               \
 869                                        REPORT_LOCATION_ARGS(loc)))      \
 870
 871 #define vWARN_dep(loc, m)                                               \
 872     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 873                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
 874                                        m REPORT_LOCATION,               \
 875                                        REPORT_LOCATION_ARGS(loc)))
 876
 877 #define ckWARNdep(loc,m)                                                \
 878     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 879                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
 880                                             m REPORT_LOCATION,          \
 881                                             REPORT_LOCATION_ARGS(loc)))
 882
 883 #define ckWARNregdep(loc,m)                                                 \
 884     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
 885                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
 886                                                       WARN_REGEXP),         \
 887                                              m REPORT_LOCATION,             \
 888                                              REPORT_LOCATION_ARGS(loc)))
 889
 890 #define ckWARN2reg_d(loc,m, a1)                                             \
 891     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 892                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
 893                                             m REPORT_LOCATION,              \
 894                                             a1, REPORT_LOCATION_ARGS(loc)))
 895
 896 #define ckWARN2reg(loc, m, a1)                                              \
 897     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 898                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
 899                                           m REPORT_LOCATION,                \
 900                                           a1, REPORT_LOCATION_ARGS(loc)))
 901
 902 #define vWARN3(loc, m, a1, a2)                                              \
 903     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 904                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
 905                                        m REPORT_LOCATION,                   \
 906                                        a1, a2, REPORT_LOCATION_ARGS(loc)))
 907
 908 #define ckWARN3reg(loc, m, a1, a2)                                          \
 909     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 910                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
 911                                           m REPORT_LOCATION,                \
 912                                           a1, a2,                           \
 913                                           REPORT_LOCATION_ARGS(loc)))
 914
 915 #define vWARN4(loc, m, a1, a2, a3)                                      \
 916     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 917                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 918                                        m REPORT_LOCATION,               \
 919                                        a1, a2, a3,                      \
 920                                        REPORT_LOCATION_ARGS(loc)))
 921
 922 #define ckWARN4reg(loc, m, a1, a2, a3)                                  \
 923     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 924                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 925                                           m REPORT_LOCATION,            \
 926                                           a1, a2, a3,                   \
 927                                           REPORT_LOCATION_ARGS(loc)))
 928
 929 #define vWARN5(loc, m, a1, a2, a3, a4)                                  \
 930     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 931                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 932                                        m REPORT_LOCATION,               \
 933                                        a1, a2, a3, a4,                  \
 934                                        REPORT_LOCATION_ARGS(loc)))
 935
 936 #define ckWARNexperimental(loc, class, m)                               \
 937     _WARN_HELPER(loc, packWARN(class),                                  \
 938                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
 939                                             m REPORT_LOCATION,          \
 940                                             REPORT_LOCATION_ARGS(loc)))
 941
 942 /* Convert between a pointer to a node and its offset from the beginning of the
 943  * program */
 944 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
 945 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
 946
 947 /* Macros for recording node offsets.   20001227 mjd@plover.com
 948  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 949  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 950  * Element 0 holds the number n.
 951  * Position is 1 indexed.
 952  */
 953 #ifndef RE_TRACK_PATTERN_OFFSETS
 954 #define Set_Node_Offset_To_R(offset,byte)
 955 #define Set_Node_Offset(node,byte)
 956 #define Set_Cur_Node_Offset
 957 #define Set_Node_Length_To_R(node,len)
 958 #define Set_Node_Length(node,len)
 959 #define Set_Node_Cur_Length(node,start)
 960 #define Node_Offset(n)
 961 #define Node_Length(n)
 962 #define Set_Node_Offset_Length(node,offset,len)
 963 #define ProgLen(ri) ri->u.proglen
 964 #define SetProgLen(ri,x) ri->u.proglen = x
 965 #define Track_Code(code)
 966 #else
 967 #define ProgLen(ri) ri->u.offsets[0]
 968 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 969 #define Set_Node_Offset_To_R(offset,byte) STMT_START {                  \
 970         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 971                     __LINE__, (int)(offset), (int)(byte)));             \
 972         if((offset) < 0) {                                              \
 973             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 974                                          (int)(offset));                \
 975         } else {                                                        \
 976             RExC_offsets[2*(offset)-1] = (byte);                        \
 977         }                                                               \
 978 } STMT_END
 979
 980 #define Set_Node_Offset(node,byte)                                      \
 981     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
 982 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 983
 984 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 985         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 986                 __LINE__, (int)(node), (int)(len)));                    \
 987         if((node) < 0) {                                                \
 988             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 989                                          (int)(node));                  \
 990         } else {                                                        \
 991             RExC_offsets[2*(node)] = (len);                             \
 992         }                                                               \
 993 } STMT_END
 994
 995 #define Set_Node_Length(node,len) \
 996     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
 997 #define Set_Node_Cur_Length(node, start)                \
 998     Set_Node_Length(node, RExC_parse - start)
 999
1000 /* Get offsets and lengths */
1001 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1002 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1003
1004 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
1005     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));       \
1006     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));  \
1007 } STMT_END
1008
1009 #define Track_Code(code) STMT_START { code } STMT_END
1010 #endif
1011
1012 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1013 #define EXPERIMENTAL_INPLACESCAN
1014 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1015
1016 #ifdef DEBUGGING
1017 int
1018 Perl_re_printf(pTHX_ const char *fmt, ...)
1019 {
1020     va_list ap;
1021     int result;
1022     PerlIO *f= Perl_debug_log;
1023     PERL_ARGS_ASSERT_RE_PRINTF;
1024     va_start(ap, fmt);
1025     result = PerlIO_vprintf(f, fmt, ap);
1026     va_end(ap);
1027     return result;
1028 }
1029
1030 int
1031 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1032 {
1033     va_list ap;
1034     int result;
1035     PerlIO *f= Perl_debug_log;
1036     PERL_ARGS_ASSERT_RE_INDENTF;
1037     va_start(ap, depth);
1038     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1039     result = PerlIO_vprintf(f, fmt, ap);
1040     va_end(ap);
1041     return result;
1042 }
1043 #endif /* DEBUGGING */
1044
1045 #define DEBUG_RExC_seen()                                                   \
1046         DEBUG_OPTIMISE_MORE_r({                                             \
1047             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1048                                                                             \
1049             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1050                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1051                                                                             \
1052             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1053                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1054                                                                             \
1055             if (RExC_seen & REG_GPOS_SEEN)                                  \
1056                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1057                                                                             \
1058             if (RExC_seen & REG_RECURSE_SEEN)                               \
1059                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1060                                                                             \
1061             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1062                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1063                                                                             \
1064             if (RExC_seen & REG_VERBARG_SEEN)                               \
1065                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1066                                                                             \
1067             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1068                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1069                                                                             \
1070             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1071                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1072                                                                             \
1073             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1074                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1075                                                                             \
1076             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1077                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1078                                                                             \
1079             Perl_re_printf( aTHX_ "\n");                                    \
1080         });
1081
1082 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1083   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
1084
1085
1086 #ifdef DEBUGGING
1087 static void
1088 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1089                                     const char *close_str)
1090 {
1091     if (!flags)
1092         return;
1093
1094     Perl_re_printf( aTHX_  "%s", open_str);
1095     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1096     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1097     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1098     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1099     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1100     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1101     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1102     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1103     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1104     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1105     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1106     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1107     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1108     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1109     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1110     Perl_re_printf( aTHX_  "%s", close_str);
1111 }
1112
1113
1114 static void
1115 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1116                     U32 depth, int is_inf)
1117 {
1118     GET_RE_DEBUG_FLAGS_DECL;
1119
1120     DEBUG_OPTIMISE_MORE_r({
1121         if (!data)
1122             return;
1123         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1124             depth,
1125             where,
1126             (IV)data->pos_min,
1127             (IV)data->pos_delta,
1128             (UV)data->flags
1129         );
1130
1131         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1132
1133         Perl_re_printf( aTHX_
1134             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1135             (IV)data->whilem_c,
1136             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1137             is_inf ? "INF " : ""
1138         );
1139
1140         if (data->last_found) {
1141             int i;
1142             Perl_re_printf(aTHX_
1143                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1144                     SvPVX_const(data->last_found),
1145                     (IV)data->last_end,
1146                     (IV)data->last_start_min,
1147                     (IV)data->last_start_max
1148             );
1149
1150             for (i = 0; i < 2; i++) {
1151                 Perl_re_printf(aTHX_
1152                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1153                     data->cur_is_floating == i ? "*" : "",
1154                     i ? "Float" : "Fixed",
1155                     SvPVX_const(data->substrs[i].str),
1156                     (IV)data->substrs[i].min_offset,
1157                     (IV)data->substrs[i].max_offset
1158                 );
1159                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1160             }
1161         }
1162
1163         Perl_re_printf( aTHX_ "\n");
1164     });
1165 }
1166
1167
1168 static void
1169 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1170                 regnode *scan, U32 depth, U32 flags)
1171 {
1172     GET_RE_DEBUG_FLAGS_DECL;
1173
1174     DEBUG_OPTIMISE_r({
1175         regnode *Next;
1176
1177         if (!scan)
1178             return;
1179         Next = regnext(scan);
1180         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1181         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1182             depth,
1183             str,
1184             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1185             Next ? (REG_NODE_NUM(Next)) : 0 );
1186         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1187         Perl_re_printf( aTHX_  "\n");
1188    });
1189 }
1190
1191
1192 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1193                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1194
1195 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1196                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1197
1198 #else
1199 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1200 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1201 #endif
1202
1203
1204 /* =========================================================
1205  * BEGIN edit_distance stuff.
1206  *
1207  * This calculates how many single character changes of any type are needed to
1208  * transform a string into another one.  It is taken from version 3.1 of
1209  *
1210  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1211  */
1212
1213 /* Our unsorted dictionary linked list.   */
1214 /* Note we use UVs, not chars. */
1215
1216 struct dictionary{
1217   UV key;
1218   UV value;
1219   struct dictionary* next;
1220 };
1221 typedef struct dictionary item;
1222
1223
1224 PERL_STATIC_INLINE item*
1225 push(UV key, item* curr)
1226 {
1227     item* head;
1228     Newx(head, 1, item);
1229     head->key = key;
1230     head->value = 0;
1231     head->next = curr;
1232     return head;
1233 }
1234
1235
1236 PERL_STATIC_INLINE item*
1237 find(item* head, UV key)
1238 {
1239     item* iterator = head;
1240     while (iterator){
1241         if (iterator->key == key){
1242             return iterator;
1243         }
1244         iterator = iterator->next;
1245     }
1246
1247     return NULL;
1248 }
1249
1250 PERL_STATIC_INLINE item*
1251 uniquePush(item* head, UV key)
1252 {
1253     item* iterator = head;
1254
1255     while (iterator){
1256         if (iterator->key == key) {
1257             return head;
1258         }
1259         iterator = iterator->next;
1260     }
1261
1262     return push(key, head);
1263 }
1264
1265 PERL_STATIC_INLINE void
1266 dict_free(item* head)
1267 {
1268     item* iterator = head;
1269
1270     while (iterator) {
1271         item* temp = iterator;
1272         iterator = iterator->next;
1273         Safefree(temp);
1274     }
1275
1276     head = NULL;
1277 }
1278
1279 /* End of Dictionary Stuff */
1280
1281 /* All calculations/work are done here */
1282 STATIC int
1283 S_edit_distance(const UV* src,
1284                 const UV* tgt,
1285                 const STRLEN x,             /* length of src[] */
1286                 const STRLEN y,             /* length of tgt[] */
1287                 const SSize_t maxDistance
1288 )
1289 {
1290     item *head = NULL;
1291     UV swapCount, swapScore, targetCharCount, i, j;
1292     UV *scores;
1293     UV score_ceil = x + y;
1294
1295     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1296
1297     /* intialize matrix start values */
1298     Newx(scores, ( (x + 2) * (y + 2)), UV);
1299     scores[0] = score_ceil;
1300     scores[1 * (y + 2) + 0] = score_ceil;
1301     scores[0 * (y + 2) + 1] = score_ceil;
1302     scores[1 * (y + 2) + 1] = 0;
1303     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1304
1305     /* work loops    */
1306     /* i = src index */
1307     /* j = tgt index */
1308     for (i=1;i<=x;i++) {
1309         if (i < x)
1310             head = uniquePush(head, src[i]);
1311         scores[(i+1) * (y + 2) + 1] = i;
1312         scores[(i+1) * (y + 2) + 0] = score_ceil;
1313         swapCount = 0;
1314
1315         for (j=1;j<=y;j++) {
1316             if (i == 1) {
1317                 if(j < y)
1318                 head = uniquePush(head, tgt[j]);
1319                 scores[1 * (y + 2) + (j + 1)] = j;
1320                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1321             }
1322
1323             targetCharCount = find(head, tgt[j-1])->value;
1324             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1325
1326             if (src[i-1] != tgt[j-1]){
1327                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1328             }
1329             else {
1330                 swapCount = j;
1331                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1332             }
1333         }
1334
1335         find(head, src[i-1])->value = i;
1336     }
1337
1338     {
1339         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1340         dict_free(head);
1341         Safefree(scores);
1342         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1343     }
1344 }
1345
1346 /* END of edit_distance() stuff
1347  * ========================================================= */
1348
1349 /* is c a control character for which we have a mnemonic? */
1350 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1351
1352 STATIC const char *
1353 S_cntrl_to_mnemonic(const U8 c)
1354 {
1355     /* Returns the mnemonic string that represents character 'c', if one
1356      * exists; NULL otherwise.  The only ones that exist for the purposes of
1357      * this routine are a few control characters */
1358
1359     switch (c) {
1360         case '\a':       return "\\a";
1361         case '\b':       return "\\b";
1362         case ESC_NATIVE: return "\\e";
1363         case '\f':       return "\\f";
1364         case '\n':       return "\\n";
1365         case '\r':       return "\\r";
1366         case '\t':       return "\\t";
1367     }
1368
1369     return NULL;
1370 }
1371
1372 /* Mark that we cannot extend a found fixed substring at this point.
1373    Update the longest found anchored substring or the longest found
1374    floating substrings if needed. */
1375
1376 STATIC void
1377 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1378                     SSize_t *minlenp, int is_inf)
1379 {
1380     const STRLEN l = CHR_SVLEN(data->last_found);
1381     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1382     const STRLEN old_l = CHR_SVLEN(longest_sv);
1383     GET_RE_DEBUG_FLAGS_DECL;
1384
1385     PERL_ARGS_ASSERT_SCAN_COMMIT;
1386
1387     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1388         const U8 i = data->cur_is_floating;
1389         SvSetMagicSV(longest_sv, data->last_found);
1390         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1391
1392         if (!i) /* fixed */
1393             data->substrs[0].max_offset = data->substrs[0].min_offset;
1394         else { /* float */
1395             data->substrs[1].max_offset = (l
1396                           ? data->last_start_max
1397                           : (data->pos_delta > SSize_t_MAX - data->pos_min
1398                                          ? SSize_t_MAX
1399                                          : data->pos_min + data->pos_delta));
1400             if (is_inf
1401                  || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1402                 data->substrs[1].max_offset = SSize_t_MAX;
1403         }
1404
1405         if (data->flags & SF_BEFORE_EOL)
1406             data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1407         else
1408             data->substrs[i].flags &= ~SF_BEFORE_EOL;
1409         data->substrs[i].minlenp = minlenp;
1410         data->substrs[i].lookbehind = 0;
1411     }
1412
1413     SvCUR_set(data->last_found, 0);
1414     {
1415         SV * const sv = data->last_found;
1416         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1417             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1418             if (mg)
1419                 mg->mg_len = 0;
1420         }
1421     }
1422     data->last_end = -1;
1423     data->flags &= ~SF_BEFORE_EOL;
1424     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1425 }
1426
1427 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1428  * list that describes which code points it matches */
1429
1430 STATIC void
1431 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1432 {
1433     /* Set the SSC 'ssc' to match an empty string or any code point */
1434
1435     PERL_ARGS_ASSERT_SSC_ANYTHING;
1436
1437     assert(is_ANYOF_SYNTHETIC(ssc));
1438
1439     /* mortalize so won't leak */
1440     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1441     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1442 }
1443
1444 STATIC int
1445 S_ssc_is_anything(const regnode_ssc *ssc)
1446 {
1447     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1448      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1449      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1450      * in any way, so there's no point in using it */
1451
1452     UV start, end;
1453     bool ret;
1454
1455     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1456
1457     assert(is_ANYOF_SYNTHETIC(ssc));
1458
1459     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1460         return FALSE;
1461     }
1462
1463     /* See if the list consists solely of the range 0 - Infinity */
1464     invlist_iterinit(ssc->invlist);
1465     ret = invlist_iternext(ssc->invlist, &start, &end)
1466           && start == 0
1467           && end == UV_MAX;
1468
1469     invlist_iterfinish(ssc->invlist);
1470
1471     if (ret) {
1472         return TRUE;
1473     }
1474
1475     /* If e.g., both \w and \W are set, matches everything */
1476     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1477         int i;
1478         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1479             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1480                 return TRUE;
1481             }
1482         }
1483     }
1484
1485     return FALSE;
1486 }
1487
1488 STATIC void
1489 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1490 {
1491     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1492      * string, any code point, or any posix class under locale */
1493
1494     PERL_ARGS_ASSERT_SSC_INIT;
1495
1496     Zero(ssc, 1, regnode_ssc);
1497     set_ANYOF_SYNTHETIC(ssc);
1498     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1499     ssc_anything(ssc);
1500
1501     /* If any portion of the regex is to operate under locale rules that aren't
1502      * fully known at compile time, initialization includes it.  The reason
1503      * this isn't done for all regexes is that the optimizer was written under
1504      * the assumption that locale was all-or-nothing.  Given the complexity and
1505      * lack of documentation in the optimizer, and that there are inadequate
1506      * test cases for locale, many parts of it may not work properly, it is
1507      * safest to avoid locale unless necessary. */
1508     if (RExC_contains_locale) {
1509         ANYOF_POSIXL_SETALL(ssc);
1510     }
1511     else {
1512         ANYOF_POSIXL_ZERO(ssc);
1513     }
1514 }
1515
1516 STATIC int
1517 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1518                         const regnode_ssc *ssc)
1519 {
1520     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1521      * to the list of code points matched, and locale posix classes; hence does
1522      * not check its flags) */
1523
1524     UV start, end;
1525     bool ret;
1526
1527     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1528
1529     assert(is_ANYOF_SYNTHETIC(ssc));
1530
1531     invlist_iterinit(ssc->invlist);
1532     ret = invlist_iternext(ssc->invlist, &start, &end)
1533           && start == 0
1534           && end == UV_MAX;
1535
1536     invlist_iterfinish(ssc->invlist);
1537
1538     if (! ret) {
1539         return FALSE;
1540     }
1541
1542     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1543         return FALSE;
1544     }
1545
1546     return TRUE;
1547 }
1548
1549 STATIC SV*
1550 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1551                                const regnode_charclass* const node)
1552 {
1553     /* Returns a mortal inversion list defining which code points are matched
1554      * by 'node', which is of type ANYOF.  Handles complementing the result if
1555      * appropriate.  If some code points aren't knowable at this time, the
1556      * returned list must, and will, contain every code point that is a
1557      * possibility. */
1558
1559     SV* invlist = NULL;
1560     SV* only_utf8_locale_invlist = NULL;
1561     unsigned int i;
1562     const U32 n = ARG(node);
1563     bool new_node_has_latin1 = FALSE;
1564
1565     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1566
1567     /* Look at the data structure created by S_set_ANYOF_arg() */
1568     if (n != ANYOF_ONLY_HAS_BITMAP) {
1569         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1570         AV * const av = MUTABLE_AV(SvRV(rv));
1571         SV **const ary = AvARRAY(av);
1572         assert(RExC_rxi->data->what[n] == 's');
1573
1574         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1575             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]), NULL));
1576         }
1577         else if (ary[0] && ary[0] != &PL_sv_undef) {
1578
1579             /* Here, no compile-time swash, and there are things that won't be
1580              * known until runtime -- we have to assume it could be anything */
1581             invlist = sv_2mortal(_new_invlist(1));
1582             return _add_range_to_invlist(invlist, 0, UV_MAX);
1583         }
1584         else if (ary[3] && ary[3] != &PL_sv_undef) {
1585
1586             /* Here no compile-time swash, and no run-time only data.  Use the
1587              * node's inversion list */
1588             invlist = sv_2mortal(invlist_clone(ary[3], NULL));
1589         }
1590
1591         /* Get the code points valid only under UTF-8 locales */
1592         if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1593             && ary[2] && ary[2] != &PL_sv_undef)
1594         {
1595             only_utf8_locale_invlist = ary[2];
1596         }
1597     }
1598
1599     if (! invlist) {
1600         invlist = sv_2mortal(_new_invlist(0));
1601     }
1602
1603     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1604      * code points, and an inversion list for the others, but if there are code
1605      * points that should match only conditionally on the target string being
1606      * UTF-8, those are placed in the inversion list, and not the bitmap.
1607      * Since there are circumstances under which they could match, they are
1608      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1609      * to exclude them here, so that when we invert below, the end result
1610      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1611      * have to do this here before we add the unconditionally matched code
1612      * points */
1613     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1614         _invlist_intersection_complement_2nd(invlist,
1615                                              PL_UpperLatin1,
1616                                              &invlist);
1617     }
1618
1619     /* Add in the points from the bit map */
1620     if (OP(node) != ANYOFH) {
1621         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1622             if (ANYOF_BITMAP_TEST(node, i)) {
1623                 unsigned int start = i++;
1624
1625                 for (;    i < NUM_ANYOF_CODE_POINTS
1626                        && ANYOF_BITMAP_TEST(node, i); ++i)
1627                 {
1628                     /* empty */
1629                 }
1630                 invlist = _add_range_to_invlist(invlist, start, i-1);
1631                 new_node_has_latin1 = TRUE;
1632             }
1633         }
1634     }
1635
1636     /* If this can match all upper Latin1 code points, have to add them
1637      * as well.  But don't add them if inverting, as when that gets done below,
1638      * it would exclude all these characters, including the ones it shouldn't
1639      * that were added just above */
1640     if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
1641         && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1642     {
1643         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1644     }
1645
1646     /* Similarly for these */
1647     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1648         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1649     }
1650
1651     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1652         _invlist_invert(invlist);
1653     }
1654     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1655
1656         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1657          * locale.  We can skip this if there are no 0-255 at all. */
1658         _invlist_union(invlist, PL_Latin1, &invlist);
1659     }
1660
1661     /* Similarly add the UTF-8 locale possible matches.  These have to be
1662      * deferred until after the non-UTF-8 locale ones are taken care of just
1663      * above, or it leads to wrong results under ANYOF_INVERT */
1664     if (only_utf8_locale_invlist) {
1665         _invlist_union_maybe_complement_2nd(invlist,
1666                                             only_utf8_locale_invlist,
1667                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1668                                             &invlist);
1669     }
1670
1671     return invlist;
1672 }
1673
1674 /* These two functions currently do the exact same thing */
1675 #define ssc_init_zero           ssc_init
1676
1677 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1678 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1679
1680 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1681  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1682  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1683
1684 STATIC void
1685 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1686                 const regnode_charclass *and_with)
1687 {
1688     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1689      * another SSC or a regular ANYOF class.  Can create false positives. */
1690
1691     SV* anded_cp_list;
1692     U8  anded_flags;
1693
1694     PERL_ARGS_ASSERT_SSC_AND;
1695
1696     assert(is_ANYOF_SYNTHETIC(ssc));
1697
1698     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1699      * the code point inversion list and just the relevant flags */
1700     if (is_ANYOF_SYNTHETIC(and_with)) {
1701         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1702         anded_flags = ANYOF_FLAGS(and_with);
1703
1704         /* XXX This is a kludge around what appears to be deficiencies in the
1705          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1706          * there are paths through the optimizer where it doesn't get weeded
1707          * out when it should.  And if we don't make some extra provision for
1708          * it like the code just below, it doesn't get added when it should.
1709          * This solution is to add it only when AND'ing, which is here, and
1710          * only when what is being AND'ed is the pristine, original node
1711          * matching anything.  Thus it is like adding it to ssc_anything() but
1712          * only when the result is to be AND'ed.  Probably the same solution
1713          * could be adopted for the same problem we have with /l matching,
1714          * which is solved differently in S_ssc_init(), and that would lead to
1715          * fewer false positives than that solution has.  But if this solution
1716          * creates bugs, the consequences are only that a warning isn't raised
1717          * that should be; while the consequences for having /l bugs is
1718          * incorrect matches */
1719         if (ssc_is_anything((regnode_ssc *)and_with)) {
1720             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1721         }
1722     }
1723     else {
1724         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1725         if (OP(and_with) == ANYOFD) {
1726             anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1727         }
1728         else {
1729             anded_flags = ANYOF_FLAGS(and_with)
1730             &( ANYOF_COMMON_FLAGS
1731               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1732               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1733             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1734                 anded_flags &=
1735                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1736             }
1737         }
1738     }
1739
1740     ANYOF_FLAGS(ssc) &= anded_flags;
1741
1742     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1743      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1744      * 'and_with' may be inverted.  When not inverted, we have the situation of
1745      * computing:
1746      *  (C1 | P1) & (C2 | P2)
1747      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1748      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1749      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1750      *                    <=  ((C1 & C2) | P1 | P2)
1751      * Alternatively, the last few steps could be:
1752      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1753      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1754      *                    <=  (C1 | C2 | (P1 & P2))
1755      * We favor the second approach if either P1 or P2 is non-empty.  This is
1756      * because these components are a barrier to doing optimizations, as what
1757      * they match cannot be known until the moment of matching as they are
1758      * dependent on the current locale, 'AND"ing them likely will reduce or
1759      * eliminate them.
1760      * But we can do better if we know that C1,P1 are in their initial state (a
1761      * frequent occurrence), each matching everything:
1762      *  (<everything>) & (C2 | P2) =  C2 | P2
1763      * Similarly, if C2,P2 are in their initial state (again a frequent
1764      * occurrence), the result is a no-op
1765      *  (C1 | P1) & (<everything>) =  C1 | P1
1766      *
1767      * Inverted, we have
1768      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1769      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1770      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1771      * */
1772
1773     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1774         && ! is_ANYOF_SYNTHETIC(and_with))
1775     {
1776         unsigned int i;
1777
1778         ssc_intersection(ssc,
1779                          anded_cp_list,
1780                          FALSE /* Has already been inverted */
1781                          );
1782
1783         /* If either P1 or P2 is empty, the intersection will be also; can skip
1784          * the loop */
1785         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1786             ANYOF_POSIXL_ZERO(ssc);
1787         }
1788         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1789
1790             /* Note that the Posix class component P from 'and_with' actually
1791              * looks like:
1792              *      P = Pa | Pb | ... | Pn
1793              * where each component is one posix class, such as in [\w\s].
1794              * Thus
1795              *      ~P = ~(Pa | Pb | ... | Pn)
1796              *         = ~Pa & ~Pb & ... & ~Pn
1797              *        <= ~Pa | ~Pb | ... | ~Pn
1798              * The last is something we can easily calculate, but unfortunately
1799              * is likely to have many false positives.  We could do better
1800              * in some (but certainly not all) instances if two classes in
1801              * P have known relationships.  For example
1802              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1803              * So
1804              *      :lower: & :print: = :lower:
1805              * And similarly for classes that must be disjoint.  For example,
1806              * since \s and \w can have no elements in common based on rules in
1807              * the POSIX standard,
1808              *      \w & ^\S = nothing
1809              * Unfortunately, some vendor locales do not meet the Posix
1810              * standard, in particular almost everything by Microsoft.
1811              * The loop below just changes e.g., \w into \W and vice versa */
1812
1813             regnode_charclass_posixl temp;
1814             int add = 1;    /* To calculate the index of the complement */
1815
1816             Zero(&temp, 1, regnode_charclass_posixl);
1817             ANYOF_POSIXL_ZERO(&temp);
1818             for (i = 0; i < ANYOF_MAX; i++) {
1819                 assert(i % 2 != 0
1820                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1821                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1822
1823                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1824                     ANYOF_POSIXL_SET(&temp, i + add);
1825                 }
1826                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1827             }
1828             ANYOF_POSIXL_AND(&temp, ssc);
1829
1830         } /* else ssc already has no posixes */
1831     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1832          in its initial state */
1833     else if (! is_ANYOF_SYNTHETIC(and_with)
1834              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1835     {
1836         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1837          * copy it over 'ssc' */
1838         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1839             if (is_ANYOF_SYNTHETIC(and_with)) {
1840                 StructCopy(and_with, ssc, regnode_ssc);
1841             }
1842             else {
1843                 ssc->invlist = anded_cp_list;
1844                 ANYOF_POSIXL_ZERO(ssc);
1845                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1846                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1847                 }
1848             }
1849         }
1850         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1851                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1852         {
1853             /* One or the other of P1, P2 is non-empty. */
1854             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1855                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1856             }
1857             ssc_union(ssc, anded_cp_list, FALSE);
1858         }
1859         else { /* P1 = P2 = empty */
1860             ssc_intersection(ssc, anded_cp_list, FALSE);
1861         }
1862     }
1863 }
1864
1865 STATIC void
1866 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1867                const regnode_charclass *or_with)
1868 {
1869     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1870      * another SSC or a regular ANYOF class.  Can create false positives if
1871      * 'or_with' is to be inverted. */
1872
1873     SV* ored_cp_list;
1874     U8 ored_flags;
1875
1876     PERL_ARGS_ASSERT_SSC_OR;
1877
1878     assert(is_ANYOF_SYNTHETIC(ssc));
1879
1880     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1881      * the code point inversion list and just the relevant flags */
1882     if (is_ANYOF_SYNTHETIC(or_with)) {
1883         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1884         ored_flags = ANYOF_FLAGS(or_with);
1885     }
1886     else {
1887         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1888         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1889         if (OP(or_with) != ANYOFD) {
1890             ored_flags
1891             |= ANYOF_FLAGS(or_with)
1892              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1893                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1894             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1895                 ored_flags |=
1896                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1897             }
1898         }
1899     }
1900
1901     ANYOF_FLAGS(ssc) |= ored_flags;
1902
1903     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1904      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1905      * 'or_with' may be inverted.  When not inverted, we have the simple
1906      * situation of computing:
1907      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1908      * If P1|P2 yields a situation with both a class and its complement are
1909      * set, like having both \w and \W, this matches all code points, and we
1910      * can delete these from the P component of the ssc going forward.  XXX We
1911      * might be able to delete all the P components, but I (khw) am not certain
1912      * about this, and it is better to be safe.
1913      *
1914      * Inverted, we have
1915      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1916      *                         <=  (C1 | P1) | ~C2
1917      *                         <=  (C1 | ~C2) | P1
1918      * (which results in actually simpler code than the non-inverted case)
1919      * */
1920
1921     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1922         && ! is_ANYOF_SYNTHETIC(or_with))
1923     {
1924         /* We ignore P2, leaving P1 going forward */
1925     }   /* else  Not inverted */
1926     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1927         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1928         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1929             unsigned int i;
1930             for (i = 0; i < ANYOF_MAX; i += 2) {
1931                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1932                 {
1933                     ssc_match_all_cp(ssc);
1934                     ANYOF_POSIXL_CLEAR(ssc, i);
1935                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1936                 }
1937             }
1938         }
1939     }
1940
1941     ssc_union(ssc,
1942               ored_cp_list,
1943               FALSE /* Already has been inverted */
1944               );
1945 }
1946
1947 PERL_STATIC_INLINE void
1948 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1949 {
1950     PERL_ARGS_ASSERT_SSC_UNION;
1951
1952     assert(is_ANYOF_SYNTHETIC(ssc));
1953
1954     _invlist_union_maybe_complement_2nd(ssc->invlist,
1955                                         invlist,
1956                                         invert2nd,
1957                                         &ssc->invlist);
1958 }
1959
1960 PERL_STATIC_INLINE void
1961 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1962                          SV* const invlist,
1963                          const bool invert2nd)
1964 {
1965     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1966
1967     assert(is_ANYOF_SYNTHETIC(ssc));
1968
1969     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1970                                                invlist,
1971                                                invert2nd,
1972                                                &ssc->invlist);
1973 }
1974
1975 PERL_STATIC_INLINE void
1976 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1977 {
1978     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1979
1980     assert(is_ANYOF_SYNTHETIC(ssc));
1981
1982     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1983 }
1984
1985 PERL_STATIC_INLINE void
1986 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1987 {
1988     /* AND just the single code point 'cp' into the SSC 'ssc' */
1989
1990     SV* cp_list = _new_invlist(2);
1991
1992     PERL_ARGS_ASSERT_SSC_CP_AND;
1993
1994     assert(is_ANYOF_SYNTHETIC(ssc));
1995
1996     cp_list = add_cp_to_invlist(cp_list, cp);
1997     ssc_intersection(ssc, cp_list,
1998                      FALSE /* Not inverted */
1999                      );
2000     SvREFCNT_dec_NN(cp_list);
2001 }
2002
2003 PERL_STATIC_INLINE void
2004 S_ssc_clear_locale(regnode_ssc *ssc)
2005 {
2006     /* Set the SSC 'ssc' to not match any locale things */
2007     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2008
2009     assert(is_ANYOF_SYNTHETIC(ssc));
2010
2011     ANYOF_POSIXL_ZERO(ssc);
2012     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2013 }
2014
2015 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2016
2017 STATIC bool
2018 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2019 {
2020     /* The synthetic start class is used to hopefully quickly winnow down
2021      * places where a pattern could start a match in the target string.  If it
2022      * doesn't really narrow things down that much, there isn't much point to
2023      * having the overhead of using it.  This function uses some very crude
2024      * heuristics to decide if to use the ssc or not.
2025      *
2026      * It returns TRUE if 'ssc' rules out more than half what it considers to
2027      * be the "likely" possible matches, but of course it doesn't know what the
2028      * actual things being matched are going to be; these are only guesses
2029      *
2030      * For /l matches, it assumes that the only likely matches are going to be
2031      *      in the 0-255 range, uniformly distributed, so half of that is 127
2032      * For /a and /d matches, it assumes that the likely matches will be just
2033      *      the ASCII range, so half of that is 63
2034      * For /u and there isn't anything matching above the Latin1 range, it
2035      *      assumes that that is the only range likely to be matched, and uses
2036      *      half that as the cut-off: 127.  If anything matches above Latin1,
2037      *      it assumes that all of Unicode could match (uniformly), except for
2038      *      non-Unicode code points and things in the General Category "Other"
2039      *      (unassigned, private use, surrogates, controls and formats).  This
2040      *      is a much large number. */
2041
2042     U32 count = 0;      /* Running total of number of code points matched by
2043                            'ssc' */
2044     UV start, end;      /* Start and end points of current range in inversion
2045                            list */
2046     const U32 max_code_points = (LOC)
2047                                 ?  256
2048                                 : ((  ! UNI_SEMANTICS
2049                                     ||  invlist_highest(ssc->invlist) < 256)
2050                                   ? 128
2051                                   : NON_OTHER_COUNT);
2052     const U32 max_match = max_code_points / 2;
2053
2054     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2055
2056     invlist_iterinit(ssc->invlist);
2057     while (invlist_iternext(ssc->invlist, &start, &end)) {
2058         if (start >= max_code_points) {
2059             break;
2060         }
2061         end = MIN(end, max_code_points - 1);
2062         count += end - start + 1;
2063         if (count >= max_match) {
2064             invlist_iterfinish(ssc->invlist);
2065             return FALSE;
2066         }
2067     }
2068
2069     return TRUE;
2070 }
2071
2072
2073 STATIC void
2074 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2075 {
2076     /* The inversion list in the SSC is marked mortal; now we need a more
2077      * permanent copy, which is stored the same way that is done in a regular
2078      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2079      * map */
2080
2081     SV* invlist = invlist_clone(ssc->invlist, NULL);
2082
2083     PERL_ARGS_ASSERT_SSC_FINALIZE;
2084
2085     assert(is_ANYOF_SYNTHETIC(ssc));
2086
2087     /* The code in this file assumes that all but these flags aren't relevant
2088      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2089      * by the time we reach here */
2090     assert(! (ANYOF_FLAGS(ssc)
2091         & ~( ANYOF_COMMON_FLAGS
2092             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2093             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2094
2095     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2096
2097     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
2098                                 NULL, NULL, NULL, FALSE);
2099
2100     /* Make sure is clone-safe */
2101     ssc->invlist = NULL;
2102
2103     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2104         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2105         OP(ssc) = ANYOFPOSIXL;
2106     }
2107     else if (RExC_contains_locale) {
2108         OP(ssc) = ANYOFL;
2109     }
2110
2111     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2112 }
2113
2114 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2115 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2116 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2117 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
2118                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2119                                : 0 )
2120
2121
2122 #ifdef DEBUGGING
2123 /*
2124    dump_trie(trie,widecharmap,revcharmap)
2125    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2126    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2127
2128    These routines dump out a trie in a somewhat readable format.
2129    The _interim_ variants are used for debugging the interim
2130    tables that are used to generate the final compressed
2131    representation which is what dump_trie expects.
2132
2133    Part of the reason for their existence is to provide a form
2134    of documentation as to how the different representations function.
2135
2136 */
2137
2138 /*
2139   Dumps the final compressed table form of the trie to Perl_debug_log.
2140   Used for debugging make_trie().
2141 */
2142
2143 STATIC void
2144 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2145             AV *revcharmap, U32 depth)
2146 {
2147     U32 state;
2148     SV *sv=sv_newmortal();
2149     int colwidth= widecharmap ? 6 : 4;
2150     U16 word;
2151     GET_RE_DEBUG_FLAGS_DECL;
2152
2153     PERL_ARGS_ASSERT_DUMP_TRIE;
2154
2155     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2156         depth+1, "Match","Base","Ofs" );
2157
2158     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2159         SV ** const tmp = av_fetch( revcharmap, state, 0);
2160         if ( tmp ) {
2161             Perl_re_printf( aTHX_  "%*s",
2162                 colwidth,
2163                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2164                             PL_colors[0], PL_colors[1],
2165                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2166                             PERL_PV_ESCAPE_FIRSTCHAR
2167                 )
2168             );
2169         }
2170     }
2171     Perl_re_printf( aTHX_  "\n");
2172     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2173
2174     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2175         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2176     Perl_re_printf( aTHX_  "\n");
2177
2178     for( state = 1 ; state < trie->statecount ; state++ ) {
2179         const U32 base = trie->states[ state ].trans.base;
2180
2181         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2182
2183         if ( trie->states[ state ].wordnum ) {
2184             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2185         } else {
2186             Perl_re_printf( aTHX_  "%6s", "" );
2187         }
2188
2189         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2190
2191         if ( base ) {
2192             U32 ofs = 0;
2193
2194             while( ( base + ofs  < trie->uniquecharcount ) ||
2195                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2196                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2197                                                                     != state))
2198                     ofs++;
2199
2200             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2201
2202             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2203                 if ( ( base + ofs >= trie->uniquecharcount )
2204                         && ( base + ofs - trie->uniquecharcount
2205                                                         < trie->lasttrans )
2206                         && trie->trans[ base + ofs
2207                                     - trie->uniquecharcount ].check == state )
2208                 {
2209                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2210                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2211                    );
2212                 } else {
2213                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
2214                 }
2215             }
2216
2217             Perl_re_printf( aTHX_  "]");
2218
2219         }
2220         Perl_re_printf( aTHX_  "\n" );
2221     }
2222     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2223                                 depth);
2224     for (word=1; word <= trie->wordcount; word++) {
2225         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2226             (int)word, (int)(trie->wordinfo[word].prev),
2227             (int)(trie->wordinfo[word].len));
2228     }
2229     Perl_re_printf( aTHX_  "\n" );
2230 }
2231 /*
2232   Dumps a fully constructed but uncompressed trie in list form.
2233   List tries normally only are used for construction when the number of
2234   possible chars (trie->uniquecharcount) is very high.
2235   Used for debugging make_trie().
2236 */
2237 STATIC void
2238 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2239                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
2240                          U32 depth)
2241 {
2242     U32 state;
2243     SV *sv=sv_newmortal();
2244     int colwidth= widecharmap ? 6 : 4;
2245     GET_RE_DEBUG_FLAGS_DECL;
2246
2247     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2248
2249     /* print out the table precompression.  */
2250     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2251             depth+1 );
2252     Perl_re_indentf( aTHX_  "%s",
2253             depth+1, "------:-----+-----------------\n" );
2254
2255     for( state=1 ; state < next_alloc ; state ++ ) {
2256         U16 charid;
2257
2258         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2259             depth+1, (UV)state  );
2260         if ( ! trie->states[ state ].wordnum ) {
2261             Perl_re_printf( aTHX_  "%5s| ","");
2262         } else {
2263             Perl_re_printf( aTHX_  "W%4x| ",
2264                 trie->states[ state ].wordnum
2265             );
2266         }
2267         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2268             SV ** const tmp = av_fetch( revcharmap,
2269                                         TRIE_LIST_ITEM(state, charid).forid, 0);
2270             if ( tmp ) {
2271                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2272                     colwidth,
2273                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2274                               colwidth,
2275                               PL_colors[0], PL_colors[1],
2276                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2277                               | PERL_PV_ESCAPE_FIRSTCHAR
2278                     ) ,
2279                     TRIE_LIST_ITEM(state, charid).forid,
2280                     (UV)TRIE_LIST_ITEM(state, charid).newstate
2281                 );
2282                 if (!(charid % 10))
2283                     Perl_re_printf( aTHX_  "\n%*s| ",
2284                         (int)((depth * 2) + 14), "");
2285             }
2286         }
2287         Perl_re_printf( aTHX_  "\n");
2288     }
2289 }
2290
2291 /*
2292   Dumps a fully constructed but uncompressed trie in table form.
2293   This is the normal DFA style state transition table, with a few
2294   twists to facilitate compression later.
2295   Used for debugging make_trie().
2296 */
2297 STATIC void
2298 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2299                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
2300                           U32 depth)
2301 {
2302     U32 state;
2303     U16 charid;
2304     SV *sv=sv_newmortal();
2305     int colwidth= widecharmap ? 6 : 4;
2306     GET_RE_DEBUG_FLAGS_DECL;
2307
2308     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2309
2310     /*
2311        print out the table precompression so that we can do a visual check
2312        that they are identical.
2313      */
2314
2315     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2316
2317     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2318         SV ** const tmp = av_fetch( revcharmap, charid, 0);
2319         if ( tmp ) {
2320             Perl_re_printf( aTHX_  "%*s",
2321                 colwidth,
2322                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2323                             PL_colors[0], PL_colors[1],
2324                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2325                             PERL_PV_ESCAPE_FIRSTCHAR
2326                 )
2327             );
2328         }
2329     }
2330
2331     Perl_re_printf( aTHX_ "\n");
2332     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2333
2334     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2335         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2336     }
2337
2338     Perl_re_printf( aTHX_  "\n" );
2339
2340     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2341
2342         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2343             depth+1,
2344             (UV)TRIE_NODENUM( state ) );
2345
2346         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2347             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2348             if (v)
2349                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2350             else
2351                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2352         }
2353         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2354             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2355                                             (UV)trie->trans[ state ].check );
2356         } else {
2357             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2358                                             (UV)trie->trans[ state ].check,
2359             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2360         }
2361     }
2362 }
2363
2364 #endif
2365
2366
2367 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2368   startbranch: the first branch in the whole branch sequence
2369   first      : start branch of sequence of branch-exact nodes.
2370                May be the same as startbranch
2371   last       : Thing following the last branch.
2372                May be the same as tail.
2373   tail       : item following the branch sequence
2374   count      : words in the sequence
2375   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2376   depth      : indent depth
2377
2378 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2379
2380 A trie is an N'ary tree where the branches are determined by digital
2381 decomposition of the key. IE, at the root node you look up the 1st character and
2382 follow that branch repeat until you find the end of the branches. Nodes can be
2383 marked as "accepting" meaning they represent a complete word. Eg:
2384
2385   /he|she|his|hers/
2386
2387 would convert into the following structure. Numbers represent states, letters
2388 following numbers represent valid transitions on the letter from that state, if
2389 the number is in square brackets it represents an accepting state, otherwise it
2390 will be in parenthesis.
2391
2392       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2393       |    |
2394       |   (2)
2395       |    |
2396      (1)   +-i->(6)-+-s->[7]
2397       |
2398       +-s->(3)-+-h->(4)-+-e->[5]
2399
2400       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2401
2402 This shows that when matching against the string 'hers' we will begin at state 1
2403 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2404 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2405 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2406 single traverse. We store a mapping from accepting to state to which word was
2407 matched, and then when we have multiple possibilities we try to complete the
2408 rest of the regex in the order in which they occurred in the alternation.
2409
2410 The only prior NFA like behaviour that would be changed by the TRIE support is
2411 the silent ignoring of duplicate alternations which are of the form:
2412
2413  / (DUPE|DUPE) X? (?{ ... }) Y /x
2414
2415 Thus EVAL blocks following a trie may be called a different number of times with
2416 and without the optimisation. With the optimisations dupes will be silently
2417 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2418 the following demonstrates:
2419
2420  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2421
2422 which prints out 'word' three times, but
2423
2424  'words'=~/(word|word|word)(?{ print $1 })S/
2425
2426 which doesnt print it out at all. This is due to other optimisations kicking in.
2427
2428 Example of what happens on a structural level:
2429
2430 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2431
2432    1: CURLYM[1] {1,32767}(18)
2433    5:   BRANCH(8)
2434    6:     EXACT <ac>(16)
2435    8:   BRANCH(11)
2436    9:     EXACT <ad>(16)
2437   11:   BRANCH(14)
2438   12:     EXACT <ab>(16)
2439   16:   SUCCEED(0)
2440   17:   NOTHING(18)
2441   18: END(0)
2442
2443 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2444 and should turn into:
2445
2446    1: CURLYM[1] {1,32767}(18)
2447    5:   TRIE(16)
2448         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2449           <ac>
2450           <ad>
2451           <ab>
2452   16:   SUCCEED(0)
2453   17:   NOTHING(18)
2454   18: END(0)
2455
2456 Cases where tail != last would be like /(?foo|bar)baz/:
2457
2458    1: BRANCH(4)
2459    2:   EXACT <foo>(8)
2460    4: BRANCH(7)
2461    5:   EXACT <bar>(8)
2462    7: TAIL(8)
2463    8: EXACT <baz>(10)
2464   10: END(0)
2465
2466 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2467 and would end up looking like:
2468
2469     1: TRIE(8)
2470       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2471         <foo>
2472         <bar>
2473    7: TAIL(8)
2474    8: EXACT <baz>(10)
2475   10: END(0)
2476
2477     d = uvchr_to_utf8_flags(d, uv, 0);
2478
2479 is the recommended Unicode-aware way of saying
2480
2481     *(d++) = uv;
2482 */
2483
2484 #define TRIE_STORE_REVCHAR(val)                                            \
2485     STMT_START {                                                           \
2486         if (UTF) {                                                         \
2487             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2488             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2489             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2490             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2491             SvPOK_on(zlopp);                                               \
2492             SvUTF8_on(zlopp);                                              \
2493             av_push(revcharmap, zlopp);                                    \
2494         } else {                                                           \
2495             char ooooff = (char)val;                                           \
2496             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2497         }                                                                  \
2498         } STMT_END
2499
2500 /* This gets the next character from the input, folding it if not already
2501  * folded. */
2502 #define TRIE_READ_CHAR STMT_START {                                           \
2503     wordlen++;                                                                \
2504     if ( UTF ) {                                                              \
2505         /* if it is UTF then it is either already folded, or does not need    \
2506          * folding */                                                         \
2507         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2508     }                                                                         \
2509     else if (folder == PL_fold_latin1) {                                      \
2510         /* This folder implies Unicode rules, which in the range expressible  \
2511          *  by not UTF is the lower case, with the two exceptions, one of     \
2512          *  which should have been taken care of before calling this */       \
2513         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2514         uvc = toLOWER_L1(*uc);                                                \
2515         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2516         len = 1;                                                              \
2517     } else {                                                                  \
2518         /* raw data, will be folded later if needed */                        \
2519         uvc = (U32)*uc;                                                       \
2520         len = 1;                                                              \
2521     }                                                                         \
2522 } STMT_END
2523
2524
2525
2526 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2527     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2528         U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2529         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2530         TRIE_LIST_LEN( state ) = ging;                          \
2531     }                                                           \
2532     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2533     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2534     TRIE_LIST_CUR( state )++;                                   \
2535 } STMT_END
2536
2537 #define TRIE_LIST_NEW(state) STMT_START {                       \
2538     Newx( trie->states[ state ].trans.list,                     \
2539         4, reg_trie_trans_le );                                 \
2540      TRIE_LIST_CUR( state ) = 1;                                \
2541      TRIE_LIST_LEN( state ) = 4;                                \
2542 } STMT_END
2543
2544 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2545     U16 dupe= trie->states[ state ].wordnum;                    \
2546     regnode * const noper_next = regnext( noper );              \
2547                                                                 \
2548     DEBUG_r({                                                   \
2549         /* store the word for dumping */                        \
2550         SV* tmp;                                                \
2551         if (OP(noper) != NOTHING)                               \
2552             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2553         else                                                    \
2554             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2555         av_push( trie_words, tmp );                             \
2556     });                                                         \
2557                                                                 \
2558     curword++;                                                  \
2559     trie->wordinfo[curword].prev   = 0;                         \
2560     trie->wordinfo[curword].len    = wordlen;                   \
2561     trie->wordinfo[curword].accept = state;                     \
2562                                                                 \
2563     if ( noper_next < tail ) {                                  \
2564         if (!trie->jump)                                        \
2565             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2566                                                  sizeof(U16) ); \
2567         trie->jump[curword] = (U16)(noper_next - convert);      \
2568         if (!jumper)                                            \
2569             jumper = noper_next;                                \
2570         if (!nextbranch)                                        \
2571             nextbranch= regnext(cur);                           \
2572     }                                                           \
2573                                                                 \
2574     if ( dupe ) {                                               \
2575         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2576         /* chain, so that when the bits of chain are later    */\
2577         /* linked together, the dups appear in the chain      */\
2578         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2579         trie->wordinfo[dupe].prev = curword;                    \
2580     } else {                                                    \
2581         /* we haven't inserted this word yet.                */ \
2582         trie->states[ state ].wordnum = curword;                \
2583     }                                                           \
2584 } STMT_END
2585
2586
2587 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2588      ( ( base + charid >=  ucharcount                                   \
2589          && base + charid < ubound                                      \
2590          && state == trie->trans[ base - ucharcount + charid ].check    \
2591          && trie->trans[ base - ucharcount + charid ].next )            \
2592            ? trie->trans[ base - ucharcount + charid ].next             \
2593            : ( state==1 ? special : 0 )                                 \
2594       )
2595
2596 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2597 STMT_START {                                                \
2598     TRIE_BITMAP_SET(trie, uvc);                             \
2599     /* store the folded codepoint */                        \
2600     if ( folder )                                           \
2601         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2602                                                             \
2603     if ( !UTF ) {                                           \
2604         /* store first byte of utf8 representation of */    \
2605         /* variant codepoints */                            \
2606         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2607             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2608         }                                                   \
2609     }                                                       \
2610 } STMT_END
2611 #define MADE_TRIE       1
2612 #define MADE_JUMP_TRIE  2
2613 #define MADE_EXACT_TRIE 4
2614
2615 STATIC I32
2616 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2617                   regnode *first, regnode *last, regnode *tail,
2618                   U32 word_count, U32 flags, U32 depth)
2619 {
2620     /* first pass, loop through and scan words */
2621     reg_trie_data *trie;
2622     HV *widecharmap = NULL;
2623     AV *revcharmap = newAV();
2624     regnode *cur;
2625     STRLEN len = 0;
2626     UV uvc = 0;
2627     U16 curword = 0;
2628     U32 next_alloc = 0;
2629     regnode *jumper = NULL;
2630     regnode *nextbranch = NULL;
2631     regnode *convert = NULL;
2632     U32 *prev_states; /* temp array mapping each state to previous one */
2633     /* we just use folder as a flag in utf8 */
2634     const U8 * folder = NULL;
2635
2636     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2637      * which stands for one trie structure, one hash, optionally followed
2638      * by two arrays */
2639 #ifdef DEBUGGING
2640     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2641     AV *trie_words = NULL;
2642     /* along with revcharmap, this only used during construction but both are
2643      * useful during debugging so we store them in the struct when debugging.
2644      */
2645 #else
2646     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2647     STRLEN trie_charcount=0;
2648 #endif
2649     SV *re_trie_maxbuff;
2650     GET_RE_DEBUG_FLAGS_DECL;
2651
2652     PERL_ARGS_ASSERT_MAKE_TRIE;
2653 #ifndef DEBUGGING
2654     PERL_UNUSED_ARG(depth);
2655 #endif
2656
2657     switch (flags) {
2658         case EXACT: case EXACT_ONLY8: case EXACTL: break;
2659         case EXACTFAA:
2660         case EXACTFUP:
2661         case EXACTFU:
2662         case EXACTFLU8: folder = PL_fold_latin1; break;
2663         case EXACTF:  folder = PL_fold; break;
2664         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2665     }
2666
2667     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2668     trie->refcount = 1;
2669     trie->startstate = 1;
2670     trie->wordcount = word_count;
2671     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2672     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2673     if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
2674         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2675     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2676                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2677
2678     DEBUG_r({
2679         trie_words = newAV();
2680     });
2681
2682     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2683     assert(re_trie_maxbuff);
2684     if (!SvIOK(re_trie_maxbuff)) {
2685         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2686     }
2687     DEBUG_TRIE_COMPILE_r({
2688         Perl_re_indentf( aTHX_
2689           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2690           depth+1,
2691           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2692           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2693     });
2694
2695    /* Find the node we are going to overwrite */
2696     if ( first == startbranch && OP( last ) != BRANCH ) {
2697         /* whole branch chain */
2698         convert = first;
2699     } else {
2700         /* branch sub-chain */
2701         convert = NEXTOPER( first );
2702     }
2703
2704     /*  -- First loop and Setup --
2705
2706        We first traverse the branches and scan each word to determine if it
2707        contains widechars, and how many unique chars there are, this is
2708        important as we have to build a table with at least as many columns as we
2709        have unique chars.
2710
2711        We use an array of integers to represent the character codes 0..255
2712        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2713        the native representation of the character value as the key and IV's for
2714        the coded index.
2715
2716        *TODO* If we keep track of how many times each character is used we can
2717        remap the columns so that the table compression later on is more
2718        efficient in terms of memory by ensuring the most common value is in the
2719        middle and the least common are on the outside.  IMO this would be better
2720        than a most to least common mapping as theres a decent chance the most
2721        common letter will share a node with the least common, meaning the node
2722        will not be compressible. With a middle is most common approach the worst
2723        case is when we have the least common nodes twice.
2724
2725      */
2726
2727     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2728         regnode *noper = NEXTOPER( cur );
2729         const U8 *uc;
2730         const U8 *e;
2731         int foldlen = 0;
2732         U32 wordlen      = 0;         /* required init */
2733         STRLEN minchars = 0;
2734         STRLEN maxchars = 0;
2735         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2736                                                bitmap?*/
2737
2738         if (OP(noper) == NOTHING) {
2739             /* skip past a NOTHING at the start of an alternation
2740              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2741              */
2742             regnode *noper_next= regnext(noper);
2743             if (noper_next < tail)
2744                 noper= noper_next;
2745         }
2746
2747         if (    noper < tail
2748             && (    OP(noper) == flags
2749                 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
2750                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
2751                                          || OP(noper) == EXACTFUP))))
2752         {
2753             uc= (U8*)STRING(noper);
2754             e= uc + STR_LEN(noper);
2755         } else {
2756             trie->minlen= 0;
2757             continue;
2758         }
2759
2760
2761         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2762             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2763                                           regardless of encoding */
2764             if (OP( noper ) == EXACTFUP) {
2765                 /* false positives are ok, so just set this */
2766                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2767             }
2768         }
2769
2770         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2771                                            branch */
2772             TRIE_CHARCOUNT(trie)++;
2773             TRIE_READ_CHAR;
2774
2775             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2776              * is in effect.  Under /i, this character can match itself, or
2777              * anything that folds to it.  If not under /i, it can match just
2778              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2779              * all fold to k, and all are single characters.   But some folds
2780              * expand to more than one character, so for example LATIN SMALL
2781              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2782              * the string beginning at 'uc' is 'ffi', it could be matched by
2783              * three characters, or just by the one ligature character. (It
2784              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2785              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2786              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2787              * match.)  The trie needs to know the minimum and maximum number
2788              * of characters that could match so that it can use size alone to
2789              * quickly reject many match attempts.  The max is simple: it is
2790              * the number of folded characters in this branch (since a fold is
2791              * never shorter than what folds to it. */
2792
2793             maxchars++;
2794
2795             /* And the min is equal to the max if not under /i (indicated by
2796              * 'folder' being NULL), or there are no multi-character folds.  If
2797              * there is a multi-character fold, the min is incremented just
2798              * once, for the character that folds to the sequence.  Each
2799              * character in the sequence needs to be added to the list below of
2800              * characters in the trie, but we count only the first towards the
2801              * min number of characters needed.  This is done through the
2802              * variable 'foldlen', which is returned by the macros that look
2803              * for these sequences as the number of bytes the sequence
2804              * occupies.  Each time through the loop, we decrement 'foldlen' by
2805              * how many bytes the current char occupies.  Only when it reaches
2806              * 0 do we increment 'minchars' or look for another multi-character
2807              * sequence. */
2808             if (folder == NULL) {
2809                 minchars++;
2810             }
2811             else if (foldlen > 0) {
2812                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2813             }
2814             else {
2815                 minchars++;
2816
2817                 /* See if *uc is the beginning of a multi-character fold.  If
2818                  * so, we decrement the length remaining to look at, to account
2819                  * for the current character this iteration.  (We can use 'uc'
2820                  * instead of the fold returned by TRIE_READ_CHAR because for
2821                  * non-UTF, the latin1_safe macro is smart enough to account
2822                  * for all the unfolded characters, and because for UTF, the
2823                  * string will already have been folded earlier in the
2824                  * compilation process */
2825                 if (UTF) {
2826                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2827                         foldlen -= UTF8SKIP(uc);
2828                     }
2829                 }
2830                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2831                     foldlen--;
2832                 }
2833             }
2834
2835             /* The current character (and any potential folds) should be added
2836              * to the possible matching characters for this position in this
2837              * branch */
2838             if ( uvc < 256 ) {
2839                 if ( folder ) {
2840                     U8 folded= folder[ (U8) uvc ];
2841                     if ( !trie->charmap[ folded ] ) {
2842                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2843                         TRIE_STORE_REVCHAR( folded );
2844                     }
2845                 }
2846                 if ( !trie->charmap[ uvc ] ) {
2847                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2848                     TRIE_STORE_REVCHAR( uvc );
2849                 }
2850                 if ( set_bit ) {
2851                     /* store the codepoint in the bitmap, and its folded
2852                      * equivalent. */
2853                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2854                     set_bit = 0; /* We've done our bit :-) */
2855                 }
2856             } else {
2857
2858                 /* XXX We could come up with the list of code points that fold
2859                  * to this using PL_utf8_foldclosures, except not for
2860                  * multi-char folds, as there may be multiple combinations
2861                  * there that could work, which needs to wait until runtime to
2862                  * resolve (The comment about LIGATURE FFI above is such an
2863                  * example */
2864
2865                 SV** svpp;
2866                 if ( !widecharmap )
2867                     widecharmap = newHV();
2868
2869                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2870
2871                 if ( !svpp )
2872                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2873
2874                 if ( !SvTRUE( *svpp ) ) {
2875                     sv_setiv( *svpp, ++trie->uniquecharcount );
2876                     TRIE_STORE_REVCHAR(uvc);
2877                 }
2878             }
2879         } /* end loop through characters in this branch of the trie */
2880
2881         /* We take the min and max for this branch and combine to find the min
2882          * and max for all branches processed so far */
2883         if( cur == first ) {
2884             trie->minlen = minchars;
2885             trie->maxlen = maxchars;
2886         } else if (minchars < trie->minlen) {
2887             trie->minlen = minchars;
2888         } else if (maxchars > trie->maxlen) {
2889             trie->maxlen = maxchars;
2890         }
2891     } /* end first pass */
2892     DEBUG_TRIE_COMPILE_r(
2893         Perl_re_indentf( aTHX_
2894                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2895                 depth+1,
2896                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2897                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2898                 (int)trie->minlen, (int)trie->maxlen )
2899     );
2900
2901     /*
2902         We now know what we are dealing with in terms of unique chars and
2903         string sizes so we can calculate how much memory a naive
2904         representation using a flat table  will take. If it's over a reasonable
2905         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2906         conservative but potentially much slower representation using an array
2907         of lists.
2908
2909         At the end we convert both representations into the same compressed
2910         form that will be used in regexec.c for matching with. The latter
2911         is a form that cannot be used to construct with but has memory
2912         properties similar to the list form and access properties similar
2913         to the table form making it both suitable for fast searches and
2914         small enough that its feasable to store for the duration of a program.
2915
2916         See the comment in the code where the compressed table is produced
2917         inplace from the flat tabe representation for an explanation of how
2918         the compression works.
2919
2920     */
2921
2922
2923     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2924     prev_states[1] = 0;
2925
2926     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2927                                                     > SvIV(re_trie_maxbuff) )
2928     {
2929         /*
2930             Second Pass -- Array Of Lists Representation
2931
2932             Each state will be represented by a list of charid:state records
2933             (reg_trie_trans_le) the first such element holds the CUR and LEN
2934             points of the allocated array. (See defines above).
2935
2936             We build the initial structure using the lists, and then convert
2937             it into the compressed table form which allows faster lookups
2938             (but cant be modified once converted).
2939         */
2940
2941         STRLEN transcount = 1;
2942
2943         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
2944             depth+1));
2945
2946         trie->states = (reg_trie_state *)
2947             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2948                                   sizeof(reg_trie_state) );
2949         TRIE_LIST_NEW(1);
2950         next_alloc = 2;
2951
2952         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2953
2954             regnode *noper   = NEXTOPER( cur );
2955             U32 state        = 1;         /* required init */
2956             U16 charid       = 0;         /* sanity init */
2957             U32 wordlen      = 0;         /* required init */
2958
2959             if (OP(noper) == NOTHING) {
2960                 regnode *noper_next= regnext(noper);
2961                 if (noper_next < tail)
2962                     noper= noper_next;
2963             }
2964
2965             if (    noper < tail
2966                 && (    OP(noper) == flags
2967                     || (flags == EXACT && OP(noper) == EXACT_ONLY8)
2968                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
2969                                              || OP(noper) == EXACTFUP))))
2970             {
2971                 const U8 *uc= (U8*)STRING(noper);
2972                 const U8 *e= uc + STR_LEN(noper);
2973
2974                 for ( ; uc < e ; uc += len ) {
2975
2976                     TRIE_READ_CHAR;
2977
2978                     if ( uvc < 256 ) {
2979                         charid = trie->charmap[ uvc ];
2980                     } else {
2981                         SV** const svpp = hv_fetch( widecharmap,
2982                                                     (char*)&uvc,
2983                                                     sizeof( UV ),
2984                                                     0);
2985                         if ( !svpp ) {
2986                             charid = 0;
2987                         } else {
2988                             charid=(U16)SvIV( *svpp );
2989                         }
2990                     }
2991                     /* charid is now 0 if we dont know the char read, or
2992                      * nonzero if we do */
2993                     if ( charid ) {
2994
2995                         U16 check;
2996                         U32 newstate = 0;
2997
2998                         charid--;
2999                         if ( !trie->states[ state ].trans.list ) {
3000                             TRIE_LIST_NEW( state );
3001                         }
3002                         for ( check = 1;
3003                               check <= TRIE_LIST_USED( state );
3004                               check++ )
3005                         {
3006                             if ( TRIE_LIST_ITEM( state, check ).forid
3007                                                                     == charid )
3008                             {
3009                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3010                                 break;
3011                             }
3012                         }
3013                         if ( ! newstate ) {
3014                             newstate = next_alloc++;
3015                             prev_states[newstate] = state;
3016                             TRIE_LIST_PUSH( state, charid, newstate );
3017                             transcount++;
3018                         }
3019                         state = newstate;
3020                     } else {
3021                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3022                     }
3023                 }
3024             }
3025             TRIE_HANDLE_WORD(state);
3026
3027         } /* end second pass */
3028
3029         /* next alloc is the NEXT state to be allocated */
3030         trie->statecount = next_alloc;
3031         trie->states = (reg_trie_state *)
3032             PerlMemShared_realloc( trie->states,
3033                                    next_alloc
3034                                    * sizeof(reg_trie_state) );
3035
3036         /* and now dump it out before we compress it */
3037         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3038                                                          revcharmap, next_alloc,
3039                                                          depth+1)
3040         );
3041
3042         trie->trans = (reg_trie_trans *)
3043             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3044         {
3045             U32 state;
3046             U32 tp = 0;
3047             U32 zp = 0;
3048
3049
3050             for( state=1 ; state < next_alloc ; state ++ ) {
3051                 U32 base=0;
3052
3053                 /*
3054                 DEBUG_TRIE_COMPILE_MORE_r(
3055                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
3056                 );
3057                 */
3058
3059                 if (trie->states[state].trans.list) {
3060                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3061                     U16 maxid=minid;
3062                     U16 idx;
3063
3064                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3065                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3066                         if ( forid < minid ) {
3067                             minid=forid;
3068                         } else if ( forid > maxid ) {
3069                             maxid=forid;
3070                         }
3071                     }
3072                     if ( transcount < tp + maxid - minid + 1) {
3073                         transcount *= 2;
3074                         trie->trans = (reg_trie_trans *)
3075                             PerlMemShared_realloc( trie->trans,
3076                                                      transcount
3077                                                      * sizeof(reg_trie_trans) );
3078                         Zero( trie->trans + (transcount / 2),
3079                               transcount / 2,
3080                               reg_trie_trans );
3081                     }
3082                     base = trie->uniquecharcount + tp - minid;
3083                     if ( maxid == minid ) {
3084                         U32 set = 0;
3085                         for ( ; zp < tp ; zp++ ) {
3086                             if ( ! trie->trans[ zp ].next ) {
3087                                 base = trie->uniquecharcount + zp - minid;
3088                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3089                                                                    1).newstate;
3090                                 trie->trans[ zp ].check = state;
3091                                 set = 1;
3092                                 break;
3093                             }
3094                         }
3095                         if ( !set ) {
3096                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3097                                                                    1).newstate;
3098                             trie->trans[ tp ].check = state;
3099                             tp++;
3100                             zp = tp;
3101                         }
3102                     } else {
3103                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3104                             const U32 tid = base
3105                                            - trie->uniquecharcount
3106                                            + TRIE_LIST_ITEM( state, idx ).forid;
3107                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3108                                                                 idx ).newstate;
3109                             trie->trans[ tid ].check = state;
3110                         }
3111                         tp += ( maxid - minid + 1 );
3112                     }
3113                     Safefree(trie->states[ state ].trans.list);
3114                 }
3115                 /*
3116                 DEBUG_TRIE_COMPILE_MORE_r(
3117                     Perl_re_printf( aTHX_  " base: %d\n",base);
3118                 );
3119                 */
3120                 trie->states[ state ].trans.base=base;
3121             }
3122             trie->lasttrans = tp + 1;
3123         }
3124     } else {
3125         /*
3126            Second Pass -- Flat Table Representation.
3127
3128            we dont use the 0 slot of either trans[] or states[] so we add 1 to
3129            each.  We know that we will need Charcount+1 trans at most to store
3130            the data (one row per char at worst case) So we preallocate both
3131            structures assuming worst case.
3132
3133            We then construct the trie using only the .next slots of the entry
3134            structs.
3135
3136            We use the .check field of the first entry of the node temporarily
3137            to make compression both faster and easier by keeping track of how
3138            many non zero fields are in the node.
3139
3140            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3141            transition.
3142
3143            There are two terms at use here: state as a TRIE_NODEIDX() which is
3144            a number representing the first entry of the node, and state as a
3145            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3146            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3147            if there are 2 entrys per node. eg:
3148
3149              A B       A B
3150           1. 2 4    1. 3 7
3151           2. 0 3    3. 0 5
3152           3. 0 0    5. 0 0
3153           4. 0 0    7. 0 0
3154
3155            The table is internally in the right hand, idx form. However as we
3156            also have to deal with the states array which is indexed by nodenum
3157            we have to use TRIE_NODENUM() to convert.
3158
3159         */
3160         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3161             depth+1));
3162
3163         trie->trans = (reg_trie_trans *)
3164             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3165                                   * trie->uniquecharcount + 1,
3166                                   sizeof(reg_trie_trans) );
3167         trie->states = (reg_trie_state *)
3168             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3169                                   sizeof(reg_trie_state) );
3170         next_alloc = trie->uniquecharcount + 1;
3171
3172
3173         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3174
3175             regnode *noper   = NEXTOPER( cur );
3176
3177             U32 state        = 1;         /* required init */
3178
3179             U16 charid       = 0;         /* sanity init */
3180             U32 accept_state = 0;         /* sanity init */
3181
3182             U32 wordlen      = 0;         /* required init */
3183
3184             if (OP(noper) == NOTHING) {
3185                 regnode *noper_next= regnext(noper);
3186                 if (noper_next < tail)
3187                     noper= noper_next;
3188             }
3189
3190             if (    noper < tail
3191                 && (    OP(noper) == flags
3192                     || (flags == EXACT && OP(noper) == EXACT_ONLY8)
3193                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
3194                                              || OP(noper) == EXACTFUP))))
3195             {
3196                 const U8 *uc= (U8*)STRING(noper);
3197                 const U8 *e= uc + STR_LEN(noper);
3198
3199                 for ( ; uc < e ; uc += len ) {
3200
3201                     TRIE_READ_CHAR;
3202
3203                     if ( uvc < 256 ) {
3204                         charid = trie->charmap[ uvc ];
3205                     } else {
3206                         SV* const * const svpp = hv_fetch( widecharmap,
3207                                                            (char*)&uvc,
3208                                                            sizeof( UV ),
3209                                                            0);
3210                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3211                     }
3212                     if ( charid ) {
3213                         charid--;
3214                         if ( !trie->trans[ state + charid ].next ) {
3215                             trie->trans[ state + charid ].next = next_alloc;
3216                             trie->trans[ state ].check++;
3217                             prev_states[TRIE_NODENUM(next_alloc)]
3218                                     = TRIE_NODENUM(state);
3219                             next_alloc += trie->uniquecharcount;
3220                         }
3221                         state = trie->trans[ state + charid ].next;
3222                     } else {
3223                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3224                     }
3225                     /* charid is now 0 if we dont know the char read, or
3226                      * nonzero if we do */
3227                 }
3228             }
3229             accept_state = TRIE_NODENUM( state );
3230             TRIE_HANDLE_WORD(accept_state);
3231
3232         } /* end second pass */
3233
3234         /* and now dump it out before we compress it */
3235         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3236                                                           revcharmap,
3237                                                           next_alloc, depth+1));
3238
3239         {
3240         /*
3241            * Inplace compress the table.*
3242
3243            For sparse data sets the table constructed by the trie algorithm will
3244            be mostly 0/FAIL transitions or to put it another way mostly empty.
3245            (Note that leaf nodes will not contain any transitions.)
3246
3247            This algorithm compresses the tables by eliminating most such
3248            transitions, at the cost of a modest bit of extra work during lookup:
3249
3250            - Each states[] entry contains a .base field which indicates the
3251            index in the state[] array wheres its transition data is stored.
3252
3253            - If .base is 0 there are no valid transitions from that node.
3254
3255            - If .base is nonzero then charid is added to it to find an entry in
3256            the trans array.
3257
3258            -If trans[states[state].base+charid].check!=state then the
3259            transition is taken to be a 0/Fail transition. Thus if there are fail
3260            transitions at the front of the node then the .base offset will point
3261            somewhere inside the previous nodes data (or maybe even into a node
3262            even earlier), but the .check field determines if the transition is
3263            valid.
3264
3265            XXX - wrong maybe?
3266            The following process inplace converts the table to the compressed
3267            table: We first do not compress the root node 1,and mark all its
3268            .check pointers as 1 and set its .base pointer as 1 as well. This
3269            allows us to do a DFA construction from the compressed table later,
3270            and ensures that any .base pointers we calculate later are greater
3271            than 0.
3272
3273            - We set 'pos' to indicate the first entry of the second node.
3274
3275            - We then iterate over the columns of the node, finding the first and
3276            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3277            and set the .check pointers accordingly, and advance pos
3278            appropriately and repreat for the next node. Note that when we copy
3279            the next pointers we have to convert them from the original
3280            NODEIDX form to NODENUM form as the former is not valid post
3281            compression.
3282
3283            - If a node has no transitions used we mark its base as 0 and do not
3284            advance the pos pointer.
3285
3286            - If a node only has one transition we use a second pointer into the
3287            structure to fill in allocated fail transitions from other states.
3288            This pointer is independent of the main pointer and scans forward
3289            looking for null transitions that are allocated to a state. When it
3290            finds one it writes the single transition into the "hole".  If the
3291            pointer doesnt find one the single transition is appended as normal.
3292
3293            - Once compressed we can Renew/realloc the structures to release the
3294            excess space.
3295
3296            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3297            specifically Fig 3.47 and the associated pseudocode.
3298
3299            demq
3300         */
3301         const U32 laststate = TRIE_NODENUM( next_alloc );
3302         U32 state, charid;
3303         U32 pos = 0, zp=0;
3304         trie->statecount = laststate;
3305
3306         for ( state = 1 ; state < laststate ; state++ ) {
3307             U8 flag = 0;
3308             const U32 stateidx = TRIE_NODEIDX( state );
3309             const U32 o_used = trie->trans[ stateidx ].check;
3310             U32 used = trie->trans[ stateidx ].check;
3311             trie->trans[ stateidx ].check = 0;
3312
3313             for ( charid = 0;
3314                   used && charid < trie->uniquecharcount;
3315                   charid++ )
3316             {
3317                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3318                     if ( trie->trans[ stateidx + charid ].next ) {
3319                         if (o_used == 1) {
3320                             for ( ; zp < pos ; zp++ ) {
3321                                 if ( ! trie->trans[ zp ].next ) {
3322                                     break;
3323                                 }
3324                             }
3325                             trie->states[ state ].trans.base
3326                                                     = zp
3327                                                       + trie->uniquecharcount
3328                                                       - charid ;
3329                             trie->trans[ zp ].next
3330                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3331                                                              + charid ].next );
3332                             trie->trans[ zp ].check = state;
3333                             if ( ++zp > pos ) pos = zp;
3334                             break;
3335                         }
3336                         used--;
3337                     }
3338                     if ( !flag ) {
3339                         flag = 1;
3340                         trie->states[ state ].trans.base
3341                                        = pos + trie->uniquecharcount - charid ;
3342                     }
3343                     trie->trans[ pos ].next
3344                         = SAFE_TRIE_NODENUM(
3345                                        trie->trans[ stateidx + charid ].next );
3346                     trie->trans[ pos ].check = state;
3347                     pos++;
3348                 }
3349             }
3350         }
3351         trie->lasttrans = pos + 1;
3352         trie->states = (reg_trie_state *)
3353             PerlMemShared_realloc( trie->states, laststate
3354                                    * sizeof(reg_trie_state) );
3355         DEBUG_TRIE_COMPILE_MORE_r(
3356             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3357                 depth+1,
3358                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3359                        + 1 ),
3360                 (IV)next_alloc,
3361                 (IV)pos,
3362                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3363             );
3364
3365         } /* end table compress */
3366     }
3367     DEBUG_TRIE_COMPILE_MORE_r(
3368             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3369                 depth+1,
3370                 (UV)trie->statecount,
3371                 (UV)trie->lasttrans)
3372     );
3373     /* resize the trans array to remove unused space */
3374     trie->trans = (reg_trie_trans *)
3375         PerlMemShared_realloc( trie->trans, trie->lasttrans
3376                                * sizeof(reg_trie_trans) );
3377
3378     {   /* Modify the program and insert the new TRIE node */
3379         U8 nodetype =(U8)(flags & 0xFF);
3380         char *str=NULL;
3381
3382 #ifdef DEBUGGING
3383         regnode *optimize = NULL;
3384 #ifdef RE_TRACK_PATTERN_OFFSETS
3385
3386         U32 mjd_offset = 0;
3387         U32 mjd_nodelen = 0;
3388 #endif /* RE_TRACK_PATTERN_OFFSETS */
3389 #endif /* DEBUGGING */
3390         /*
3391            This means we convert either the first branch or the first Exact,
3392            depending on whether the thing following (in 'last') is a branch
3393            or not and whther first is the startbranch (ie is it a sub part of
3394            the alternation or is it the whole thing.)
3395            Assuming its a sub part we convert the EXACT otherwise we convert
3396            the whole branch sequence, including the first.
3397          */
3398         /* Find the node we are going to overwrite */
3399         if ( first != startbranch || OP( last ) == BRANCH ) {
3400             /* branch sub-chain */
3401             NEXT_OFF( first ) = (U16)(last - first);
3402 #ifdef RE_TRACK_PATTERN_OFFSETS
3403             DEBUG_r({
3404                 mjd_offset= Node_Offset((convert));
3405                 mjd_nodelen= Node_Length((convert));
3406             });
3407 #endif
3408             /* whole branch chain */
3409         }
3410 #ifdef RE_TRACK_PATTERN_OFFSETS
3411         else {
3412             DEBUG_r({
3413                 const  regnode *nop = NEXTOPER( convert );
3414                 mjd_offset= Node_Offset((nop));
3415                 mjd_nodelen= Node_Length((nop));
3416             });
3417         }
3418         DEBUG_OPTIMISE_r(
3419             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3420                 depth+1,
3421                 (UV)mjd_offset, (UV)mjd_nodelen)
3422         );
3423 #endif
3424         /* But first we check to see if there is a common prefix we can
3425            split out as an EXACT and put in front of the TRIE node.  */
3426         trie->startstate= 1;
3427         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3428             /* we want to find the first state that has more than
3429              * one transition, if that state is not the first state
3430              * then we have a common prefix which we can remove.
3431              */
3432             U32 state;
3433             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3434                 U32 ofs = 0;
3435                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3436                                        transition, -1 means none */
3437                 U32 count = 0;
3438                 const U32 base = trie->states[ state ].trans.base;
3439
3440                 /* does this state terminate an alternation? */
3441                 if ( trie->states[state].wordnum )
3442                         count = 1;
3443
3444                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3445                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3446                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3447                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3448                     {
3449                         if ( ++count > 1 ) {
3450                             /* we have more than one transition */
3451                             SV **tmp;
3452                             U8 *ch;
3453                             /* if this is the first state there is no common prefix
3454                              * to extract, so we can exit */
3455                             if ( state == 1 ) break;
3456                             tmp = av_fetch( revcharmap, ofs, 0);
3457                             ch = (U8*)SvPV_nolen_const( *tmp );
3458
3459                             /* if we are on count 2 then we need to initialize the
3460                              * bitmap, and store the previous char if there was one
3461                              * in it*/
3462                             if ( count == 2 ) {
3463                                 /* clear the bitmap */
3464                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3465                                 DEBUG_OPTIMISE_r(
3466                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3467                                         depth+1,
3468                                         (UV)state));
3469                                 if (first_ofs >= 0) {
3470                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3471                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3472
3473                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3474                                     DEBUG_OPTIMISE_r(
3475                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3476                                     );
3477                                 }
3478                             }
3479                             /* store the current firstchar in the bitmap */
3480                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3481                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3482                         }
3483                         first_ofs = ofs;
3484                     }
3485                 }
3486                 if ( count == 1 ) {
3487                     /* This state has only one transition, its transition is part
3488                      * of a common prefix - we need to concatenate the char it
3489                      * represents to what we have so far. */
3490                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3491                     STRLEN len;
3492                     char *ch = SvPV( *tmp, len );
3493                     DEBUG_OPTIMISE_r({
3494                         SV *sv=sv_newmortal();
3495                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3496                             depth+1,
3497                             (UV)state, (UV)first_ofs,
3498                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3499                                 PL_colors[0], PL_colors[1],
3500                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3501                                 PERL_PV_ESCAPE_FIRSTCHAR
3502                             )
3503                         );
3504                     });
3505                     if ( state==1 ) {
3506                         OP( convert ) = nodetype;
3507                         str=STRING(convert);
3508                         STR_LEN(convert)=0;
3509                     }
3510                     STR_LEN(convert) += len;
3511                     while (len--)
3512                         *str++ = *ch++;
3513                 } else {
3514 #ifdef DEBUGGING
3515                     if (state>1)
3516                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3517 #endif
3518                     break;
3519                 }
3520             }
3521             trie->prefixlen = (state-1);
3522             if (str) {
3523                 regnode *n = convert+NODE_SZ_STR(convert);
3524                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3525                 trie->startstate = state;
3526                 trie->minlen -= (state - 1);
3527                 trie->maxlen -= (state - 1);
3528 #ifdef DEBUGGING
3529                /* At least the UNICOS C compiler choked on this
3530                 * being argument to DEBUG_r(), so let's just have
3531                 * it right here. */
3532                if (
3533 #ifdef PERL_EXT_RE_BUILD
3534                    1
3535 #else
3536                    DEBUG_r_TEST
3537 #endif
3538                    ) {
3539                    regnode *fix = convert;
3540                    U32 word = trie->wordcount;
3541 #ifdef RE_TRACK_PATTERN_OFFSETS
3542                    mjd_nodelen++;
3543 #endif
3544                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3545                    while( ++fix < n ) {
3546                        Set_Node_Offset_Length(fix, 0, 0);
3547                    }
3548                    while (word--) {
3549                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3550                        if (tmp) {
3551                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3552                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3553                            else
3554                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3555                        }
3556                    }
3557                }
3558 #endif
3559                 if (trie->maxlen) {
3560                     convert = n;
3561                 } else {
3562                     NEXT_OFF(convert) = (U16)(tail - convert);
3563                     DEBUG_r(optimize= n);
3564                 }
3565             }
3566         }
3567         if (!jumper)
3568             jumper = last;
3569         if ( trie->maxlen ) {
3570             NEXT_OFF( convert ) = (U16)(tail - convert);
3571             ARG_SET( convert, data_slot );
3572             /* Store the offset to the first unabsorbed branch in
3573                jump[0], which is otherwise unused by the jump logic.
3574                We use this when dumping a trie and during optimisation. */
3575             if (trie->jump)
3576                 trie->jump[0] = (U16)(nextbranch - convert);
3577
3578             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3579              *   and there is a bitmap
3580              *   and the first "jump target" node we found leaves enough room
3581              * then convert the TRIE node into a TRIEC node, with the bitmap
3582              * embedded inline in the opcode - this is hypothetically faster.
3583              */
3584             if ( !trie->states[trie->startstate].wordnum
3585                  && trie->bitmap
3586                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3587             {
3588                 OP( convert ) = TRIEC;
3589                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3590                 PerlMemShared_free(trie->bitmap);
3591                 trie->bitmap= NULL;
3592             } else
3593                 OP( convert ) = TRIE;
3594
3595             /* store the type in the flags */
3596             convert->flags = nodetype;
3597             DEBUG_r({
3598             optimize = convert
3599                       + NODE_STEP_REGNODE
3600                       + regarglen[ OP( convert ) ];
3601             });
3602             /* XXX We really should free up the resource in trie now,
3603                    as we won't use them - (which resources?) dmq */
3604         }
3605         /* needed for dumping*/
3606         DEBUG_r(if (optimize) {
3607             regnode *opt = convert;
3608
3609             while ( ++opt < optimize) {
3610                 Set_Node_Offset_Length(opt, 0, 0);
3611             }
3612             /*
3613                 Try to clean up some of the debris left after the
3614                 optimisation.
3615              */
3616             while( optimize < jumper ) {
3617                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3618                 OP( optimize ) = OPTIMIZED;
3619                 Set_Node_Offset_Length(optimize, 0, 0);
3620                 optimize++;
3621             }
3622             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3623         });
3624     } /* end node insert */
3625
3626     /*  Finish populating the prev field of the wordinfo array.  Walk back
3627      *  from each accept state until we find another accept state, and if
3628      *  so, point the first word's .prev field at the second word. If the
3629      *  second already has a .prev field set, stop now. This will be the
3630      *  case either if we've already processed that word's accept state,
3631      *  or that state had multiple words, and the overspill words were
3632      *  already linked up earlier.
3633      */
3634     {
3635         U16 word;
3636         U32 state;
3637         U16 prev;
3638
3639         for (word=1; word <= trie->wordcount; word++) {
3640             prev = 0;
3641             if (trie->wordinfo[word].prev)
3642                 continue;
3643             state = trie->wordinfo[word].accept;
3644             while (state) {
3645                 state = prev_states[state];
3646                 if (!state)
3647                     break;
3648                 prev = trie->states[state].wordnum;
3649                 if (prev)
3650                     break;
3651             }
3652             trie->wordinfo[word].prev = prev;
3653         }
3654         Safefree(prev_states);
3655     }
3656
3657
3658     /* and now dump out the compressed format */
3659     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3660
3661     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3662 #ifdef DEBUGGING
3663     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3664     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3665 #else
3666     SvREFCNT_dec_NN(revcharmap);
3667 #endif
3668     return trie->jump
3669            ? MADE_JUMP_TRIE
3670            : trie->startstate>1
3671              ? MADE_EXACT_TRIE
3672              : MADE_TRIE;
3673 }
3674
3675 STATIC regnode *
3676 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3677 {
3678 /* The Trie is constructed and compressed now so we can build a fail array if
3679  * it's needed
3680
3681    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3682    3.32 in the
3683    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3684    Ullman 1985/88
3685    ISBN 0-201-10088-6
3686
3687    We find the fail state for each state in the trie, this state is the longest
3688    proper suffix of the current state's 'word' that is also a proper prefix of
3689    another word in our trie. State 1 represents the word '' and is thus the
3690    default fail state. This allows the DFA not to have to restart after its
3691    tried and failed a word at a given point, it simply continues as though it
3692    had been matching the other word in the first place.
3693    Consider
3694       'abcdgu'=~/abcdefg|cdgu/
3695    When we get to 'd' we are still matching the first word, we would encounter
3696    'g' which would fail, which would bring us to the state representing 'd' in
3697    the second word where we would try 'g' and succeed, proceeding to match
3698    'cdgu'.
3699  */
3700  /* add a fail transition */
3701     const U32 trie_offset = ARG(source);
3702     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3703     U32 *q;
3704     const U32 ucharcount = trie->uniquecharcount;
3705     const U32 numstates = trie->statecount;
3706     const U32 ubound = trie->lasttrans + ucharcount;
3707     U32 q_read = 0;
3708     U32 q_write = 0;
3709     U32 charid;
3710     U32 base = trie->states[ 1 ].trans.base;
3711     U32 *fail;
3712     reg_ac_data *aho;
3713     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3714     regnode *stclass;
3715     GET_RE_DEBUG_FLAGS_DECL;
3716
3717     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3718     PERL_UNUSED_CONTEXT;
3719 #ifndef DEBUGGING
3720     PERL_UNUSED_ARG(depth);
3721 #endif
3722
3723     if ( OP(source) == TRIE ) {
3724         struct regnode_1 *op = (struct regnode_1 *)
3725             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3726         StructCopy(source, op, struct regnode_1);
3727         stclass = (regnode *)op;
3728     } else {
3729         struct regnode_charclass *op = (struct regnode_charclass *)
3730             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3731         StructCopy(source, op, struct regnode_charclass);
3732         stclass = (regnode *)op;
3733     }
3734     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3735
3736     ARG_SET( stclass, data_slot );
3737     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3738     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3739     aho->trie=trie_offset;
3740     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3741     Copy( trie->states, aho->states, numstates, reg_trie_state );
3742     Newx( q, numstates, U32);
3743     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3744     aho->refcount = 1;
3745     fail = aho->fail;
3746     /* initialize fail[0..1] to be 1 so that we always have
3747        a valid final fail state */
3748     fail[ 0 ] = fail[ 1 ] = 1;
3749
3750     for ( charid = 0; charid < ucharcount ; charid++ ) {
3751         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3752         if ( newstate ) {
3753             q[ q_write ] = newstate;
3754             /* set to point at the root */
3755             fail[ q[ q_write++ ] ]=1;
3756         }
3757     }
3758     while ( q_read < q_write) {
3759         const U32 cur = q[ q_read++ % numstates ];
3760         base = trie->states[ cur ].trans.base;
3761
3762         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3763             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3764             if (ch_state) {
3765                 U32 fail_state = cur;
3766                 U32 fail_base;
3767                 do {
3768                     fail_state = fail[ fail_state ];
3769                     fail_base = aho->states[ fail_state ].trans.base;
3770                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3771
3772                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3773                 fail[ ch_state ] = fail_state;
3774                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3775                 {
3776                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3777                 }
3778                 q[ q_write++ % numstates] = ch_state;
3779             }
3780         }
3781     }
3782     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3783        when we fail in state 1, this allows us to use the
3784        charclass scan to find a valid start char. This is based on the principle
3785        that theres a good chance the string being searched contains lots of stuff
3786        that cant be a start char.
3787      */
3788     fail[ 0 ] = fail[ 1 ] = 0;
3789     DEBUG_TRIE_COMPILE_r({
3790         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3791                       depth, (UV)numstates
3792         );
3793         for( q_read=1; q_read<numstates; q_read++ ) {
3794             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3795         }
3796         Perl_re_printf( aTHX_  "\n");
3797     });
3798     Safefree(q);
3799     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3800     return stclass;
3801 }
3802
3803
3804 /* The below joins as many adjacent EXACTish nodes as possible into a single
3805  * one.  The regop may be changed if the node(s) contain certain sequences that
3806  * require special handling.  The joining is only done if:
3807  * 1) there is room in the current conglomerated node to entirely contain the
3808  *    next one.
3809  * 2) they are compatible node types
3810  *
3811  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3812  * these get optimized out
3813  *
3814  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3815  * as possible, even if that means splitting an existing node so that its first
3816  * part is moved to the preceeding node.  This would maximise the efficiency of
3817  * memEQ during matching.
3818  *
3819  * If a node is to match under /i (folded), the number of characters it matches
3820  * can be different than its character length if it contains a multi-character
3821  * fold.  *min_subtract is set to the total delta number of characters of the
3822  * input nodes.
3823  *
3824  * And *unfolded_multi_char is set to indicate whether or not the node contains
3825  * an unfolded multi-char fold.  This happens when it won't be known until
3826  * runtime whether the fold is valid or not; namely
3827  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3828  *      target string being matched against turns out to be UTF-8 is that fold
3829  *      valid; or
3830  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
3831  *      runtime.
3832  * (Multi-char folds whose components are all above the Latin1 range are not
3833  * run-time locale dependent, and have already been folded by the time this
3834  * function is called.)
3835  *
3836  * This is as good a place as any to discuss the design of handling these
3837  * multi-character fold sequences.  It's been wrong in Perl for a very long
3838  * time.  There are three code points in Unicode whose multi-character folds
3839  * were long ago discovered to mess things up.  The previous designs for
3840  * dealing with these involved assigning a special node for them.  This
3841  * approach doesn't always work, as evidenced by this example:
3842  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3843  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3844  * would match just the \xDF, it won't be able to handle the case where a
3845  * successful match would have to cross the node's boundary.  The new approach
3846  * that hopefully generally solves the problem generates an EXACTFUP node
3847  * that is "sss" in this case.
3848  *
3849  * It turns out that there are problems with all multi-character folds, and not
3850  * just these three.  Now the code is general, for all such cases.  The
3851  * approach taken is:
3852  * 1)   This routine examines each EXACTFish node that could contain multi-
3853  *      character folded sequences.  Since a single character can fold into
3854  *      such a sequence, the minimum match length for this node is less than
3855  *      the number of characters in the node.  This routine returns in
3856  *      *min_subtract how many characters to subtract from the the actual
3857  *      length of the string to get a real minimum match length; it is 0 if
3858  *      there are no multi-char foldeds.  This delta is used by the caller to
3859  *      adjust the min length of the match, and the delta between min and max,
3860  *      so that the optimizer doesn't reject these possibilities based on size
3861  *      constraints.
3862  *
3863  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
3864  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
3865  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
3866  *      EXACTFU nodes.  The node type of such nodes is then changed to
3867  *      EXACTFUP, indicating it is problematic, and needs careful handling.
3868  *      (The procedures in step 1) above are sufficient to handle this case in
3869  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
3870  *      the only case where there is a possible fold length change in non-UTF-8
3871  *      patterns.  By reserving a special node type for problematic cases, the
3872  *      far more common regular EXACTFU nodes can be processed faster.
3873  *      regexec.c takes advantage of this.
3874  *
3875  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
3876  *      problematic cases.   These all only occur when the pattern is not
3877  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
3878  *      length change, it handles the situation where the string cannot be
3879  *      entirely folded.  The strings in an EXACTFish node are folded as much
3880  *      as possible during compilation in regcomp.c.  This saves effort in
3881  *      regex matching.  By using an EXACTFUP node when it is not possible to
3882  *      fully fold at compile time, regexec.c can know that everything in an
3883  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
3884  *      case where folding in EXACTFU nodes can't be done at compile time is
3885  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
3886  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
3887  *      handle two very different cases.  Alternatively, there could have been
3888  *      a node type where there are length changes, one for unfolded, and one
3889  *      for both.  If yet another special case needed to be created, the number
3890  *      of required node types would have to go to 7.  khw figures that even
3891  *      though there are plenty of node types to spare, that the maintenance
3892  *      cost wasn't worth the small speedup of doing it that way, especially
3893  *      since he thinks the MICRO SIGN is rarely encountered in practice.
3894  *
3895  *      There are other cases where folding isn't done at compile time, but
3896  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
3897  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
3898  *      changes.  Some folds in EXACTF depend on if the runtime target string
3899  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
3900  *      when no fold in it depends on the UTF-8ness of the target string.)
3901  *
3902  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3903  *      validity of the fold won't be known until runtime, and so must remain
3904  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
3905  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3906  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3907  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3908  *      The reason this is a problem is that the optimizer part of regexec.c
3909  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3910  *      that a character in the pattern corresponds to at most a single
3911  *      character in the target string.  (And I do mean character, and not byte
3912  *      here, unlike other parts of the documentation that have never been
3913  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
3914  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
3915  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
3916  *      EXACTFL nodes, violate the assumption, and they are the only instances
3917  *      where it is violated.  I'm reluctant to try to change the assumption,
3918  *      as the code involved is impenetrable to me (khw), so instead the code
3919  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
3920  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
3921  *      boolean indicating whether or not the node contains such a fold.  When
3922  *      it is true, the caller sets a flag that later causes the optimizer in
3923  *      this file to not set values for the floating and fixed string lengths,
3924  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3925  *      assumption.  Thus, there is no optimization based on string lengths for
3926  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3927  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
3928  *      assumption is wrong only in these cases is that all other non-UTF-8
3929  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3930  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3931  *      EXACTF nodes because we don't know at compile time if it actually
3932  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3933  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3934  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
3935  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3936  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3937  *      string would require the pattern to be forced into UTF-8, the overhead
3938  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3939  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3940  *      locale.)
3941  *
3942  *      Similarly, the code that generates tries doesn't currently handle
3943  *      not-already-folded multi-char folds, and it looks like a pain to change
3944  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
3945  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
3946  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
3947  *      using /iaa matching will be doing so almost entirely with ASCII
3948  *      strings, so this should rarely be encountered in practice */
3949
3950 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3951     if (PL_regkind[OP(scan)] == EXACT) \
3952         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
3953
3954 STATIC U32
3955 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3956                    UV *min_subtract, bool *unfolded_multi_char,
3957                    U32 flags, regnode *val, U32 depth)
3958 {
3959     /* Merge several consecutive EXACTish nodes into one. */
3960
3961     regnode *n = regnext(scan);
3962     U32 stringok = 1;
3963     regnode *next = scan + NODE_SZ_STR(scan);
3964     U32 merged = 0;
3965     U32 stopnow = 0;
3966 #ifdef DEBUGGING
3967     regnode *stop = scan;
3968     GET_RE_DEBUG_FLAGS_DECL;
3969 #else
3970     PERL_UNUSED_ARG(depth);
3971 #endif
3972
3973     PERL_ARGS_ASSERT_JOIN_EXACT;
3974 #ifndef EXPERIMENTAL_INPLACESCAN
3975     PERL_UNUSED_ARG(flags);
3976     PERL_UNUSED_ARG(val);
3977 #endif
3978     DEBUG_PEEP("join", scan, depth, 0);
3979
3980     assert(PL_regkind[OP(scan)] == EXACT);
3981
3982     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3983      * EXACT ones that are mergeable to the current one. */
3984     while (    n
3985            && (    PL_regkind[OP(n)] == NOTHING
3986                || (stringok && PL_regkind[OP(n)] == EXACT))
3987            && NEXT_OFF(n)
3988            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3989     {
3990
3991         if (OP(n) == TAIL || n > next)
3992             stringok = 0;
3993         if (PL_regkind[OP(n)] == NOTHING) {
3994             DEBUG_PEEP("skip:", n, depth, 0);
3995             NEXT_OFF(scan) += NEXT_OFF(n);
3996             next = n + NODE_STEP_REGNODE;
3997 #ifdef DEBUGGING
3998             if (stringok)
3999                 stop = n;
4000 #endif
4001             n = regnext(n);
4002         }
4003         else if (stringok) {
4004             const unsigned int oldl = STR_LEN(scan);
4005             regnode * const nnext = regnext(n);
4006
4007             /* XXX I (khw) kind of doubt that this works on platforms (should
4008              * Perl ever run on one) where U8_MAX is above 255 because of lots
4009              * of other assumptions */
4010             /* Don't join if the sum can't fit into a single node */
4011             if (oldl + STR_LEN(n) > U8_MAX)
4012                 break;
4013
4014             /* Joining something that requires UTF-8 with something that
4015              * doesn't, means the result requires UTF-8. */
4016             if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) {
4017                 OP(scan) = EXACT_ONLY8;
4018             }
4019             else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) {
4020                 ;   /* join is compatible, no need to change OP */
4021             }
4022             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) {
4023                 OP(scan) = EXACTFU_ONLY8;
4024             }
4025             else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
4026                 ;   /* join is compatible, no need to change OP */
4027             }
4028             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4029                 ;   /* join is compatible, no need to change OP */
4030             }
4031             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4032
4033                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4034                   * which can join with EXACTFU ones.  We check for this case
4035                   * here.  These need to be resolved to either EXACTFU or
4036                   * EXACTF at joining time.  They have nothing in them that
4037                   * would forbid them from being the more desirable EXACTFU
4038                   * nodes except that they begin and/or end with a single [Ss].
4039                   * The reason this is problematic is because they could be
4040                   * joined in this loop with an adjacent node that ends and/or
4041                   * begins with [Ss] which would then form the sequence 'ss',
4042                   * which matches differently under /di than /ui, in which case
4043                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
4044                   * formed, the nodes get absorbed into any adjacent EXACTFU
4045                   * node.  And if the only adjacent node is EXACTF, they get
4046                   * absorbed into that, under the theory that a longer node is
4047                   * better than two shorter ones, even if one is EXACTFU.  Note
4048                   * that EXACTFU_ONLY8 is generated only for UTF-8 patterns,
4049                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
4050
4051                 if (STRING(n)[STR_LEN(n)-1] == 's') {
4052
4053                     /* Here the joined node would end with 's'.  If the node
4054                      * following the combination is an EXACTF one, it's better to
4055                      * join this trailing edge 's' node with that one, leaving the
4056                      * current one in 'scan' be the more desirable EXACTFU */
4057                     if (OP(nnext) == EXACTF) {
4058                         break;
4059                     }
4060
4061                     OP(scan) = EXACTFU_S_EDGE;
4062
4063                 }   /* Otherwise, the beginning 's' of the 2nd node just
4064                        becomes an interior 's' in 'scan' */
4065             }
4066             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4067                 ;   /* join is compatible, no need to change OP */
4068             }
4069             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4070
4071                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4072                  * nodes.  But the latter nodes can be also joined with EXACTFU
4073                  * ones, and that is a better outcome, so if the node following
4074                  * 'n' is EXACTFU, quit now so that those two can be joined
4075                  * later */
4076                 if (OP(nnext) == EXACTFU) {
4077                     break;
4078                 }
4079
4080                 /* The join is compatible, and the combined node will be
4081                  * EXACTF.  (These don't care if they begin or end with 's' */
4082             }
4083             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4084                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
4085                     && STRING(n)[0] == 's')
4086                 {
4087                     /* When combined, we have the sequence 'ss', which means we
4088                      * have to remain /di */
4089                     OP(scan) = EXACTF;
4090                 }
4091             }
4092             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4093                 if (STRING(n)[0] == 's') {
4094                     ;   /* Here the join is compatible and the combined node
4095                            starts with 's', no need to change OP */
4096                 }
4097                 else {  /* Now the trailing 's' is in the interior */
4098                     OP(scan) = EXACTFU;
4099                 }
4100             }
4101             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4102
4103                 /* The join is compatible, and the combined node will be
4104                  * EXACTF.  (These don't care if they begin or end with 's' */
4105                 OP(scan) = EXACTF;
4106             }
4107             else if (OP(scan) != OP(n)) {
4108
4109                 /* The only other compatible joinings are the same node type */
4110                 break;
4111             }
4112
4113             DEBUG_PEEP("merg", n, depth, 0);
4114             merged++;
4115
4116             NEXT_OFF(scan) += NEXT_OFF(n);
4117             STR_LEN(scan) += STR_LEN(n);
4118             next = n + NODE_SZ_STR(n);
4119             /* Now we can overwrite *n : */
4120             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4121 #ifdef DEBUGGING
4122             stop = next - 1;
4123 #endif
4124             n = nnext;
4125             if (stopnow) break;
4126         }
4127
4128 #ifdef EXPERIMENTAL_INPLACESCAN
4129         if (flags && !NEXT_OFF(n)) {
4130             DEBUG_PEEP("atch", val, depth, 0);
4131             if (reg_off_by_arg[OP(n)]) {
4132                 ARG_SET(n, val - n);
4133             }
4134             else {
4135                 NEXT_OFF(n) = val - n;
4136             }
4137             stopnow = 1;
4138         }
4139 #endif
4140     }
4141
4142     /* This temporary node can now be turned into EXACTFU, and must, as
4143      * regexec.c doesn't handle it */
4144     if (OP(scan) == EXACTFU_S_EDGE) {
4145         OP(scan) = EXACTFU;
4146     }
4147
4148     *min_subtract = 0;
4149     *unfolded_multi_char = FALSE;
4150
4151     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
4152      * can now analyze for sequences of problematic code points.  (Prior to
4153      * this final joining, sequences could have been split over boundaries, and
4154      * hence missed).  The sequences only happen in folding, hence for any
4155      * non-EXACT EXACTish node */
4156     if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
4157         U8* s0 = (U8*) STRING(scan);
4158         U8* s = s0;
4159         U8* s_end = s0 + STR_LEN(scan);
4160
4161         int total_count_delta = 0;  /* Total delta number of characters that
4162                                        multi-char folds expand to */
4163
4164         /* One pass is made over the node's string looking for all the
4165          * possibilities.  To avoid some tests in the loop, there are two main
4166          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4167          * non-UTF-8 */
4168         if (UTF) {
4169             U8* folded = NULL;
4170
4171             if (OP(scan) == EXACTFL) {
4172                 U8 *d;
4173
4174                 /* An EXACTFL node would already have been changed to another
4175                  * node type unless there is at least one character in it that
4176                  * is problematic; likely a character whose fold definition
4177                  * won't be known until runtime, and so has yet to be folded.
4178                  * For all but the UTF-8 locale, folds are 1-1 in length, but
4179                  * to handle the UTF-8 case, we need to create a temporary
4180                  * folded copy using UTF-8 locale rules in order to analyze it.
4181                  * This is because our macros that look to see if a sequence is
4182                  * a multi-char fold assume everything is folded (otherwise the
4183                  * tests in those macros would be too complicated and slow).
4184                  * Note that here, the non-problematic folds will have already
4185                  * been done, so we can just copy such characters.  We actually
4186                  * don't completely fold the EXACTFL string.  We skip the
4187                  * unfolded multi-char folds, as that would just create work
4188                  * below to figure out the size they already are */
4189
4190                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4191                 d = folded;
4192                 while (s < s_end) {
4193                     STRLEN s_len = UTF8SKIP(s);
4194                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4195                         Copy(s, d, s_len, U8);
4196                         d += s_len;
4197                     }
4198                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
4199                         *unfolded_multi_char = TRUE;
4200                         Copy(s, d, s_len, U8);
4201                         d += s_len;
4202                     }
4203                     else if (isASCII(*s)) {
4204                         *(d++) = toFOLD(*s);
4205                     }
4206                     else {
4207                         STRLEN len;
4208                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4209                         d += len;
4210                     }
4211                     s += s_len;
4212                 }
4213
4214                 /* Point the remainder of the routine to look at our temporary
4215                  * folded copy */
4216                 s = folded;
4217                 s_end = d;
4218             } /* End of creating folded copy of EXACTFL string */
4219
4220             /* Examine the string for a multi-character fold sequence.  UTF-8
4221              * patterns have all characters pre-folded by the time this code is
4222              * executed */
4223             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4224                                      length sequence we are looking for is 2 */
4225             {
4226                 int count = 0;  /* How many characters in a multi-char fold */
4227                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4228                 if (! len) {    /* Not a multi-char fold: get next char */
4229                     s += UTF8SKIP(s);
4230                     continue;
4231                 }
4232
4233                 { /* Here is a generic multi-char fold. */
4234                     U8* multi_end  = s + len;
4235
4236                     /* Count how many characters are in it.  In the case of
4237                      * /aa, no folds which contain ASCII code points are
4238                      * allowed, so check for those, and skip if found. */
4239                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4240                         count = utf8_length(s, multi_end);
4241                         s = multi_end;
4242                     }
4243                     else {
4244                         while (s < multi_end) {
4245                             if (isASCII(*s)) {
4246                                 s++;
4247                                 goto next_iteration;
4248                             }
4249                             else {
4250                                 s += UTF8SKIP(s);
4251                             }
4252                             count++;
4253                         }
4254                     }
4255                 }
4256
4257                 /* The delta is how long the sequence is minus 1 (1 is how long
4258                  * the character that folds to the sequence is) */
4259                 total_count_delta += count - 1;
4260               next_iteration: ;
4261             }
4262
4263             /* We created a temporary folded copy of the string in EXACTFL
4264              * nodes.  Therefore we need to be sure it doesn't go below zero,
4265              * as the real string could be shorter */
4266             if (OP(scan) == EXACTFL) {
4267                 int total_chars = utf8_length((U8*) STRING(scan),
4268                                            (U8*) STRING(scan) + STR_LEN(scan));
4269                 if (total_count_delta > total_chars) {
4270                     total_count_delta = total_chars;
4271                 }
4272             }
4273
4274             *min_subtract += total_count_delta;
4275             Safefree(folded);
4276         }
4277         else if (OP(scan) == EXACTFAA) {
4278
4279             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
4280              * fold to the ASCII range (and there are no existing ones in the
4281              * upper latin1 range).  But, as outlined in the comments preceding
4282              * this function, we need to flag any occurrences of the sharp s.
4283              * This character forbids trie formation (because of added
4284              * complexity) */
4285 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4286    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4287                                       || UNICODE_DOT_DOT_VERSION > 0)
4288             while (s < s_end) {
4289                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4290                     OP(scan) = EXACTFAA_NO_TRIE;
4291                     *unfolded_multi_char = TRUE;
4292                     break;
4293                 }
4294                 s++;
4295             }
4296         }
4297         else {
4298
4299             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
4300              * folds that are all Latin1.  As explained in the comments
4301              * preceding this function, we look also for the sharp s in EXACTF
4302              * and EXACTFL nodes; it can be in the final position.  Otherwise
4303              * we can stop looking 1 byte earlier because have to find at least
4304              * two characters for a multi-fold */
4305             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4306                               ? s_end
4307                               : s_end -1;
4308
4309             while (s < upper) {
4310                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4311                 if (! len) {    /* Not a multi-char fold. */
4312                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4313                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4314                     {
4315                         *unfolded_multi_char = TRUE;
4316                     }
4317                     s++;
4318                     continue;
4319                 }
4320
4321                 if (len == 2
4322                     && isALPHA_FOLD_EQ(*s, 's')
4323                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4324                 {
4325
4326                     /* EXACTF nodes need to know that the minimum length
4327                      * changed so that a sharp s in the string can match this
4328                      * ss in the pattern, but they remain EXACTF nodes, as they
4329                      * won't match this unless the target string is is UTF-8,
4330                      * which we don't know until runtime.  EXACTFL nodes can't
4331                      * transform into EXACTFU nodes */
4332                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4333                         OP(scan) = EXACTFUP;
4334                     }
4335                 }
4336
4337                 *min_subtract += len - 1;
4338                 s += len;
4339             }
4340 #endif
4341         }
4342
4343         if (     STR_LEN(scan) == 1
4344             &&   isALPHA_A(* STRING(scan))
4345             &&  (         OP(scan) == EXACTFAA
4346                  || (     OP(scan) == EXACTFU
4347                      && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
4348         {
4349             U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
4350
4351             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
4352              * with the mask set to the complement of the bit that differs
4353              * between upper and lower case, and the lowest code point of the
4354              * pair (which the '&' forces) */
4355             OP(scan) = ANYOFM;
4356             ARG_SET(scan, *STRING(scan) & mask);
4357             FLAGS(scan) = mask;
4358         }
4359     }
4360
4361 #ifdef DEBUGGING
4362     /* Allow dumping but overwriting the collection of skipped
4363      * ops and/or strings with fake optimized ops */
4364     n = scan + NODE_SZ_STR(scan);
4365     while (n <= stop) {
4366         OP(n) = OPTIMIZED;
4367         FLAGS(n) = 0;
4368         NEXT_OFF(n) = 0;
4369         n++;
4370     }
4371 #endif
4372     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4373     return stopnow;
4374 }
4375
4376 /* REx optimizer.  Converts nodes into quicker variants "in place".
4377    Finds fixed substrings.  */
4378
4379 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4380    to the position after last scanned or to NULL. */
4381
4382 #define INIT_AND_WITHP \
4383     assert(!and_withp); \
4384     Newx(and_withp, 1, regnode_ssc); \
4385     SAVEFREEPV(and_withp)
4386
4387
4388 static void
4389 S_unwind_scan_frames(pTHX_ const void *p)
4390 {
4391     scan_frame *f= (scan_frame *)p;
4392     do {
4393         scan_frame *n= f->next_frame;
4394         Safefree(f);
4395         f= n;
4396     } while (f);
4397 }
4398
4399 /* the return from this sub is the minimum length that could possibly match */
4400 STATIC SSize_t
4401 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4402                         SSize_t *minlenp, SSize_t *deltap,
4403                         regnode *last,
4404                         scan_data_t *data,
4405                         I32 stopparen,
4406                         U32 recursed_depth,
4407                         regnode_ssc *and_withp,
4408                         U32 flags, U32 depth)
4409                         /* scanp: Start here (read-write). */
4410                         /* deltap: Write maxlen-minlen here. */
4411                         /* last: Stop before this one. */
4412                         /* data: string data about the pattern */
4413                         /* stopparen: treat close N as END */
4414                         /* recursed: which subroutines have we recursed into */
4415                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4416 {
4417     /* There must be at least this number of characters to match */
4418     SSize_t min = 0;
4419     I32 pars = 0, code;
4420     regnode *scan = *scanp, *next;
4421     SSize_t delta = 0;
4422     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4423     int is_inf_internal = 0;            /* The studied chunk is infinite */
4424     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4425     scan_data_t data_fake;
4426     SV *re_trie_maxbuff = NULL;
4427     regnode *first_non_open = scan;
4428     SSize_t stopmin = SSize_t_MAX;
4429     scan_frame *frame = NULL;
4430     GET_RE_DEBUG_FLAGS_DECL;
4431
4432     PERL_ARGS_ASSERT_STUDY_CHUNK;
4433     RExC_study_started= 1;
4434
4435     Zero(&data_fake, 1, scan_data_t);
4436
4437     if ( depth == 0 ) {
4438         while (first_non_open && OP(first_non_open) == OPEN)
4439             first_non_open=regnext(first_non_open);
4440     }
4441
4442
4443   fake_study_recurse:
4444     DEBUG_r(
4445         RExC_study_chunk_recursed_count++;
4446     );
4447     DEBUG_OPTIMISE_MORE_r(
4448     {
4449         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4450             depth, (long)stopparen,
4451             (unsigned long)RExC_study_chunk_recursed_count,
4452             (unsigned long)depth, (unsigned long)recursed_depth,
4453             scan,
4454             last);
4455         if (recursed_depth) {
4456             U32 i;
4457             U32 j;
4458             for ( j = 0 ; j < recursed_depth ; j++ ) {
4459                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4460                     if (
4461                         PAREN_TEST(RExC_study_chunk_recursed +
4462                                    ( j * RExC_study_chunk_recursed_bytes), i )
4463                         && (
4464                             !j ||
4465                             !PAREN_TEST(RExC_study_chunk_recursed +
4466                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4467                         )
4468                     ) {
4469                         Perl_re_printf( aTHX_ " %d",(int)i);
4470                         break;
4471                     }
4472                 }
4473                 if ( j + 1 < recursed_depth ) {
4474                     Perl_re_printf( aTHX_  ",");
4475                 }
4476             }
4477         }
4478         Perl_re_printf( aTHX_ "\n");
4479     }
4480     );
4481     while ( scan && OP(scan) != END && scan < last ){
4482         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4483                                    node length to get a real minimum (because
4484                                    the folded version may be shorter) */
4485         bool unfolded_multi_char = FALSE;
4486         /* Peephole optimizer: */
4487         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4488         DEBUG_PEEP("Peep", scan, depth, flags);
4489
4490
4491         /* The reason we do this here is that we need to deal with things like
4492          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4493          * parsing code, as each (?:..) is handled by a different invocation of
4494          * reg() -- Yves
4495          */
4496         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4497
4498         /* Follow the next-chain of the current node and optimize
4499            away all the NOTHINGs from it.  */
4500         if (OP(scan) != CURLYX) {
4501             const int max = (reg_off_by_arg[OP(scan)]
4502                        ? I32_MAX
4503                        /* I32 may be smaller than U16 on CRAYs! */
4504                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4505             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4506             int noff;
4507             regnode *n = scan;
4508
4509             /* Skip NOTHING and LONGJMP. */
4510             while ((n = regnext(n))
4511                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4512                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4513                    && off + noff < max)
4514                 off += noff;
4515             if (reg_off_by_arg[OP(scan)])
4516                 ARG(scan) = off;
4517             else
4518                 NEXT_OFF(scan) = off;
4519         }
4520
4521         /* The principal pseudo-switch.  Cannot be a switch, since we
4522            look into several different things.  */
4523         if ( OP(scan) == DEFINEP ) {
4524             SSize_t minlen = 0;
4525             SSize_t deltanext = 0;
4526             SSize_t fake_last_close = 0;
4527             I32 f = SCF_IN_DEFINE;
4528
4529             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4530             scan = regnext(scan);
4531             assert( OP(scan) == IFTHEN );
4532             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4533
4534             data_fake.last_closep= &fake_last_close;
4535             minlen = *minlenp;
4536             next = regnext(scan);
4537             scan = NEXTOPER(NEXTOPER(scan));
4538             DEBUG_PEEP("scan", scan, depth, flags);
4539             DEBUG_PEEP("next", next, depth, flags);
4540
4541             /* we suppose the run is continuous, last=next...
4542              * NOTE we dont use the return here! */
4543             /* DEFINEP study_chunk() recursion */
4544             (void)study_chunk(pRExC_state, &scan, &minlen,
4545                               &deltanext, next, &data_fake, stopparen,
4546                               recursed_depth, NULL, f, depth+1);
4547
4548             scan = next;
4549         } else
4550         if (
4551             OP(scan) == BRANCH  ||
4552             OP(scan) == BRANCHJ ||
4553             OP(scan) == IFTHEN
4554         ) {
4555             next = regnext(scan);
4556             code = OP(scan);
4557
4558             /* The op(next)==code check below is to see if we
4559              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4560              * IFTHEN is special as it might not appear in pairs.
4561              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4562              * we dont handle it cleanly. */
4563             if (OP(next) == code || code == IFTHEN) {
4564                 /* NOTE - There is similar code to this block below for
4565                  * handling TRIE nodes on a re-study.  If you change stuff here
4566                  * check there too. */
4567                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4568                 regnode_ssc accum;
4569                 regnode * const startbranch=scan;
4570
4571                 if (flags & SCF_DO_SUBSTR) {
4572                     /* Cannot merge strings after this. */
4573                     scan_commit(pRExC_state, data, minlenp, is_inf);
4574                 }
4575
4576                 if (flags & SCF_DO_STCLASS)
4577                     ssc_init_zero(pRExC_state, &accum);
4578
4579                 while (OP(scan) == code) {
4580                     SSize_t deltanext, minnext, fake;
4581                     I32 f = 0;
4582                     regnode_ssc this_class;
4583
4584                     DEBUG_PEEP("Branch", scan, depth, flags);
4585
4586                     num++;
4587                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4588                     if (data) {
4589                         data_fake.whilem_c = data->whilem_c;
4590                         data_fake.last_closep = data->last_closep;
4591                     }
4592                     else
4593                         data_fake.last_closep = &fake;
4594
4595                     data_fake.pos_delta = delta;
4596                     next = regnext(scan);
4597
4598                     scan = NEXTOPER(scan); /* everything */
4599                     if (code != BRANCH)    /* everything but BRANCH */
4600                         scan = NEXTOPER(scan);
4601
4602                     if (flags & SCF_DO_STCLASS) {
4603                         ssc_init(pRExC_state, &this_class);
4604                         data_fake.start_class = &this_class;
4605                         f = SCF_DO_STCLASS_AND;
4606                     }
4607                     if (flags & SCF_WHILEM_VISITED_POS)
4608                         f |= SCF_WHILEM_VISITED_POS;
4609
4610                     /* we suppose the run is continuous, last=next...*/
4611                     /* recurse study_chunk() for each BRANCH in an alternation */
4612                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4613                                       &deltanext, next, &data_fake, stopparen,
4614                                       recursed_depth, NULL, f, depth+1);
4615
4616                     if (min1 > minnext)
4617                         min1 = minnext;
4618                     if (deltanext == SSize_t_MAX) {
4619                         is_inf = is_inf_internal = 1;
4620                         max1 = SSize_t_MAX;
4621                     } else if (max1 < minnext + deltanext)
4622                         max1 = minnext + deltanext;
4623                     scan = next;
4624                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4625                         pars++;
4626                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4627                         if ( stopmin > minnext)
4628                             stopmin = min + min1;
4629                         flags &= ~SCF_DO_SUBSTR;
4630                         if (data)
4631                             data->flags |= SCF_SEEN_ACCEPT;
4632                     }
4633                     if (data) {
4634                         if (data_fake.flags & SF_HAS_EVAL)
4635                             data->flags |= SF_HAS_EVAL;
4636                         data->whilem_c = data_fake.whilem_c;
4637                     }
4638                     if (flags & SCF_DO_STCLASS)
4639                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4640                 }
4641                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4642                     min1 = 0;
4643                 if (flags & SCF_DO_SUBSTR) {
4644                     data->pos_min += min1;
4645                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4646                         data->pos_delta = SSize_t_MAX;
4647                     else
4648                         data->pos_delta += max1 - min1;
4649                     if (max1 != min1 || is_inf)
4650                         data->cur_is_floating = 1;
4651                 }
4652                 min += min1;
4653                 if (delta == SSize_t_MAX
4654                  || SSize_t_MAX - delta - (max1 - min1) < 0)
4655                     delta = SSize_t_MAX;
4656                 else
4657                     delta += max1 - min1;
4658                 if (flags & SCF_DO_STCLASS_OR) {
4659                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4660                     if (min1) {
4661                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4662                         flags &= ~SCF_DO_STCLASS;
4663                     }
4664                 }
4665                 else if (flags & SCF_DO_STCLASS_AND) {
4666                     if (min1) {
4667                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4668                         flags &= ~SCF_DO_STCLASS;
4669                     }
4670                     else {
4671                         /* Switch to OR mode: cache the old value of
4672                          * data->start_class */
4673                         INIT_AND_WITHP;
4674                         StructCopy(data->start_class, and_withp, regnode_ssc);
4675                         flags &= ~SCF_DO_STCLASS_AND;
4676                         StructCopy(&accum, data->start_class, regnode_ssc);
4677                         flags |= SCF_DO_STCLASS_OR;
4678                     }
4679                 }
4680
4681                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4682                         OP( startbranch ) == BRANCH )
4683                 {
4684                 /* demq.
4685
4686                    Assuming this was/is a branch we are dealing with: 'scan'
4687                    now points at the item that follows the branch sequence,
4688                    whatever it is. We now start at the beginning of the
4689                    sequence and look for subsequences of
4690
4691                    BRANCH->EXACT=>x1
4692                    BRANCH->EXACT=>x2
4693                    tail
4694
4695                    which would be constructed from a pattern like
4696                    /A|LIST|OF|WORDS/
4697
4698                    If we can find such a subsequence we need to turn the first
4699                    element into a trie and then add the subsequent branch exact
4700                    strings to the trie.
4701
4702                    We have two cases
4703
4704                      1. patterns where the whole set of branches can be
4705                         converted.
4706
4707                      2. patterns where only a subset can be converted.
4708
4709                    In case 1 we can replace the whole set with a single regop
4710                    for the trie. In case 2 we need to keep the start and end
4711                    branches so
4712
4713                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4714                      becomes BRANCH TRIE; BRANCH X;
4715
4716                   There is an additional case, that being where there is a
4717                   common prefix, which gets split out into an EXACT like node
4718                   preceding the TRIE node.
4719
4720                   If x(1..n)==tail then we can do a simple trie, if not we make
4721                   a "jump" trie, such that when we match the appropriate word
4722                   we "jump" to the appropriate tail node. Essentially we turn
4723                   a nested if into a case structure of sorts.
4724
4725                 */
4726
4727                     int made=0;
4728                     if (!re_trie_maxbuff) {
4729                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4730                         if (!SvIOK(re_trie_maxbuff))
4731                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4732                     }
4733                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4734                         regnode *cur;
4735                         regnode *first = (regnode *)NULL;
4736                         regnode *last = (regnode *)NULL;
4737                         regnode *tail = scan;
4738                         U8 trietype = 0;
4739                         U32 count=0;
4740
4741                         /* var tail is used because there may be a TAIL
4742                            regop in the way. Ie, the exacts will point to the
4743                            thing following the TAIL, but the last branch will
4744                            point at the TAIL. So we advance tail. If we
4745                            have nested (?:) we may have to move through several
4746                            tails.
4747                          */
4748
4749                         while ( OP( tail ) == TAIL ) {
4750                             /* this is the TAIL generated by (?:) */
4751                             tail = regnext( tail );
4752                         }
4753
4754
4755                         DEBUG_TRIE_COMPILE_r({
4756                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4757                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4758                               depth+1,
4759                               "Looking for TRIE'able sequences. Tail node is ",
4760                               (UV) REGNODE_OFFSET(tail),
4761                               SvPV_nolen_const( RExC_mysv )
4762                             );
4763                         });
4764
4765                         /*
4766
4767                             Step through the branches
4768                                 cur represents each branch,
4769                                 noper is the first thing to be matched as part
4770                                       of that branch
4771                                 noper_next is the regnext() of that node.
4772
4773                             We normally handle a case like this
4774                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4775                             support building with NOJUMPTRIE, which restricts
4776                             the trie logic to structures like /FOO|BAR/.
4777
4778                             If noper is a trieable nodetype then the branch is
4779                             a possible optimization target. If we are building
4780                             under NOJUMPTRIE then we require that noper_next is
4781                             the same as scan (our current position in the regex
4782                             program).
4783
4784                             Once we have two or more consecutive such branches
4785                             we can create a trie of the EXACT's contents and
4786                             stitch it in place into the program.
4787
4788                             If the sequence represents all of the branches in
4789                             the alternation we replace the entire thing with a
4790                             single TRIE node.
4791
4792                             Otherwise when it is a subsequence we need to
4793                             stitch it in place and replace only the relevant
4794                             branches. This means the first branch has to remain
4795                             as it is used by the alternation logic, and its
4796                             next pointer, and needs to be repointed at the item
4797                             on the branch chain following the last branch we
4798                             have optimized away.
4799
4800                             This could be either a BRANCH, in which case the
4801                             subsequence is internal, or it could be the item
4802                             following the branch sequence in which case the
4803                             subsequence is at the end (which does not
4804                             necessarily mean the first node is the start of the
4805                             alternation).
4806
4807                             TRIE_TYPE(X) is a define which maps the optype to a
4808                             trietype.
4809
4810                                 optype          |  trietype
4811                                 ----------------+-----------
4812                                 NOTHING         | NOTHING
4813                                 EXACT           | EXACT
4814                                 EXACT_ONLY8     | EXACT
4815                                 EXACTFU         | EXACTFU
4816                                 EXACTFU_ONLY8   | EXACTFU
4817                                 EXACTFUP        | EXACTFU
4818                                 EXACTFAA        | EXACTFAA
4819                                 EXACTL          | EXACTL
4820                                 EXACTFLU8       | EXACTFLU8
4821
4822
4823                         */
4824 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4825                        ? NOTHING                                            \
4826                        : ( EXACT == (X) || EXACT_ONLY8 == (X) )             \
4827                          ? EXACT                                            \
4828                          : (     EXACTFU == (X)                             \
4829                               || EXACTFU_ONLY8 == (X)                       \
4830                               || EXACTFUP == (X) )                          \
4831                            ? EXACTFU                                        \
4832                            : ( EXACTFAA == (X) )                            \
4833                              ? EXACTFAA                                     \
4834                              : ( EXACTL == (X) )                            \
4835                                ? EXACTL                                     \
4836                                : ( EXACTFLU8 == (X) )                       \
4837                                  ? EXACTFLU8                                \
4838                                  : 0 )
4839
4840                         /* dont use tail as the end marker for this traverse */
4841                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4842                             regnode * const noper = NEXTOPER( cur );
4843                             U8 noper_type = OP( noper );
4844                             U8 noper_trietype = TRIE_TYPE( noper_type );
4845 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4846                             regnode * const noper_next = regnext( noper );
4847                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4848                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4849 #endif
4850
4851                             DEBUG_TRIE_COMPILE_r({
4852                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4853                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
4854                                    depth+1,
4855                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4856
4857                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4858                                 Perl_re_printf( aTHX_  " -> %d:%s",
4859                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4860
4861                                 if ( noper_next ) {
4862                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4863                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4864                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4865                                 }
4866                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4867                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4868                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4869                                 );
4870                             });
4871
4872                             /* Is noper a trieable nodetype that can be merged
4873                              * with the current trie (if there is one)? */
4874                             if ( noper_trietype
4875                                   &&
4876                                   (
4877                                         ( noper_trietype == NOTHING )
4878                                         || ( trietype == NOTHING )
4879                                         || ( trietype == noper_trietype )
4880                                   )
4881 #ifdef NOJUMPTRIE
4882                                   && noper_next >= tail
4883 #endif
4884                                   && count < U16_MAX)
4885                             {
4886                                 /* Handle mergable triable node Either we are
4887                                  * the first node in a new trieable sequence,
4888                                  * in which case we do some bookkeeping,
4889                                  * otherwise we update the end pointer. */
4890                                 if ( !first ) {
4891                                     first = cur;
4892                                     if ( noper_trietype == NOTHING ) {
4893 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4894                                         regnode * const noper_next = regnext( noper );
4895                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4896                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4897 #endif
4898
4899                                         if ( noper_next_trietype ) {
4900                                             trietype = noper_next_trietype;
4901                                         } else if (noper_next_type)  {
4902                                             /* a NOTHING regop is 1 regop wide.
4903                                              * We need at least two for a trie
4904                                              * so we can't merge this in */
4905                                             first = NULL;
4906                                         }
4907                                     } else {
4908                                         trietype = noper_trietype;
4909                                     }
4910                                 } else {
4911                                     if ( trietype == NOTHING )
4912                                         trietype = noper_trietype;
4913                                     last = cur;
4914                                 }
4915                                 if (first)
4916                                     count++;
4917                             } /* end handle mergable triable node */
4918                             else {
4919                                 /* handle unmergable node -
4920                                  * noper may either be a triable node which can
4921                                  * not be tried together with the current trie,
4922                                  * or a non triable node */
4923                                 if ( last ) {
4924                                     /* If last is set and trietype is not
4925                                      * NOTHING then we have found at least two
4926                                      * triable branch sequences in a row of a
4927                                      * similar trietype so we can turn them
4928                                      * into a trie. If/when we allow NOTHING to
4929                                      * start a trie sequence this condition
4930                                      * will be required, and it isn't expensive
4931                                      * so we leave it in for now. */
4932                                     if ( trietype && trietype != NOTHING )
4933                                         make_trie( pRExC_state,
4934                                                 startbranch, first, cur, tail,
4935                                                 count, trietype, depth+1 );
4936                                     last = NULL; /* note: we clear/update
4937                                                     first, trietype etc below,
4938                                                     so we dont do it here */
4939                                 }
4940                                 if ( noper_trietype
4941 #ifdef NOJUMPTRIE
4942                                      && noper_next >= tail
4943 #endif
4944                                 ){
4945                                     /* noper is triable, so we can start a new
4946                                      * trie sequence */
4947                                     count = 1;
4948                                     first = cur;
4949                                     trietype = noper_trietype;
4950                                 } else if (first) {
4951                                     /* if we already saw a first but the
4952                                      * current node is not triable then we have
4953                                      * to reset the first information. */
4954                                     count = 0;
4955                                     first = NULL;
4956                                     trietype = 0;
4957                                 }
4958                             } /* end handle unmergable node */
4959                         } /* loop over branches */
4960                         DEBUG_TRIE_COMPILE_r({
4961                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4962                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
4963                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
4964                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
4965                                REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4966                                PL_reg_name[trietype]
4967                             );
4968
4969                         });
4970                         if ( last && trietype ) {
4971                             if ( trietype != NOTHING ) {
4972                                 /* the last branch of the sequence was part of
4973                                  * a trie, so we have to construct it here
4974                                  * outside of the loop */
4975                                 made= make_trie( pRExC_state, startbranch,
4976                                                  first, scan, tail, count,
4977                                                  trietype, depth+1 );
4978 #ifdef TRIE_STUDY_OPT
4979                                 if ( ((made == MADE_EXACT_TRIE &&
4980                                      startbranch == first)
4981                                      || ( first_non_open == first )) &&
4982                                      depth==0 ) {
4983                                     flags |= SCF_TRIE_RESTUDY;
4984                                     if ( startbranch == first
4985                                          && scan >= tail )
4986                                     {
4987                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4988                                     }
4989                                 }
4990 #endif
4991                             } else {
4992                                 /* at this point we know whatever we have is a
4993                                  * NOTHING sequence/branch AND if 'startbranch'
4994                                  * is 'first' then we can turn the whole thing
4995                                  * into a NOTHING
4996                                  */
4997                                 if ( startbranch == first ) {
4998                                     regnode *opt;
4999                                     /* the entire thing is a NOTHING sequence,
5000                                      * something like this: (?:|) So we can
5001                                      * turn it into a plain NOTHING op. */
5002                                     DEBUG_TRIE_COMPILE_r({
5003                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5004                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5005                                           depth+1,
5006                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5007
5008                                     });
5009                                     OP(startbranch)= NOTHING;
5010                                     NEXT_OFF(startbranch)= tail - startbranch;
5011                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
5012                                         OP(opt)= OPTIMIZED;
5013                                 }
5014                             }
5015                         } /* end if ( last) */
5016                     } /* TRIE_MAXBUF is non zero */
5017
5018                 } /* do trie */
5019
5020             }
5021             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
5022                 scan = NEXTOPER(NEXTOPER(scan));
5023             } else                      /* single branch is optimized. */
5024                 scan = NEXTOPER(scan);
5025             continue;
5026         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5027             I32 paren = 0;
5028             regnode *start = NULL;
5029             regnode *end = NULL;
5030             U32 my_recursed_depth= recursed_depth;
5031
5032             if (OP(scan) != SUSPEND) { /* GOSUB */
5033                 /* Do setup, note this code has side effects beyond
5034                  * the rest of this block. Specifically setting
5035                  * RExC_recurse[] must happen at least once during
5036                  * study_chunk(). */
5037                 paren = ARG(scan);
5038                 RExC_recurse[ARG2L(scan)] = scan;
5039                 start = REGNODE_p(RExC_open_parens[paren]);
5040                 end   = REGNODE_p(RExC_close_parens[paren]);
5041
5042                 /* NOTE we MUST always execute the above code, even
5043                  * if we do nothing with a GOSUB */
5044                 if (
5045                     ( flags & SCF_IN_DEFINE )
5046                     ||
5047                     (
5048                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5049                         &&
5050                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5051                     )
5052                 ) {
5053                     /* no need to do anything here if we are in a define. */
5054                     /* or we are after some kind of infinite construct
5055                      * so we can skip recursing into this item.
5056                      * Since it is infinite we will not change the maxlen
5057                      * or delta, and if we miss something that might raise
5058                      * the minlen it will merely pessimise a little.
5059                      *
5060                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5061                      * might result in a minlen of 1 and not of 4,
5062                      * but this doesn't make us mismatch, just try a bit
5063                      * harder than we should.
5064                      * */
5065                     scan= regnext(scan);
5066                     continue;
5067                 }
5068
5069                 if (
5070                     !recursed_depth
5071                     ||
5072                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
5073                 ) {
5074                     /* it is quite possible that there are more efficient ways
5075                      * to do this. We maintain a bitmap per level of recursion
5076                      * of which patterns we have entered so we can detect if a
5077                      * pattern creates a possible infinite loop. When we
5078                      * recurse down a level we copy the previous levels bitmap
5079                      * down. When we are at recursion level 0 we zero the top
5080                      * level bitmap. It would be nice to implement a different
5081                      * more efficient way of doing this. In particular the top
5082                      * level bitmap may be unnecessary.
5083                      */
5084                     if (!recursed_depth) {
5085                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5086                     } else {
5087                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
5088                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
5089                              RExC_study_chunk_recursed_bytes, U8);
5090                     }
5091                     /* we havent recursed into this paren yet, so recurse into it */
5092                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5093                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
5094                     my_recursed_depth= recursed_depth + 1;
5095                 } else {
5096                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5097                     /* some form of infinite recursion, assume infinite length
5098                      * */
5099                     if (flags & SCF_DO_SUBSTR) {
5100                         scan_commit(pRExC_state, data, minlenp, is_inf);
5101                         data->cur_is_floating = 1;
5102                     }
5103                     is_inf = is_inf_internal = 1;
5104                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5105                         ssc_anything(data->start_class);
5106                     flags &= ~SCF_DO_STCLASS;
5107
5108                     start= NULL; /* reset start so we dont recurse later on. */
5109                 }
5110             } else {
5111                 paren = stopparen;
5112                 start = scan + 2;
5113                 end = regnext(scan);
5114             }
5115             if (start) {
5116                 scan_frame *newframe;
5117                 assert(end);
5118                 if (!RExC_frame_last) {
5119                     Newxz(newframe, 1, scan_frame);
5120                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5121                     RExC_frame_head= newframe;
5122                     RExC_frame_count++;
5123                 } else if (!RExC_frame_last->next_frame) {
5124                     Newxz(newframe, 1, scan_frame);
5125                     RExC_frame_last->next_frame= newframe;
5126                     newframe->prev_frame= RExC_frame_last;
5127                     RExC_frame_count++;
5128                 } else {
5129                     newframe= RExC_frame_last->next_frame;
5130                 }
5131                 RExC_frame_last= newframe;
5132
5133                 newframe->next_regnode = regnext(scan);
5134                 newframe->last_regnode = last;
5135                 newframe->stopparen = stopparen;
5136                 newframe->prev_recursed_depth = recursed_depth;
5137                 newframe->this_prev_frame= frame;
5138
5139                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5140                 DEBUG_PEEP("fnew", scan, depth, flags);
5141
5142                 frame = newframe;
5143                 scan =  start;
5144                 stopparen = paren;
5145                 last = end;
5146                 depth = depth + 1;
5147                 recursed_depth= my_recursed_depth;
5148
5149                 continue;
5150             }
5151         }
5152         else if (   OP(scan) == EXACT
5153                  || OP(scan) == EXACT_ONLY8
5154                  || OP(scan) == EXACTL)
5155         {
5156             SSize_t l = STR_LEN(scan);
5157             UV uc;
5158             assert(l);
5159             if (UTF) {
5160                 const U8 * const s = (U8*)STRING(scan);
5161                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
5162                 l = utf8_length(s, s + l);
5163             } else {
5164                 uc = *((U8*)STRING(scan));
5165             }
5166             min += l;
5167             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5168                 /* The code below prefers earlier match for fixed
5169                    offset, later match for variable offset.  */
5170                 if (data->last_end == -1) { /* Update the start info. */
5171                     data->last_start_min = data->pos_min;
5172                     data->last_start_max = is_inf
5173                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
5174                 }
5175                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
5176                 if (UTF)
5177                     SvUTF8_on(data->last_found);
5178                 {
5179                     SV * const sv = data->last_found;
5180                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5181                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5182                     if (mg && mg->mg_len >= 0)
5183                         mg->mg_len += utf8_length((U8*)STRING(scan),
5184                                               (U8*)STRING(scan)+STR_LEN(scan));
5185                 }
5186                 data->last_end = data->pos_min + l;
5187                 data->pos_min += l; /* As in the first entry. */
5188                 data->flags &= ~SF_BEFORE_EOL;
5189             }
5190
5191             /* ANDing the code point leaves at most it, and not in locale, and
5192              * can't match null string */
5193             if (flags & SCF_DO_STCLASS_AND) {
5194                 ssc_cp_and(data->start_class, uc);
5195                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5196                 ssc_clear_locale(data->start_class);
5197             }
5198             else if (flags & SCF_DO_STCLASS_OR) {
5199                 ssc_add_cp(data->start_class, uc);
5200                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5201
5202                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5203                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5204             }
5205             flags &= ~SCF_DO_STCLASS;
5206         }
5207         else if (PL_regkind[OP(scan)] == EXACT) {
5208             /* But OP != EXACT!, so is EXACTFish */
5209             SSize_t l = STR_LEN(scan);
5210             const U8 * s = (U8*)STRING(scan);
5211
5212             /* Search for fixed substrings supports EXACT only. */
5213             if (flags & SCF_DO_SUBSTR) {
5214                 assert(data);
5215                 scan_commit(pRExC_state, data, minlenp, is_inf);
5216             }
5217             if (UTF) {
5218                 l = utf8_length(s, s + l);
5219             }
5220             if (unfolded_multi_char) {
5221                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5222             }
5223             min += l - min_subtract;
5224             assert (min >= 0);
5225             delta += min_subtract;
5226             if (flags & SCF_DO_SUBSTR) {
5227                 data->pos_min += l - min_subtract;
5228                 if (data->pos_min < 0) {
5229                     data->pos_min = 0;
5230                 }
5231                 data->pos_delta += min_subtract;
5232                 if (min_subtract) {
5233                     data->cur_is_floating = 1; /* float */
5234                 }
5235             }
5236
5237             if (flags & SCF_DO_STCLASS) {
5238                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
5239
5240                 assert(EXACTF_invlist);
5241                 if (flags & SCF_DO_STCLASS_AND) {
5242                     if (OP(scan) != EXACTFL)
5243                         ssc_clear_locale(data->start_class);
5244                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5245                     ANYOF_POSIXL_ZERO(data->start_class);
5246                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5247                 }
5248                 else {  /* SCF_DO_STCLASS_OR */
5249                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
5250                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5251
5252                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5253                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5254                 }
5255                 flags &= ~SCF_DO_STCLASS;
5256                 SvREFCNT_dec(EXACTF_invlist);
5257             }
5258         }
5259         else if (REGNODE_VARIES(OP(scan))) {
5260             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5261             I32 fl = 0, f = flags;
5262             regnode * const oscan = scan;
5263             regnode_ssc this_class;
5264             regnode_ssc *oclass = NULL;
5265             I32 next_is_eval = 0;
5266
5267             switch (PL_regkind[OP(scan)]) {
5268             case WHILEM:                /* End of (?:...)* . */
5269                 scan = NEXTOPER(scan);
5270                 goto finish;
5271             case PLUS:
5272                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5273                     next = NEXTOPER(scan);
5274                     if (   OP(next) == EXACT
5275                         || OP(next) == EXACT_ONLY8
5276                         || OP(next) == EXACTL
5277                         || (flags & SCF_DO_STCLASS))
5278                     {
5279                         mincount = 1;
5280                         maxcount = REG_INFTY;
5281                         next = regnext(scan);
5282                         scan = NEXTOPER(scan);
5283                         goto do_curly;
5284                     }
5285                 }
5286                 if (flags & SCF_DO_SUBSTR)
5287                     data->pos_min++;
5288                 min++;
5289                 /* FALLTHROUGH */
5290             case STAR:
5291                 next = NEXTOPER(scan);
5292
5293                 /* This temporary node can now be turned into EXACTFU, and
5294                  * must, as regexec.c doesn't handle it */
5295                 if (OP(next) == EXACTFU_S_EDGE) {
5296                     OP(next) = EXACTFU;
5297                 }
5298
5299                 if (     STR_LEN(next) == 1
5300                     &&   isALPHA_A(* STRING(next))
5301                     && (         OP(next) == EXACTFAA
5302                         || (     OP(next) == EXACTFU
5303                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
5304                 {
5305                     /* These differ in just one bit */
5306                     U8 mask = ~ ('A' ^ 'a');
5307
5308                     assert(isALPHA_A(* STRING(next)));
5309
5310                     /* Then replace it by an ANYOFM node, with
5311                     * the mask set to the complement of the
5312                     * bit that differs between upper and lower
5313                     * case, and the lowest code point of the
5314                     * pair (which the '&' forces) */
5315                     OP(next) = ANYOFM;
5316                     ARG_SET(next, *STRING(next) & mask);
5317                     FLAGS(next) = mask;
5318                 }
5319
5320                 if (flags & SCF_DO_STCLASS) {
5321                     mincount = 0;
5322                     maxcount = REG_INFTY;
5323                     next = regnext(scan);
5324                     scan = NEXTOPER(scan);
5325                     goto do_curly;
5326                 }
5327                 if (flags & SCF_DO_SUBSTR) {
5328                     scan_commit(pRExC_state, data, minlenp, is_inf);
5329                     /* Cannot extend fixed substrings */
5330                     data->cur_is_floating = 1; /* float */
5331                 }
5332                 is_inf = is_inf_internal = 1;
5333                 scan = regnext(scan);
5334                 goto optimize_curly_tail;
5335             case CURLY:
5336                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5337                     && (scan->flags == stopparen))
5338                 {
5339                     mincount = 1;
5340                     maxcount = 1;
5341                 } else {
5342                     mincount = ARG1(scan);
5343                     maxcount = ARG2(scan);
5344                 }
5345                 next = regnext(scan);
5346                 if (OP(scan) == CURLYX) {
5347                     I32 lp = (data ? *(data->last_closep) : 0);
5348                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5349                 }
5350                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5351                 next_is_eval = (OP(scan) == EVAL);
5352               do_curly:
5353                 if (flags & SCF_DO_SUBSTR) {
5354                     if (mincount == 0)
5355                         scan_commit(pRExC_state, data, minlenp, is_inf);
5356                     /* Cannot extend fixed substrings */
5357                     pos_before = data->pos_min;
5358                 }
5359                 if (data) {
5360                     fl = data->flags;
5361                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5362                     if (is_inf)
5363                         data->flags |= SF_IS_INF;
5364                 }
5365                 if (flags & SCF_DO_STCLASS) {
5366                     ssc_init(pRExC_state, &this_class);
5367                     oclass = data->start_class;
5368                     data->start_class = &this_class;
5369                     f |= SCF_DO_STCLASS_AND;
5370                     f &= ~SCF_DO_STCLASS_OR;
5371                 }
5372                 /* Exclude from super-linear cache processing any {n,m}
5373                    regops for which the combination of input pos and regex
5374                    pos is not enough information to determine if a match
5375                    will be possible.
5376
5377                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
5378                    regex pos at the \s*, the prospects for a match depend not
5379                    only on the input position but also on how many (bar\s*)
5380                    repeats into the {4,8} we are. */
5381                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5382                     f &= ~SCF_WHILEM_VISITED_POS;
5383
5384                 /* This will finish on WHILEM, setting scan, or on NULL: */
5385                 /* recurse study_chunk() on loop bodies */
5386                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5387                                   last, data, stopparen, recursed_depth, NULL,
5388                                   (mincount == 0
5389                                    ? (f & ~SCF_DO_SUBSTR)
5390                                    : f)
5391                                   ,depth+1);
5392
5393                 if (flags & SCF_DO_STCLASS)
5394                     data->start_class = oclass;
5395                 if (mincount == 0 || minnext == 0) {
5396                     if (flags & SCF_DO_STCLASS_OR) {
5397                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5398                     }
5399                     else if (flags & SCF_DO_STCLASS_AND) {
5400                         /* Switch to OR mode: cache the old value of
5401                          * data->start_class */
5402                         INIT_AND_WITHP;
5403                         StructCopy(data->start_class, and_withp, regnode_ssc);
5404                         flags &= ~SCF_DO_STCLASS_AND;
5405                         StructCopy(&this_class, data->start_class, regnode_ssc);
5406                         flags |= SCF_DO_STCLASS_OR;
5407                         ANYOF_FLAGS(data->start_class)
5408                                                 |= SSC_MATCHES_EMPTY_STRING;
5409                     }
5410                 } else {                /* Non-zero len */
5411                     if (flags & SCF_DO_STCLASS_OR) {
5412                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5413                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5414                     }
5415                     else if (flags & SCF_DO_STCLASS_AND)
5416                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5417                     flags &= ~SCF_DO_STCLASS;
5418                 }
5419                 if (!scan)              /* It was not CURLYX, but CURLY. */
5420                     scan = next;
5421                 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5422                     /* ? quantifier ok, except for (?{ ... }) */
5423                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
5424                     && (minnext == 0) && (deltanext == 0)
5425                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5426                     && maxcount <= REG_INFTY/3) /* Complement check for big
5427                                                    count */
5428                 {
5429                     _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5430                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5431                             "Quantifier unexpected on zero-length expression "
5432                             "in regex m/%" UTF8f "/",
5433                              UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5434                                   RExC_precomp)));
5435                 }
5436
5437                 min += minnext * mincount;
5438                 is_inf_internal |= deltanext == SSize_t_MAX
5439                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5440                 is_inf |= is_inf_internal;
5441                 if (is_inf) {
5442                     delta = SSize_t_MAX;
5443                 } else {
5444                     delta += (minnext + deltanext) * maxcount
5445                              - minnext * mincount;
5446                 }
5447                 /* Try powerful optimization CURLYX => CURLYN. */
5448                 if (  OP(oscan) == CURLYX && data
5449                       && data->flags & SF_IN_PAR
5450                       && !(data->flags & SF_HAS_EVAL)
5451                       && !deltanext && minnext == 1 ) {
5452                     /* Try to optimize to CURLYN.  */
5453                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5454                     regnode * const nxt1 = nxt;
5455 #ifdef DEBUGGING
5456                     regnode *nxt2;
5457 #endif
5458
5459                     /* Skip open. */
5460                     nxt = regnext(nxt);
5461                     if (!REGNODE_SIMPLE(OP(nxt))
5462                         && !(PL_regkind[OP(nxt)] == EXACT
5463                              && STR_LEN(nxt) == 1))
5464                         goto nogo;
5465 #ifdef DEBUGGING
5466                     nxt2 = nxt;
5467 #endif
5468                     nxt = regnext(nxt);
5469                     if (OP(nxt) != CLOSE)
5470                         goto nogo;
5471                     if (RExC_open_parens) {
5472
5473                         /*open->CURLYM*/
5474                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5475
5476                         /*close->while*/
5477                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5478                     }
5479                     /* Now we know that nxt2 is the only contents: */
5480                     oscan->flags = (U8)ARG(nxt);
5481                     OP(oscan) = CURLYN;
5482                     OP(nxt1) = NOTHING; /* was OPEN. */
5483
5484 #ifdef DEBUGGING
5485                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5486                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5487                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5488                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
5489                     OP(nxt + 1) = OPTIMIZED; /* was count. */
5490                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5491 #endif
5492                 }
5493               nogo:
5494
5495                 /* Try optimization CURLYX => CURLYM. */
5496                 if (  OP(oscan) == CURLYX && data
5497                       && !(data->flags & SF_HAS_PAR)
5498                       && !(data->flags & SF_HAS_EVAL)
5499                       && !deltanext     /* atom is fixed width */
5500                       && minnext != 0   /* CURLYM can't handle zero width */
5501
5502                          /* Nor characters whose fold at run-time may be
5503                           * multi-character */
5504                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5505                 ) {
5506                     /* XXXX How to optimize if data == 0? */
5507                     /* Optimize to a simpler form.  */
5508                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5509                     regnode *nxt2;
5510
5511                     OP(oscan) = CURLYM;
5512                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5513                             && (OP(nxt2) != WHILEM))
5514                         nxt = nxt2;
5515                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5516                     /* Need to optimize away parenths. */
5517                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5518                         /* Set the parenth number.  */
5519                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5520
5521                         oscan->flags = (U8)ARG(nxt);
5522                         if (RExC_open_parens) {
5523                              /*open->CURLYM*/
5524                             RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5525
5526                             /*close->NOTHING*/
5527                             RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5528                                                          + 1;
5529                         }
5530                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
5531                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
5532
5533 #ifdef DEBUGGING
5534                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5535                         OP(nxt + 1) = OPTIMIZED; /* was count. */
5536                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5537                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5538 #endif
5539 #if 0
5540                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
5541                             regnode *nnxt = regnext(nxt1);
5542                             if (nnxt == nxt) {
5543                                 if (reg_off_by_arg[OP(nxt1)])
5544                                     ARG_SET(nxt1, nxt2 - nxt1);
5545                                 else if (nxt2 - nxt1 < U16_MAX)
5546                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
5547                                 else
5548                                     OP(nxt) = NOTHING;  /* Cannot beautify */
5549                             }
5550                             nxt1 = nnxt;
5551                         }
5552 #endif
5553                         /* Optimize again: */
5554                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5555                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5556                                     NULL, stopparen, recursed_depth, NULL, 0,
5557                                     depth+1);
5558                     }
5559                     else
5560                         oscan->flags = 0;
5561                 }
5562                 else if ((OP(oscan) == CURLYX)
5563                          && (flags & SCF_WHILEM_VISITED_POS)
5564                          /* See the comment on a similar expression above.
5565                             However, this time it's not a subexpression
5566                             we care about, but the expression itself. */
5567                          && (maxcount == REG_INFTY)
5568                          && data) {
5569                     /* This stays as CURLYX, we can put the count/of pair. */
5570                     /* Find WHILEM (as in regexec.c) */
5571                     regnode *nxt = oscan + NEXT_OFF(oscan);
5572
5573                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5574                         nxt += ARG(nxt);
5575                     nxt = PREVOPER(nxt);
5576                     if (nxt->flags & 0xf) {
5577                         /* we've already set whilem count on this node */
5578                     } else if (++data->whilem_c < 16) {
5579                         assert(data->whilem_c <= RExC_whilem_seen);
5580                         nxt->flags = (U8)(data->whilem_c
5581                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5582                     }
5583                 }
5584                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5585                     pars++;
5586                 if (flags & SCF_DO_SUBSTR) {
5587                     SV *last_str = NULL;
5588                     STRLEN last_chrs = 0;
5589                     int counted = mincount != 0;
5590
5591                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5592                                                                   string. */
5593                         SSize_t b = pos_before >= data->last_start_min
5594                             ? pos_before : data->last_start_min;
5595                         STRLEN l;
5596                         const char * const s = SvPV_const(data->last_found, l);
5597                         SSize_t old = b - data->last_start_min;
5598
5599                         if (UTF)
5600                             old = utf8_hop((U8*)s, old) - (U8*)s;
5601                         l -= old;
5602                         /* Get the added string: */
5603                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5604                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5605                                             (U8*)(s + old + l)) : l;
5606                         if (deltanext == 0 && pos_before == b) {
5607                             /* What was added is a constant string */
5608                             if (mincount > 1) {
5609
5610                                 SvGROW(last_str, (mincount * l) + 1);
5611                                 repeatcpy(SvPVX(last_str) + l,
5612                                           SvPVX_const(last_str), l,
5613                                           mincount - 1);
5614                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5615                                 /* Add additional parts. */
5616                                 SvCUR_set(data->last_found,
5617                                           SvCUR(data->last_found) - l);
5618                                 sv_catsv(data->last_found, last_str);
5619                                 {
5620                                     SV * sv = data->last_found;
5621                                     MAGIC *mg =
5622                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5623                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5624                                     if (mg && mg->mg_len >= 0)
5625                                         mg->mg_len += last_chrs * (mincount-1);
5626                                 }
5627                                 last_chrs *= mincount;
5628                                 data->last_end += l * (mincount - 1);
5629                             }
5630                         } else {
5631                             /* start offset must point into the last copy */
5632                             data->last_start_min += minnext * (mincount - 1);
5633                             data->last_start_max =
5634                               is_inf
5635                                ? SSize_t_MAX
5636                                : data->last_start_max +
5637                                  (maxcount - 1) * (minnext + data->pos_delta);
5638                         }
5639                     }
5640                     /* It is counted once already... */
5641                     data->pos_min += minnext * (mincount - counted);
5642 #if 0
5643 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5644                               " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5645                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5646     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5647     (UV)mincount);
5648 if (deltanext != SSize_t_MAX)
5649 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5650     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5651           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5652 #endif
5653                     if (deltanext == SSize_t_MAX
5654                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5655                         data->pos_delta = SSize_t_MAX;
5656                     else
5657                         data->pos_delta += - counted * deltanext +
5658                         (minnext + deltanext) * maxcount - minnext * mincount;
5659                     if (mincount != maxcount) {
5660                          /* Cannot extend fixed substrings found inside
5661                             the group.  */
5662                         scan_commit(pRExC_state, data, minlenp, is_inf);
5663                         if (mincount && last_str) {
5664                             SV * const sv = data->last_found;
5665                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5666                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5667
5668                             if (mg)
5669                                 mg->mg_len = -1;
5670                             sv_setsv(sv, last_str);
5671                             data->last_end = data->pos_min;
5672                             data->last_start_min = data->pos_min - last_chrs;
5673                             data->last_start_max = is_inf
5674                                 ? SSize_t_MAX
5675                                 : data->pos_min + data->pos_delta - last_chrs;
5676                         }
5677                         data->cur_is_floating = 1; /* float */
5678                     }
5679                     SvREFCNT_dec(last_str);
5680                 }
5681                 if (data && (fl & SF_HAS_EVAL))
5682                     data->flags |= SF_HAS_EVAL;
5683               optimize_curly_tail:
5684                 if (OP(oscan) != CURLYX) {
5685                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5686                            && NEXT_OFF(next))
5687                         NEXT_OFF(oscan) += NEXT_OFF(next);
5688                 }
5689                 continue;
5690
5691             default:
5692 #ifdef DEBUGGING
5693                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5694                                                                     OP(scan));
5695 #endif
5696             case REF:
5697             case CLUMP:
5698                 if (flags & SCF_DO_SUBSTR) {
5699                     /* Cannot expect anything... */
5700                     scan_commit(pRExC_state, data, minlenp, is_inf);
5701                     data->cur_is_floating = 1; /* float */
5702                 }
5703                 is_inf = is_inf_internal = 1;
5704                 if (flags & SCF_DO_STCLASS_OR) {
5705                     if (OP(scan) == CLUMP) {
5706                         /* Actually is any start char, but very few code points
5707                          * aren't start characters */
5708                         ssc_match_all_cp(data->start_class);
5709                     }
5710                     else {
5711                         ssc_anything(data->start_class);
5712                     }
5713                 }
5714                 flags &= ~SCF_DO_STCLASS;
5715                 break;
5716             }
5717         }
5718         else if (OP(scan) == LNBREAK) {
5719             if (flags & SCF_DO_STCLASS) {
5720                 if (flags & SCF_DO_STCLASS_AND) {
5721                     ssc_intersection(data->start_class,
5722                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5723                     ssc_clear_locale(data->start_class);
5724                     ANYOF_FLAGS(data->start_class)
5725                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5726                 }
5727                 else if (flags & SCF_DO_STCLASS_OR) {
5728                     ssc_union(data->start_class,
5729                               PL_XPosix_ptrs[_CC_VERTSPACE],
5730                               FALSE);
5731                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5732
5733                     /* See commit msg for
5734                      * 749e076fceedeb708a624933726e7989f2302f6a */
5735                     ANYOF_FLAGS(data->start_class)
5736                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5737                 }
5738                 flags &= ~SCF_DO_STCLASS;
5739             }
5740             min++;
5741             if (delta != SSize_t_MAX)
5742                 delta++;    /* Because of the 2 char string cr-lf */
5743             if (flags & SCF_DO_SUBSTR) {
5744                 /* Cannot expect anything... */
5745                 scan_commit(pRExC_state, data, minlenp, is_inf);
5746                 data->pos_min += 1;
5747                 if (data->pos_delta != SSize_t_MAX) {
5748                     data->pos_delta += 1;
5749                 }
5750                 data->cur_is_floating = 1; /* float */
5751             }
5752         }
5753         else if (REGNODE_SIMPLE(OP(scan))) {
5754
5755             if (flags & SCF_DO_SUBSTR) {
5756                 scan_commit(pRExC_state, data, minlenp, is_inf);
5757                 data->pos_min++;
5758             }
5759             min++;
5760             if (flags & SCF_DO_STCLASS) {
5761                 bool invert = 0;
5762                 SV* my_invlist = NULL;
5763                 U8 namedclass;
5764
5765                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5766                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5767
5768                 /* Some of the logic below assumes that switching
5769                    locale on will only add false positives. */
5770                 switch (OP(scan)) {
5771
5772                 default:
5773 #ifdef DEBUGGING
5774                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5775                                                                      OP(scan));
5776 #endif
5777                 case SANY:
5778                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5779                         ssc_match_all_cp(data->start_class);
5780                     break;
5781
5782                 case REG_ANY:
5783                     {
5784                         SV* REG_ANY_invlist = _new_invlist(2);
5785                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5786                                                             '\n');
5787                         if (flags & SCF_DO_STCLASS_OR) {
5788                             ssc_union(data->start_class,
5789                                       REG_ANY_invlist,
5790                                       TRUE /* TRUE => invert, hence all but \n
5791                                             */
5792                                       );
5793                         }
5794                         else if (flags & SCF_DO_STCLASS_AND) {
5795                             ssc_intersection(data->start_class,
5796                                              REG_ANY_invlist,
5797                                              TRUE  /* TRUE => invert */
5798                                              );
5799                             ssc_clear_locale(data->start_class);
5800                         }
5801                         SvREFCNT_dec_NN(REG_ANY_invlist);
5802                     }
5803                     break;
5804
5805                 case ANYOFD:
5806                 case ANYOFL:
5807                 case ANYOFPOSIXL:
5808                 case ANYOFH:
5809                 case ANYOF:
5810                     if (flags & SCF_DO_STCLASS_AND)
5811                         ssc_and(pRExC_state, data->start_class,
5812                                 (regnode_charclass *) scan);
5813                     else
5814                         ssc_or(pRExC_state, data->start_class,
5815                                                           (regnode_charclass *) scan);
5816                     break;
5817
5818                 case NANYOFM:
5819                 case ANYOFM:
5820                   {
5821                     SV* cp_list = get_ANYOFM_contents(scan);
5822
5823                     if (flags & SCF_DO_STCLASS_OR) {
5824                         ssc_union(data->start_class, cp_list, invert);
5825                     }
5826                     else if (flags & SCF_DO_STCLASS_AND) {
5827                         ssc_intersection(data->start_class, cp_list, invert);
5828                     }
5829
5830                     SvREFCNT_dec_NN(cp_list);
5831                     break;
5832                   }
5833
5834                 case NPOSIXL:
5835                     invert = 1;
5836                     /* FALLTHROUGH */
5837
5838                 case POSIXL:
5839                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5840                     if (flags & SCF_DO_STCLASS_AND) {
5841                         bool was_there = cBOOL(
5842                                           ANYOF_POSIXL_TEST(data->start_class,
5843                                                                  namedclass));
5844                         ANYOF_POSIXL_ZERO(data->start_class);
5845                         if (was_there) {    /* Do an AND */
5846                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5847                         }
5848                         /* No individual code points can now match */
5849                         data->start_class->invlist
5850                                                 = sv_2mortal(_new_invlist(0));
5851                     }
5852                     else {
5853                         int complement = namedclass + ((invert) ? -1 : 1);
5854
5855                         assert(flags & SCF_DO_STCLASS_OR);
5856
5857                         /* If the complement of this class was already there,
5858                          * the result is that they match all code points,
5859                          * (\d + \D == everything).  Remove the classes from
5860                          * future consideration.  Locale is not relevant in
5861                          * this case */
5862                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5863                             ssc_match_all_cp(data->start_class);
5864                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5865                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5866                         }
5867                         else {  /* The usual case; just add this class to the
5868                                    existing set */
5869                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5870                         }
5871                     }
5872                     break;
5873
5874                 case NPOSIXA:   /* For these, we always know the exact set of
5875                                    what's matched */
5876                     invert = 1;
5877                     /* FALLTHROUGH */
5878                 case POSIXA:
5879                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
5880                     goto join_posix_and_ascii;
5881
5882                 case NPOSIXD:
5883                 case NPOSIXU:
5884                     invert = 1;
5885                     /* FALLTHROUGH */
5886                 case POSIXD:
5887                 case POSIXU:
5888                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
5889
5890                     /* NPOSIXD matches all upper Latin1 code points unless the
5891                      * target string being matched is UTF-8, which is
5892                      * unknowable until match time.  Since we are going to
5893                      * invert, we want to get rid of all of them so that the
5894                      * inversion will match all */
5895                     if (OP(scan) == NPOSIXD) {
5896                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5897                                           &my_invlist);
5898                     }
5899
5900                   join_posix_and_ascii:
5901
5902                     if (flags & SCF_DO_STCLASS_AND) {
5903                         ssc_intersection(data->start_class, my_invlist, invert);
5904                         ssc_clear_locale(data->start_class);
5905                     }
5906                     else {
5907                         assert(flags & SCF_DO_STCLASS_OR);
5908                         ssc_union(data->start_class, my_invlist, invert);
5909                     }
5910                     SvREFCNT_dec(my_invlist);
5911                 }
5912                 if (flags & SCF_DO_STCLASS_OR)
5913                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5914                 flags &= ~SCF_DO_STCLASS;
5915             }
5916         }
5917         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5918             data->flags |= (OP(scan) == MEOL
5919                             ? SF_BEFORE_MEOL
5920                             : SF_BEFORE_SEOL);
5921             scan_commit(pRExC_state, data, minlenp, is_inf);
5922
5923         }
5924         else if (  PL_regkind[OP(scan)] == BRANCHJ
5925                  /* Lookbehind, or need to calculate parens/evals/stclass: */
5926                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
5927                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5928         {
5929             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5930                 || OP(scan) == UNLESSM )
5931             {
5932                 /* Negative Lookahead/lookbehind
5933                    In this case we can't do fixed string optimisation.
5934                 */
5935
5936                 SSize_t deltanext, minnext, fake = 0;
5937                 regnode *nscan;
5938                 regnode_ssc intrnl;
5939                 int f = 0;
5940
5941                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5942                 if (data) {
5943                     data_fake.whilem_c = data->whilem_c;
5944                     data_fake.last_closep = data->last_closep;
5945                 }
5946                 else
5947                     data_fake.last_closep = &fake;
5948                 data_fake.pos_delta = delta;
5949                 if ( flags & SCF_DO_STCLASS && !scan->flags
5950                      && OP(scan) == IFMATCH ) { /* Lookahead */
5951                     ssc_init(pRExC_state, &intrnl);
5952                     data_fake.start_class = &intrnl;
5953                     f |= SCF_DO_STCLASS_AND;
5954                 }
5955                 if (flags & SCF_WHILEM_VISITED_POS)
5956                     f |= SCF_WHILEM_VISITED_POS;
5957                 next = regnext(scan);
5958                 nscan = NEXTOPER(NEXTOPER(scan));
5959
5960                 /* recurse study_chunk() for lookahead body */
5961                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5962                                       last, &data_fake, stopparen,
5963                                       recursed_depth, NULL, f, depth+1);
5964                 if (scan->flags) {
5965                     if (deltanext) {
5966                         FAIL("Variable length lookbehind not implemented");
5967                     }
5968                     else if (minnext > (I32)U8_MAX) {
5969                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
5970                               (UV)U8_MAX);
5971                     }
5972                     scan->flags = (U8)minnext;
5973                 }
5974                 if (data) {
5975                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5976                         pars++;
5977                     if (data_fake.flags & SF_HAS_EVAL)
5978                         data->flags |= SF_HAS_EVAL;
5979                     data->whilem_c = data_fake.whilem_c;
5980                 }
5981                 if (f & SCF_DO_STCLASS_AND) {
5982                     if (flags & SCF_DO_STCLASS_OR) {
5983                         /* OR before, AND after: ideally we would recurse with
5984                          * data_fake to get the AND applied by study of the
5985                          * remainder of the pattern, and then derecurse;
5986                          * *** HACK *** for now just treat as "no information".
5987                          * See [perl #56690].
5988                          */
5989                         ssc_init(pRExC_state, data->start_class);
5990                     }  else {
5991                         /* AND before and after: combine and continue.  These
5992                          * assertions are zero-length, so can match an EMPTY
5993                          * string */
5994                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5995                         ANYOF_FLAGS(data->start_class)
5996                                                    |= SSC_MATCHES_EMPTY_STRING;
5997                     }
5998                 }
5999             }
6000 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6001             else {
6002                 /* Positive Lookahead/lookbehind
6003                    In this case we can do fixed string optimisation,
6004                    but we must be careful about it. Note in the case of
6005                    lookbehind the positions will be offset by the minimum
6006                    length of the pattern, something we won't know about
6007                    until after the recurse.
6008                 */
6009                 SSize_t deltanext, fake = 0;
6010                 regnode *nscan;
6011                 regnode_ssc intrnl;
6012                 int f = 0;
6013                 /* We use SAVEFREEPV so that when the full compile
6014                     is finished perl will clean up the allocated
6015                     minlens when it's all done. This way we don't
6016                     have to worry about freeing them when we know
6017                     they wont be used, which would be a pain.
6018                  */
6019                 SSize_t *minnextp;
6020                 Newx( minnextp, 1, SSize_t );
6021                 SAVEFREEPV(minnextp);
6022
6023                 if (data) {
6024                     StructCopy(data, &data_fake, scan_data_t);
6025                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6026                         f |= SCF_DO_SUBSTR;
6027                         if (scan->flags)
6028                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6029                         data_fake.last_found=newSVsv(data->last_found);
6030                     }
6031                 }
6032                 else
6033                     data_fake.last_closep = &fake;
6034                 data_fake.flags = 0;
6035                 data_fake.substrs[0].flags = 0;
6036                 data_fake.substrs[1].flags = 0;
6037                 data_fake.pos_delta = delta;
6038                 if (is_inf)
6039                     data_fake.flags |= SF_IS_INF;
6040                 if ( flags & SCF_DO_STCLASS && !scan->flags
6041                      && OP(scan) == IFMATCH ) { /* Lookahead */
6042                     ssc_init(pRExC_state, &intrnl);
6043                     data_fake.start_class = &intrnl;
6044                     f |= SCF_DO_STCLASS_AND;
6045                 }
6046                 if (flags & SCF_WHILEM_VISITED_POS)
6047                     f |= SCF_WHILEM_VISITED_POS;
6048                 next = regnext(scan);
6049                 nscan = NEXTOPER(NEXTOPER(scan));
6050
6051                 /* positive lookahead study_chunk() recursion */
6052                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6053                                         &deltanext, last, &data_fake,
6054                                         stopparen, recursed_depth, NULL,
6055                                         f, depth+1);
6056                 if (scan->flags) {
6057                     if (deltanext) {
6058                         FAIL("Variable length lookbehind not implemented");
6059                     }
6060                     else if (*minnextp > (I32)U8_MAX) {
6061                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6062                               (UV)U8_MAX);
6063                     }
6064                     scan->flags = (U8)*minnextp;
6065                 }
6066
6067                 *minnextp += min;
6068
6069                 if (f & SCF_DO_STCLASS_AND) {
6070                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6071                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6072                 }
6073                 if (data) {
6074                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6075                         pars++;
6076                     if (data_fake.flags & SF_HAS_EVAL)
6077                         data->flags |= SF_HAS_EVAL;
6078                     data->whilem_c = data_fake.whilem_c;
6079                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6080                         int i;
6081                         if (RExC_rx->minlen<*minnextp)
6082                             RExC_rx->minlen=*minnextp;
6083                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6084                         SvREFCNT_dec_NN(data_fake.last_found);
6085
6086                         for (i = 0; i < 2; i++) {
6087                             if (data_fake.substrs[i].minlenp != minlenp) {
6088                                 data->substrs[i].min_offset =
6089                                             data_fake.substrs[i].min_offset;
6090                                 data->substrs[i].max_offset =
6091                                             data_fake.substrs[i].max_offset;
6092                                 data->substrs[i].minlenp =
6093                                             data_fake.substrs[i].minlenp;
6094                                 data->substrs[i].lookbehind += scan->flags;
6095                             }
6096                         }
6097                     }
6098                 }
6099             }
6100 #endif
6101         }
6102
6103         else if (OP(scan) == OPEN) {
6104             if (stopparen != (I32)ARG(scan))
6105                 pars++;
6106         }
6107         else if (OP(scan) == CLOSE) {
6108             if (stopparen == (I32)ARG(scan)) {
6109                 break;
6110             }
6111             if ((I32)ARG(scan) == is_par) {
6112                 next = regnext(scan);
6113
6114                 if ( next && (OP(next) != WHILEM) && next < last)
6115                     is_par = 0;         /* Disable optimization */
6116             }
6117             if (data)
6118                 *(data->last_closep) = ARG(scan);
6119         }
6120         else if (OP(scan) == EVAL) {
6121                 if (data)
6122                     data->flags |= SF_HAS_EVAL;
6123         }
6124         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6125             if (flags & SCF_DO_SUBSTR) {
6126                 scan_commit(pRExC_state, data, minlenp, is_inf);
6127                 flags &= ~SCF_DO_SUBSTR;
6128             }
6129             if (data && OP(scan)==ACCEPT) {
6130                 data->flags |= SCF_SEEN_ACCEPT;
6131                 if (stopmin > min)
6132                     stopmin = min;
6133             }
6134         }
6135         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6136         {
6137                 if (flags & SCF_DO_SUBSTR) {
6138                     scan_commit(pRExC_state, data, minlenp, is_inf);
6139                     data->cur_is_floating = 1; /* float */
6140                 }
6141                 is_inf = is_inf_internal = 1;
6142                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6143                     ssc_anything(data->start_class);
6144                 flags &= ~SCF_DO_STCLASS;
6145         }
6146         else if (OP(scan) == GPOS) {
6147             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6148                 !(delta || is_inf || (data && data->pos_delta)))
6149             {
6150                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6151                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
6152                 if (RExC_rx->gofs < (STRLEN)min)
6153                     RExC_rx->gofs = min;
6154             } else {
6155                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6156                 RExC_rx->gofs = 0;
6157             }
6158         }
6159 #ifdef TRIE_STUDY_OPT
6160 #ifdef FULL_TRIE_STUDY
6161         else if (PL_regkind[OP(scan)] == TRIE) {
6162             /* NOTE - There is similar code to this block above for handling
6163                BRANCH nodes on the initial study.  If you change stuff here
6164                check there too. */
6165             regnode *trie_node= scan;
6166             regnode *tail= regnext(scan);
6167             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6168             SSize_t max1 = 0, min1 = SSize_t_MAX;
6169             regnode_ssc accum;
6170
6171             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6172                 /* Cannot merge strings after this. */
6173                 scan_commit(pRExC_state, data, minlenp, is_inf);
6174             }
6175             if (flags & SCF_DO_STCLASS)
6176                 ssc_init_zero(pRExC_state, &accum);
6177
6178             if (!trie->jump) {
6179                 min1= trie->minlen;
6180                 max1= trie->maxlen;
6181             } else {
6182                 const regnode *nextbranch= NULL;
6183                 U32 word;
6184
6185                 for ( word=1 ; word <= trie->wordcount ; word++)
6186                 {
6187                     SSize_t deltanext=0, minnext=0, f = 0, fake;
6188                     regnode_ssc this_class;
6189
6190                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6191                     if (data) {
6192                         data_fake.whilem_c = data->whilem_c;
6193                         data_fake.last_closep = data->last_closep;
6194                     }
6195                     else
6196                         data_fake.last_closep = &fake;
6197                     data_fake.pos_delta = delta;
6198                     if (flags & SCF_DO_STCLASS) {
6199                         ssc_init(pRExC_state, &this_class);
6200                         data_fake.start_class = &this_class;
6201                         f = SCF_DO_STCLASS_AND;
6202                     }
6203                     if (flags & SCF_WHILEM_VISITED_POS)
6204                         f |= SCF_WHILEM_VISITED_POS;
6205
6206                     if (trie->jump[word]) {
6207                         if (!nextbranch)
6208                             nextbranch = trie_node + trie->jump[0];
6209                         scan= trie_node + trie->jump[word];
6210                         /* We go from the jump point to the branch that follows
6211                            it. Note this means we need the vestigal unused
6212                            branches even though they arent otherwise used. */
6213                         /* optimise study_chunk() for TRIE */
6214                         minnext = study_chunk(pRExC_state, &scan, minlenp,
6215                             &deltanext, (regnode *)nextbranch, &data_fake,
6216                             stopparen, recursed_depth, NULL, f, depth+1);
6217                     }
6218                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6219                         nextbranch= regnext((regnode*)nextbranch);
6220
6221                     if (min1 > (SSize_t)(minnext + trie->minlen))
6222                         min1 = minnext + trie->minlen;
6223                     if (deltanext == SSize_t_MAX) {
6224                         is_inf = is_inf_internal = 1;
6225                         max1 = SSize_t_MAX;
6226                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6227                         max1 = minnext + deltanext + trie->maxlen;
6228
6229                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6230                         pars++;
6231                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
6232                         if ( stopmin > min + min1)
6233                             stopmin = min + min1;
6234                         flags &= ~SCF_DO_SUBSTR;
6235                         if (data)
6236                             data->flags |= SCF_SEEN_ACCEPT;
6237                     }
6238                     if (data) {
6239                         if (data_fake.flags & SF_HAS_EVAL)
6240                             data->flags |= SF_HAS_EVAL;
6241                         data->whilem_c = data_fake.whilem_c;
6242                     }
6243                     if (flags & SCF_DO_STCLASS)
6244                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6245                 }
6246             }
6247             if (flags & SCF_DO_SUBSTR) {
6248                 data->pos_min += min1;
6249                 data->pos_delta += max1 - min1;
6250                 if (max1 != min1 || is_inf)
6251                     data->cur_is_floating = 1; /* float */
6252             }
6253             min += min1;
6254             if (delta != SSize_t_MAX) {
6255                 if (SSize_t_MAX - (max1 - min1) >= delta)
6256                     delta += max1 - min1;
6257                 else
6258                     delta = SSize_t_MAX;
6259             }
6260             if (flags & SCF_DO_STCLASS_OR) {
6261                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6262                 if (min1) {
6263                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6264                     flags &= ~SCF_DO_STCLASS;
6265                 }
6266             }
6267             else if (flags & SCF_DO_STCLASS_AND) {
6268                 if (min1) {
6269                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6270                     flags &= ~SCF_DO_STCLASS;
6271                 }
6272                 else {
6273                     /* Switch to OR mode: cache the old value of
6274                      * data->start_class */
6275                     INIT_AND_WITHP;
6276                     StructCopy(data->start_class, and_withp, regnode_ssc);
6277                     flags &= ~SCF_DO_STCLASS_AND;
6278                     StructCopy(&accum, data->start_class, regnode_ssc);
6279                     flags |= SCF_DO_STCLASS_OR;
6280                 }
6281             }
6282             scan= tail;
6283             continue;
6284         }
6285 #else
6286         else if (PL_regkind[OP(scan)] == TRIE) {
6287             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6288             U8*bang=NULL;
6289
6290             min += trie->minlen;
6291             delta += (trie->maxlen - trie->minlen);
6292             flags &= ~SCF_DO_STCLASS; /* xxx */
6293             if (flags & SCF_DO_SUBSTR) {
6294                 /* Cannot expect anything... */
6295                 scan_commit(pRExC_state, data, minlenp, is_inf);
6296                 data->pos_min += trie->minlen;
6297                 data->pos_delta += (trie->maxlen - trie->minlen);
6298                 if (trie->maxlen != trie->minlen)
6299                     data->cur_is_floating = 1; /* float */
6300             }
6301             if (trie->jump) /* no more substrings -- for now /grr*/
6302                flags &= ~SCF_DO_SUBSTR;
6303         }
6304 #endif /* old or new */
6305 #endif /* TRIE_STUDY_OPT */
6306
6307         /* Else: zero-length, ignore. */
6308         scan = regnext(scan);
6309     }
6310
6311   finish:
6312     if (frame) {
6313         /* we need to unwind recursion. */
6314         depth = depth - 1;
6315
6316         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6317         DEBUG_PEEP("fend", scan, depth, flags);
6318
6319         /* restore previous context */
6320         last = frame->last_regnode;
6321         scan = frame->next_regnode;
6322         stopparen = frame->stopparen;
6323         recursed_depth = frame->prev_recursed_depth;
6324
6325         RExC_frame_last = frame->prev_frame;
6326         frame = frame->this_prev_frame;
6327         goto fake_study_recurse;
6328     }
6329
6330     assert(!frame);
6331     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6332
6333     *scanp = scan;
6334     *deltap = is_inf_internal ? SSize_t_MAX : delta;
6335
6336     if (flags & SCF_DO_SUBSTR && is_inf)
6337         data->pos_delta = SSize_t_MAX - data->pos_min;
6338     if (is_par > (I32)U8_MAX)
6339         is_par = 0;
6340     if (is_par && pars==1 && data) {
6341         data->flags |= SF_IN_PAR;
6342         data->flags &= ~SF_HAS_PAR;
6343     }
6344     else if (pars && data) {
6345         data->flags |= SF_HAS_PAR;
6346         data->flags &= ~SF_IN_PAR;
6347     }
6348     if (flags & SCF_DO_STCLASS_OR)
6349         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6350     if (flags & SCF_TRIE_RESTUDY)
6351         data->flags |=  SCF_TRIE_RESTUDY;
6352
6353     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6354
6355     {
6356         SSize_t final_minlen= min < stopmin ? min : stopmin;
6357
6358         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6359             if (final_minlen > SSize_t_MAX - delta)
6360                 RExC_maxlen = SSize_t_MAX;
6361             else if (RExC_maxlen < final_minlen + delta)
6362                 RExC_maxlen = final_minlen + delta;
6363         }
6364         return final_minlen;
6365     }
6366     NOT_REACHED; /* NOTREACHED */
6367 }
6368
6369 STATIC U32
6370 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6371 {
6372     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6373
6374     PERL_ARGS_ASSERT_ADD_DATA;
6375
6376     Renewc(RExC_rxi->data,
6377            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6378            char, struct reg_data);
6379     if(count)
6380         Renew(RExC_rxi->data->what, count + n, U8);
6381     else
6382         Newx(RExC_rxi->data->what, n, U8);
6383     RExC_rxi->data->count = count + n;
6384     Copy(s, RExC_rxi->data->what + count, n, U8);
6385     return count;
6386 }
6387
6388 /*XXX: todo make this not included in a non debugging perl, but appears to be
6389  * used anyway there, in 'use re' */
6390 #ifndef PERL_IN_XSUB_RE
6391 void
6392 Perl_reginitcolors(pTHX)
6393 {
6394     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6395     if (s) {
6396         char *t = savepv(s);
6397         int i = 0;
6398         PL_colors[0] = t;
6399         while (++i < 6) {
6400             t = strchr(t, '\t');
6401             if (t) {
6402                 *t = '\0';
6403                 PL_colors[i] = ++t;
6404             }
6405             else
6406                 PL_colors[i] = t = (char *)"";
6407         }
6408     } else {
6409         int i = 0;
6410         while (i < 6)
6411             PL_colors[i++] = (char *)"";
6412     }
6413     PL_colorset = 1;
6414 }
6415 #endif
6416
6417
6418 #ifdef TRIE_STUDY_OPT
6419 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6420     STMT_START {                                            \
6421         if (                                                \
6422               (data.flags & SCF_TRIE_RESTUDY)               \
6423               && ! restudied++                              \
6424         ) {                                                 \
6425             dOsomething;                                    \
6426             goto reStudy;                                   \
6427         }                                                   \
6428     } STMT_END
6429 #else
6430 #define CHECK_RESTUDY_GOTO_butfirst
6431 #endif
6432
6433 /*
6434  * pregcomp - compile a regular expression into internal code
6435  *
6436  * Decides which engine's compiler to call based on the hint currently in
6437  * scope
6438  */
6439
6440 #ifndef PERL_IN_XSUB_RE
6441
6442 /* return the currently in-scope regex engine (or the default if none)  */
6443
6444 regexp_engine const *
6445 Perl_current_re_engine(pTHX)
6446 {
6447     if (IN_PERL_COMPILETIME) {
6448         HV * const table = GvHV(PL_hintgv);
6449         SV **ptr;
6450
6451         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6452             return &PL_core_reg_engine;
6453         ptr = hv_fetchs(table, "regcomp", FALSE);
6454         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6455             return &PL_core_reg_engine;
6456         return INT2PTR(regexp_engine*, SvIV(*ptr));
6457     }
6458     else {
6459         SV *ptr;
6460         if (!PL_curcop->cop_hints_hash)
6461             return &PL_core_reg_engine;
6462         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6463         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6464             return &PL_core_reg_engine;
6465         return INT2PTR(regexp_engine*, SvIV(ptr));
6466     }
6467 }
6468
6469
6470 REGEXP *
6471 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6472 {
6473     regexp_engine const *eng = current_re_engine();
6474     GET_RE_DEBUG_FLAGS_DECL;
6475
6476     PERL_ARGS_ASSERT_PREGCOMP;
6477
6478     /* Dispatch a request to compile a regexp to correct regexp engine. */
6479     DEBUG_COMPILE_r({
6480         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6481                         PTR2UV(eng));
6482     });
6483     return CALLREGCOMP_ENG(eng, pattern, flags);
6484 }
6485 #endif
6486
6487 /* public(ish) entry point for the perl core's own regex compiling code.
6488  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6489  * pattern rather than a list of OPs, and uses the internal engine rather
6490  * than the current one */
6491
6492 REGEXP *
6493 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6494 {
6495     SV *pat = pattern; /* defeat constness! */
6496     PERL_ARGS_ASSERT_RE_COMPILE;
6497     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6498 #ifdef PERL_IN_XSUB_RE
6499                                 &my_reg_engine,
6500 #else
6501                                 &PL_core_reg_engine,
6502 #endif
6503                                 NULL, NULL, rx_flags, 0);
6504 }
6505
6506
6507 static void
6508 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6509 {
6510     int n;
6511
6512     if (--cbs->refcnt > 0)
6513         return;
6514     for (n = 0; n < cbs->count; n++) {
6515         REGEXP *rx = cbs->cb[n].src_regex;
6516         if (rx) {
6517             cbs->cb[n].src_regex = NULL;
6518             SvREFCNT_dec_NN(rx);
6519         }
6520     }
6521     Safefree(cbs->cb);
6522     Safefree(cbs);
6523 }
6524
6525
6526 static struct reg_code_blocks *
6527 S_alloc_code_blocks(pTHX_  int ncode)
6528 {
6529      struct reg_code_blocks *cbs;
6530     Newx(cbs, 1, struct reg_code_blocks);
6531     cbs->count = ncode;
6532     cbs->refcnt = 1;
6533     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6534     if (ncode)
6535         Newx(cbs->cb, ncode, struct reg_code_block);
6536     else
6537         cbs->cb = NULL;
6538     return cbs;
6539 }
6540
6541
6542 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6543  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6544  * point to the realloced string and length.
6545  *
6546  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6547  * stuff added */
6548
6549 static void
6550 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6551                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
6552 {
6553     U8 *const src = (U8*)*pat_p;
6554     U8 *dst, *d;
6555     int n=0;
6556     STRLEN s = 0;
6557     bool do_end = 0;
6558     GET_RE_DEBUG_FLAGS_DECL;
6559
6560     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6561         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6562
6563     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6564     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6565     d = dst;
6566
6567     while (s < *plen_p) {
6568         append_utf8_from_native_byte(src[s], &d);
6569
6570         if (n < num_code_blocks) {
6571             assert(pRExC_state->code_blocks);
6572             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6573                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6574                 assert(*(d - 1) == '(');
6575                 do_end = 1;
6576             }
6577             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6578                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6579                 assert(*(d - 1) == ')');
6580                 do_end = 0;
6581                 n++;
6582             }
6583         }
6584         s++;
6585     }
6586     *d = '\0';
6587     *plen_p = d - dst;
6588     *pat_p = (char*) dst;
6589     SAVEFREEPV(*pat_p);
6590     RExC_orig_utf8 = RExC_utf8 = 1;
6591 }
6592
6593
6594
6595 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6596  * while recording any code block indices, and handling overloading,
6597  * nested qr// objects etc.  If pat is null, it will allocate a new
6598  * string, or just return the first arg, if there's only one.
6599  *
6600  * Returns the malloced/updated pat.
6601  * patternp and pat_count is the array of SVs to be concatted;
6602  * oplist is the optional list of ops that generated the SVs;
6603  * recompile_p is a pointer to a boolean that will be set if
6604  *   the regex will need to be recompiled.
6605  * delim, if non-null is an SV that will be inserted between each element
6606  */
6607
6608 static SV*
6609 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6610                 SV *pat, SV ** const patternp, int pat_count,
6611                 OP *oplist, bool *recompile_p, SV *delim)
6612 {
6613     SV **svp;
6614     int n = 0;
6615     bool use_delim = FALSE;
6616     bool alloced = FALSE;
6617
6618     /* if we know we have at least two args, create an empty string,
6619      * then concatenate args to that. For no args, return an empty string */
6620     if (!pat && pat_count != 1) {
6621         pat = newSVpvs("");
6622         SAVEFREESV(pat);
6623         alloced = TRUE;
6624     }
6625
6626     for (svp = patternp; svp < patternp + pat_count; svp++) {
6627         SV *sv;
6628         SV *rx  = NULL;
6629         STRLEN orig_patlen = 0;
6630         bool code = 0;
6631         SV *msv = use_delim ? delim : *svp;
6632         if (!msv) msv = &PL_sv_undef;
6633
6634         /* if we've got a delimiter, we go round the loop twice for each
6635          * svp slot (except the last), using the delimiter the second
6636          * time round */
6637         if (use_delim) {
6638             svp--;
6639             use_delim = FALSE;
6640         }
6641         else if (delim)
6642             use_delim = TRUE;
6643
6644         if (SvTYPE(msv) == SVt_PVAV) {
6645             /* we've encountered an interpolated array within
6646              * the pattern, e.g. /...@a..../. Expand the list of elements,
6647              * then recursively append elements.
6648              * The code in this block is based on S_pushav() */
6649
6650             AV *const av = (AV*)msv;
6651             const SSize_t maxarg = AvFILL(av) + 1;
6652             SV **array;
6653
6654             if (oplist) {
6655                 assert(oplist->op_type == OP_PADAV
6656                     || oplist->op_type == OP_RV2AV);
6657                 oplist = OpSIBLING(oplist);
6658             }
6659
6660             if (SvRMAGICAL(av)) {
6661                 SSize_t i;
6662
6663                 Newx(array, maxarg, SV*);
6664                 SAVEFREEPV(array);
6665                 for (i=0; i < maxarg; i++) {
6666                     SV ** const svp = av_fetch(av, i, FALSE);
6667                     array[i] = svp ? *svp : &PL_sv_undef;
6668                 }
6669             }
6670             else
6671                 array = AvARRAY(av);
6672
6673             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6674                                 array, maxarg, NULL, recompile_p,
6675                                 /* $" */
6676                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6677
6678             continue;
6679         }
6680
6681
6682         /* we make the assumption here that each op in the list of
6683          * op_siblings maps to one SV pushed onto the stack,
6684          * except for code blocks, with have both an OP_NULL and
6685          * and OP_CONST.
6686          * This allows us to match up the list of SVs against the
6687          * list of OPs to find the next code block.
6688          *
6689          * Note that       PUSHMARK PADSV PADSV ..
6690          * is optimised to
6691          *                 PADRANGE PADSV  PADSV  ..
6692          * so the alignment still works. */
6693
6694         if (oplist) {
6695             if (oplist->op_type == OP_NULL
6696                 && (oplist->op_flags & OPf_SPECIAL))
6697             {
6698                 assert(n < pRExC_state->code_blocks->count);
6699                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6700                 pRExC_state->code_blocks->cb[n].block = oplist;
6701                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6702                 n++;
6703                 code = 1;
6704                 oplist = OpSIBLING(oplist); /* skip CONST */
6705                 assert(oplist);
6706             }
6707             oplist = OpSIBLING(oplist);;
6708         }
6709
6710         /* apply magic and QR overloading to arg */
6711
6712         SvGETMAGIC(msv);
6713         if (SvROK(msv) && SvAMAGIC(msv)) {
6714             SV *sv = AMG_CALLunary(msv, regexp_amg);
6715             if (sv) {
6716                 if (SvROK(sv))
6717                     sv = SvRV(sv);
6718                 if (SvTYPE(sv) != SVt_REGEXP)
6719                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6720                 msv = sv;
6721             }
6722         }
6723
6724         /* try concatenation overload ... */
6725         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6726                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6727         {
6728             sv_setsv(pat, sv);
6729             /* overloading involved: all bets are off over literal
6730              * code. Pretend we haven't seen it */
6731             if (n)
6732                 pRExC_state->code_blocks->count -= n;
6733             n = 0;
6734         }
6735         else  {
6736             /* ... or failing that, try "" overload */
6737             while (SvAMAGIC(msv)
6738                     && (sv = AMG_CALLunary(msv, string_amg))
6739                     && sv != msv
6740                     &&  !(   SvROK(msv)
6741                           && SvROK(sv)
6742                           && SvRV(msv) == SvRV(sv))
6743             ) {
6744                 msv = sv;
6745                 SvGETMAGIC(msv);
6746             }
6747             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6748                 msv = SvRV(msv);
6749
6750             if (pat) {
6751                 /* this is a partially unrolled
6752                  *     sv_catsv_nomg(pat, msv);
6753                  * that allows us to adjust code block indices if
6754                  * needed */
6755                 STRLEN dlen;
6756                 char *dst = SvPV_force_nomg(pat, dlen);
6757                 orig_patlen = dlen;
6758                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6759                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6760                     sv_setpvn(pat, dst, dlen);
6761                     SvUTF8_on(pat);
6762                 }
6763                 sv_catsv_nomg(pat, msv);
6764                 rx = msv;
6765             }
6766             else {
6767                 /* We have only one SV to process, but we need to verify
6768                  * it is properly null terminated or we will fail asserts
6769                  * later. In theory we probably shouldn't get such SV's,
6770                  * but if we do we should handle it gracefully. */
6771                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6772                     /* not a string, or a string with a trailing null */
6773                     pat = msv;
6774                 } else {
6775                     /* a string with no trailing null, we need to copy it
6776                      * so it has a trailing null */
6777                     pat = sv_2mortal(newSVsv(msv));
6778                 }
6779             }
6780
6781             if (code)
6782                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6783         }
6784
6785         /* extract any code blocks within any embedded qr//'s */
6786         if (rx && SvTYPE(rx) == SVt_REGEXP
6787             && RX_ENGINE((REGEXP*)rx)->op_comp)
6788         {
6789
6790             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6791             if (ri->code_blocks && ri->code_blocks->count) {
6792                 int i;
6793                 /* the presence of an embedded qr// with code means
6794                  * we should always recompile: the text of the
6795                  * qr// may not have changed, but it may be a
6796                  * different closure than last time */
6797                 *recompile_p = 1;
6798                 if (pRExC_state->code_blocks) {
6799                     int new_count = pRExC_state->code_blocks->count
6800                             + ri->code_blocks->count;
6801                     Renew(pRExC_state->code_blocks->cb,
6802                             new_count, struct reg_code_block);
6803                     pRExC_state->code_blocks->count = new_count;
6804                 }
6805                 else
6806                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6807                                                     ri->code_blocks->count);
6808
6809                 for (i=0; i < ri->code_blocks->count; i++) {
6810                     struct reg_code_block *src, *dst;
6811                     STRLEN offset =  orig_patlen
6812                         + ReANY((REGEXP *)rx)->pre_prefix;
6813                     assert(n < pRExC_state->code_blocks->count);
6814                     src = &ri->code_blocks->cb[i];
6815                     dst = &pRExC_state->code_blocks->cb[n];
6816                     dst->start      = src->start + offset;
6817                     dst->end        = src->end   + offset;
6818                     dst->block      = src->block;
6819                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6820                                             src->src_regex
6821                                                 ? src->src_regex
6822                                                 : (REGEXP*)rx);
6823                     n++;
6824                 }
6825             }
6826         }
6827     }
6828     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6829     if (alloced)
6830         SvSETMAGIC(pat);
6831
6832     return pat;
6833 }
6834
6835
6836
6837 /* see if there are any run-time code blocks in the pattern.
6838  * False positives are allowed */
6839
6840 static bool
6841 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6842                     char *pat, STRLEN plen)
6843 {
6844     int n = 0;
6845     STRLEN s;
6846
6847     PERL_UNUSED_CONTEXT;
6848
6849     for (s = 0; s < plen; s++) {
6850         if (   pRExC_state->code_blocks
6851             && n < pRExC_state->code_blocks->count
6852             && s == pRExC_state->code_blocks->cb[n].start)
6853         {
6854             s = pRExC_state->code_blocks->cb[n].end;
6855             n++;
6856             continue;
6857         }
6858         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6859          * positives here */
6860         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6861             (pat[s+2] == '{'
6862                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6863         )
6864             return 1;
6865     }
6866     return 0;
6867 }
6868
6869 /* Handle run-time code blocks. We will already have compiled any direct
6870  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6871  * copy of it, but with any literal code blocks blanked out and
6872  * appropriate chars escaped; then feed it into
6873  *
6874  *    eval "qr'modified_pattern'"
6875  *
6876  * For example,
6877  *
6878  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6879  *
6880  * becomes
6881  *
6882  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6883  *
6884  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6885  * and merge them with any code blocks of the original regexp.
6886  *
6887  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6888  * instead, just save the qr and return FALSE; this tells our caller that
6889  * the original pattern needs upgrading to utf8.
6890  */
6891
6892 static bool
6893 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6894     char *pat, STRLEN plen)
6895 {
6896     SV *qr;
6897
6898     GET_RE_DEBUG_FLAGS_DECL;
6899
6900     if (pRExC_state->runtime_code_qr) {
6901         /* this is the second time we've been called; this should
6902          * only happen if the main pattern got upgraded to utf8
6903          * during compilation; re-use the qr we compiled first time
6904          * round (which should be utf8 too)
6905          */
6906         qr = pRExC_state->runtime_code_qr;
6907         pRExC_state->runtime_code_qr = NULL;
6908         assert(RExC_utf8 && SvUTF8(qr));
6909     }
6910     else {
6911         int n = 0;
6912         STRLEN s;
6913         char *p, *newpat;
6914         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
6915         SV *sv, *qr_ref;
6916         dSP;
6917
6918         /* determine how many extra chars we need for ' and \ escaping */
6919         for (s = 0; s < plen; s++) {
6920             if (pat[s] == '\'' || pat[s] == '\\')
6921                 newlen++;
6922         }
6923
6924         Newx(newpat, newlen, char);
6925         p = newpat;
6926         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6927
6928         for (s = 0; s < plen; s++) {
6929             if (   pRExC_state->code_blocks
6930                 && n < pRExC_state->code_blocks->count
6931                 && s == pRExC_state->code_blocks->cb[n].start)
6932             {
6933                 /* blank out literal code block so that they aren't
6934                  * recompiled: eg change from/to:
6935                  *     /(?{xyz})/
6936                  *     /(?=====)/
6937                  * and
6938                  *     /(??{xyz})/
6939                  *     /(?======)/
6940                  * and
6941                  *     /(?(?{xyz}))/
6942                  *     /(?(?=====))/
6943                 */
6944                 assert(pat[s]   == '(');
6945                 assert(pat[s+1] == '?');
6946                 *p++ = '(';
6947                 *p++ = '?';
6948                 s += 2;
6949                 while (s < pRExC_state->code_blocks->cb[n].end) {
6950                     *p++ = '=';
6951                     s++;
6952                 }
6953                 *p++ = ')';
6954                 n++;
6955                 continue;
6956             }
6957             if (pat[s] == '\'' || pat[s] == '\\')
6958                 *p++ = '\\';
6959             *p++ = pat[s];
6960         }
6961         *p++ = '\'';
6962         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
6963             *p++ = 'x';
6964             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
6965                 *p++ = 'x';
6966             }
6967         }
6968         *p++ = '\0';
6969         DEBUG_COMPILE_r({
6970             Perl_re_printf( aTHX_
6971                 "%sre-parsing pattern for runtime code:%s %s\n",
6972                 PL_colors[4], PL_colors[5], newpat);
6973         });
6974
6975         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6976         Safefree(newpat);
6977
6978         ENTER;
6979         SAVETMPS;
6980         save_re_context();
6981         PUSHSTACKi(PERLSI_REQUIRE);
6982         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6983          * parsing qr''; normally only q'' does this. It also alters
6984          * hints handling */
6985         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6986         SvREFCNT_dec_NN(sv);
6987         SPAGAIN;
6988         qr_ref = POPs;
6989         PUTBACK;
6990         {
6991             SV * const errsv = ERRSV;
6992             if (SvTRUE_NN(errsv))
6993                 /* use croak_sv ? */
6994                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
6995         }
6996         assert(SvROK(qr_ref));
6997         qr = SvRV(qr_ref);
6998         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6999         /* the leaving below frees the tmp qr_ref.
7000          * Give qr a life of its own */
7001         SvREFCNT_inc(qr);
7002         POPSTACK;
7003         FREETMPS;
7004         LEAVE;
7005
7006     }
7007
7008     if (!RExC_utf8 && SvUTF8(qr)) {
7009         /* first time through; the pattern got upgraded; save the
7010          * qr for the next time through */
7011         assert(!pRExC_state->runtime_code_qr);
7012         pRExC_state->runtime_code_qr = qr;
7013         return 0;
7014     }
7015
7016
7017     /* extract any code blocks within the returned qr//  */
7018
7019
7020     /* merge the main (r1) and run-time (r2) code blocks into one */
7021     {
7022         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7023         struct reg_code_block *new_block, *dst;
7024         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7025         int i1 = 0, i2 = 0;
7026         int r1c, r2c;
7027
7028         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7029         {
7030             SvREFCNT_dec_NN(qr);
7031             return 1;
7032         }
7033
7034         if (!r1->code_blocks)
7035             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7036
7037         r1c = r1->code_blocks->count;
7038         r2c = r2->code_blocks->count;
7039
7040         Newx(new_block, r1c + r2c, struct reg_code_block);
7041
7042         dst = new_block;
7043
7044         while (i1 < r1c || i2 < r2c) {
7045             struct reg_code_block *src;
7046             bool is_qr = 0;
7047
7048             if (i1 == r1c) {
7049                 src = &r2->code_blocks->cb[i2++];
7050                 is_qr = 1;
7051             }
7052             else if (i2 == r2c)
7053                 src = &r1->code_blocks->cb[i1++];
7054             else if (  r1->code_blocks->cb[i1].start
7055                      < r2->code_blocks->cb[i2].start)
7056             {
7057                 src = &r1->code_blocks->cb[i1++];
7058                 assert(src->end < r2->code_blocks->cb[i2].start);
7059             }
7060             else {
7061                 assert(  r1->code_blocks->cb[i1].start
7062                        > r2->code_blocks->cb[i2].start);
7063                 src = &r2->code_blocks->cb[i2++];
7064                 is_qr = 1;
7065                 assert(src->end < r1->code_blocks->cb[i1].start);
7066             }
7067
7068             assert(pat[src->start] == '(');
7069             assert(pat[src->end]   == ')');
7070             dst->start      = src->start;
7071             dst->end        = src->end;
7072             dst->block      = src->block;
7073             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7074                                     : src->src_regex;
7075             dst++;
7076         }
7077         r1->code_blocks->count += r2c;
7078         Safefree(r1->code_blocks->cb);
7079         r1->code_blocks->cb = new_block;
7080     }
7081
7082     SvREFCNT_dec_NN(qr);
7083     return 1;
7084 }
7085
7086
7087 STATIC bool
7088 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7089                       struct reg_substr_datum  *rsd,
7090                       struct scan_data_substrs *sub,
7091                       STRLEN longest_length)
7092 {
7093     /* This is the common code for setting up the floating and fixed length
7094      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
7095      * as to whether succeeded or not */
7096
7097     I32 t;
7098     SSize_t ml;
7099     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
7100     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7101
7102     if (! (longest_length
7103            || (eol /* Can't have SEOL and MULTI */
7104                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7105           )
7106             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7107         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7108     {
7109         return FALSE;
7110     }
7111
7112     /* copy the information about the longest from the reg_scan_data
7113         over to the program. */
7114     if (SvUTF8(sub->str)) {
7115         rsd->substr      = NULL;
7116         rsd->utf8_substr = sub->str;
7117     } else {
7118         rsd->substr      = sub->str;
7119         rsd->utf8_substr = NULL;
7120     }
7121     /* end_shift is how many chars that must be matched that
7122         follow this item. We calculate it ahead of time as once the
7123         lookbehind offset is added in we lose the ability to correctly
7124         calculate it.*/
7125     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7126     rsd->end_shift = ml - sub->min_offset
7127         - longest_length
7128             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7129              * intead? - DAPM
7130             + (SvTAIL(sub->str) != 0)
7131             */
7132         + sub->lookbehind;
7133
7134     t = (eol/* Can't have SEOL and MULTI */
7135          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7136     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7137
7138     return TRUE;
7139 }
7140
7141 STATIC void
7142 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7143 {
7144     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7145      * properly wrapped with the right modifiers */
7146
7147     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7148     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7149                                                 != REGEX_DEPENDS_CHARSET);
7150
7151     /* The caret is output if there are any defaults: if not all the STD
7152         * flags are set, or if no character set specifier is needed */
7153     bool has_default =
7154                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7155                 || ! has_charset);
7156     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7157                                                 == REG_RUN_ON_COMMENT_SEEN);
7158     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7159                         >> RXf_PMf_STD_PMMOD_SHIFT);
7160     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7161     char *p;
7162     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7163
7164     /* We output all the necessary flags; we never output a minus, as all
7165         * those are defaults, so are
7166         * covered by the caret */
7167     const STRLEN wraplen = pat_len + has_p + has_runon
7168         + has_default       /* If needs a caret */
7169         + PL_bitcount[reganch] /* 1 char for each set standard flag */
7170
7171             /* If needs a character set specifier */
7172         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7173         + (sizeof("(?:)") - 1);
7174
7175     PERL_ARGS_ASSERT_SET_REGEX_PV;
7176
7177     /* make sure PL_bitcount bounds not exceeded */
7178     assert(sizeof(STD_PAT_MODS) <= 8);
7179
7180     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7181     SvPOK_on(Rx);
7182     if (RExC_utf8)
7183         SvFLAGS(Rx) |= SVf_UTF8;
7184     *p++='('; *p++='?';
7185
7186     /* If a default, cover it using the caret */
7187     if (has_default) {
7188         *p++= DEFAULT_PAT_MOD;
7189     }
7190     if (has_charset) {
7191         STRLEN len;
7192         const char* name;
7193
7194         name = get_regex_charset_name(RExC_rx->extflags, &len);
7195         if strEQ(name, DEPENDS_PAT_MODS) {  /* /d under UTF-8 => /u */
7196             assert(RExC_utf8);
7197             name = UNICODE_PAT_MODS;
7198             len = sizeof(UNICODE_PAT_MODS) - 1;
7199         }
7200         Copy(name, p, len, char);
7201         p += len;
7202     }
7203     if (has_p)
7204         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7205     {
7206         char ch;
7207         while((ch = *fptr++)) {
7208             if(reganch & 1)
7209                 *p++ = ch;
7210             reganch >>= 1;
7211         }
7212     }
7213
7214     *p++ = ':';
7215     Copy(RExC_precomp, p, pat_len, char);
7216     assert ((RX_WRAPPED(Rx) - p) < 16);
7217     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7218     p += pat_len;
7219
7220     /* Adding a trailing \n causes this to compile properly:
7221             my $R = qr / A B C # D E/x; /($R)/
7222         Otherwise the parens are considered part of the comment */
7223     if (has_runon)
7224         *p++ = '\n';
7225     *p++ = ')';
7226     *p = 0;
7227     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7228 }
7229
7230 /*
7231  * Perl_re_op_compile - the perl internal RE engine's function to compile a
7232  * regular expression into internal code.
7233  * The pattern may be passed either as:
7234  *    a list of SVs (patternp plus pat_count)
7235  *    a list of OPs (expr)
7236  * If both are passed, the SV list is used, but the OP list indicates
7237  * which SVs are actually pre-compiled code blocks
7238  *
7239  * The SVs in the list have magic and qr overloading applied to them (and
7240  * the list may be modified in-place with replacement SVs in the latter
7241  * case).
7242  *
7243  * If the pattern hasn't changed from old_re, then old_re will be
7244  * returned.
7245  *
7246  * eng is the current engine. If that engine has an op_comp method, then
7247  * handle directly (i.e. we assume that op_comp was us); otherwise, just
7248  * do the initial concatenation of arguments and pass on to the external
7249  * engine.
7250  *
7251  * If is_bare_re is not null, set it to a boolean indicating whether the
7252  * arg list reduced (after overloading) to a single bare regex which has
7253  * been returned (i.e. /$qr/).
7254  *
7255  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7256  *
7257  * pm_flags contains the PMf_* flags, typically based on those from the
7258  * pm_flags field of the related PMOP. Currently we're only interested in
7259  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
7260  *
7261  * For many years this code had an initial sizing pass that calculated
7262  * (sometimes incorrectly, leading to security holes) the size needed for the
7263  * compiled pattern.  That was changed by commit
7264  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7265  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
7266  * references to this sizing pass.
7267  *
7268  * Now, an initial crude guess as to the size needed is made, based on the
7269  * length of the pattern.  Patches welcome to improve that guess.  That amount
7270  * of space is malloc'd and then immediately freed, and then clawed back node
7271  * by node.  This design is to minimze, to the extent possible, memory churn
7272  * when doing the the reallocs.
7273  *
7274  * A separate parentheses counting pass may be needed in some cases.
7275  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
7276  * of these cases.
7277  *
7278  * The existence of a sizing pass necessitated design decisions that are no
7279  * longer needed.  There are potential areas of simplification.
7280  *
7281  * Beware that the optimization-preparation code in here knows about some
7282  * of the structure of the compiled regexp.  [I'll say.]
7283  */
7284
7285 REGEXP *
7286 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7287                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
7288                      bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7289 {
7290     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
7291     STRLEN plen;
7292     char *exp;
7293     regnode *scan;
7294     I32 flags;
7295     SSize_t minlen = 0;
7296     U32 rx_flags;
7297     SV *pat;
7298     SV** new_patternp = patternp;
7299
7300     /* these are all flags - maybe they should be turned
7301      * into a single int with different bit masks */
7302     I32 sawlookahead = 0;
7303     I32 sawplus = 0;
7304     I32 sawopen = 0;
7305     I32 sawminmod = 0;
7306
7307     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7308     bool recompile = 0;
7309     bool runtime_code = 0;
7310     scan_data_t data;
7311     RExC_state_t RExC_state;
7312     RExC_state_t * const pRExC_state = &RExC_state;
7313 #ifdef TRIE_STUDY_OPT
7314     int restudied = 0;
7315     RExC_state_t copyRExC_state;
7316 #endif
7317     GET_RE_DEBUG_FLAGS_DECL;
7318
7319     PERL_ARGS_ASSERT_RE_OP_COMPILE;
7320
7321     DEBUG_r(if (!PL_colorset) reginitcolors());
7322
7323     /* Initialize these here instead of as-needed, as is quick and avoids
7324      * having to test them each time otherwise */
7325     if (! PL_InBitmap) {
7326 #ifdef DEBUGGING
7327         char * dump_len_string;
7328 #endif
7329
7330         /* This is calculated here, because the Perl program that generates the
7331          * static global ones doesn't currently have access to
7332          * NUM_ANYOF_CODE_POINTS */
7333         PL_InBitmap = _new_invlist(2);
7334         PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
7335                                                     NUM_ANYOF_CODE_POINTS - 1);
7336 #ifdef DEBUGGING
7337         dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
7338         if (   ! dump_len_string
7339             || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
7340         {
7341             PL_dump_re_max_len = 60;    /* A reasonable default */
7342         }
7343 #endif
7344     }
7345
7346     pRExC_state->warn_text = NULL;
7347     pRExC_state->code_blocks = NULL;
7348
7349     if (is_bare_re)
7350         *is_bare_re = FALSE;
7351
7352     if (expr && (expr->op_type == OP_LIST ||
7353                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7354         /* allocate code_blocks if needed */
7355         OP *o;
7356         int ncode = 0;
7357
7358         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7359             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7360                 ncode++; /* count of DO blocks */
7361
7362         if (ncode)
7363             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7364     }
7365
7366     if (!pat_count) {
7367         /* compile-time pattern with just OP_CONSTs and DO blocks */
7368
7369         int n;
7370         OP *o;
7371
7372         /* find how many CONSTs there are */
7373         assert(expr);
7374         n = 0;
7375         if (expr->op_type == OP_CONST)
7376             n = 1;
7377         else
7378             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7379                 if (o->op_type == OP_CONST)
7380                     n++;
7381             }
7382
7383         /* fake up an SV array */
7384
7385         assert(!new_patternp);
7386         Newx(new_patternp, n, SV*);
7387         SAVEFREEPV(new_patternp);
7388         pat_count = n;
7389
7390         n = 0;
7391         if (expr->op_type == OP_CONST)
7392             new_patternp[n] = cSVOPx_sv(expr);
7393         else
7394             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7395                 if (o->op_type == OP_CONST)
7396                     new_patternp[n++] = cSVOPo_sv;
7397             }
7398
7399     }
7400
7401     DEBUG_PARSE_r(Perl_re_printf( aTHX_
7402         "Assembling pattern from %d elements%s\n", pat_count,
7403             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7404
7405     /* set expr to the first arg op */
7406
7407     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7408          && expr->op_type != OP_CONST)
7409     {
7410             expr = cLISTOPx(expr)->op_first;
7411             assert(   expr->op_type == OP_PUSHMARK
7412                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7413                    || expr->op_type == OP_PADRANGE);
7414             expr = OpSIBLING(expr);
7415     }
7416
7417     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7418                         expr, &recompile, NULL);
7419
7420     /* handle bare (possibly after overloading) regex: foo =~ $re */
7421     {
7422         SV *re = pat;
7423         if (SvROK(re))
7424             re = SvRV(re);
7425         if (SvTYPE(re) == SVt_REGEXP) {
7426             if (is_bare_re)
7427                 *is_bare_re = TRUE;
7428             SvREFCNT_inc(re);
7429             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7430                 "Precompiled pattern%s\n",
7431                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7432
7433             return (REGEXP*)re;
7434         }
7435     }
7436
7437     exp = SvPV_nomg(pat, plen);
7438
7439     if (!eng->op_comp) {
7440         if ((SvUTF8(pat) && IN_BYTES)
7441                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7442         {
7443             /* make a temporary copy; either to convert to bytes,
7444              * or to avoid repeating get-magic / overloaded stringify */
7445             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7446                                         (IN_BYTES ? 0 : SvUTF8(pat)));
7447         }
7448         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7449     }
7450
7451     /* ignore the utf8ness if the pattern is 0 length */
7452     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7453     RExC_uni_semantics = 0;
7454     RExC_contains_locale = 0;
7455     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7456     RExC_in_script_run = 0;
7457     RExC_study_started = 0;
7458     pRExC_state->runtime_code_qr = NULL;
7459     RExC_frame_head= NULL;
7460     RExC_frame_last= NULL;
7461     RExC_frame_count= 0;
7462     RExC_latest_warn_offset = 0;
7463     RExC_use_BRANCHJ = 0;
7464     RExC_total_parens = 0;
7465     RExC_open_parens = NULL;
7466     RExC_close_parens = NULL;
7467     RExC_paren_names = NULL;
7468     RExC_size = 0;
7469     RExC_seen_d_op = FALSE;
7470 #ifdef DEBUGGING
7471     RExC_paren_name_list = NULL;
7472 #endif
7473
7474     DEBUG_r({
7475         RExC_mysv1= sv_newmortal();
7476         RExC_mysv2= sv_newmortal();
7477     });
7478
7479     DEBUG_COMPILE_r({
7480             SV *dsv= sv_newmortal();
7481             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7482             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7483                           PL_colors[4], PL_colors[5], s);
7484         });
7485
7486     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7487      * to utf8 */
7488
7489     if ((pm_flags & PMf_USE_RE_EVAL)
7490                 /* this second condition covers the non-regex literal case,
7491                  * i.e.  $foo =~ '(?{})'. */
7492                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7493     )
7494         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7495
7496   redo_parse:
7497     /* return old regex if pattern hasn't changed */
7498     /* XXX: note in the below we have to check the flags as well as the
7499      * pattern.
7500      *
7501      * Things get a touch tricky as we have to compare the utf8 flag
7502      * independently from the compile flags.  */
7503
7504     if (   old_re
7505         && !recompile
7506         && !!RX_UTF8(old_re) == !!RExC_utf8
7507         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7508         && RX_PRECOMP(old_re)
7509         && RX_PRELEN(old_re) == plen
7510         && memEQ(RX_PRECOMP(old_re), exp, plen)
7511         && !runtime_code /* with runtime code, always recompile */ )
7512     {
7513         return old_re;
7514     }
7515
7516     /* Allocate the pattern's SV */
7517     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7518     RExC_rx = ReANY(Rx);
7519     if ( RExC_rx == NULL )
7520         FAIL("Regexp out of space");
7521
7522     rx_flags = orig_rx_flags;
7523
7524     if (   (UTF || RExC_uni_semantics)
7525         && initial_charset == REGEX_DEPENDS_CHARSET)
7526     {
7527
7528         /* Set to use unicode semantics if the pattern is in utf8 and has the
7529          * 'depends' charset specified, as it means unicode when utf8  */
7530         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7531         RExC_uni_semantics = 1;
7532     }
7533
7534     RExC_pm_flags = pm_flags;
7535
7536     if (runtime_code) {
7537         assert(TAINTING_get || !TAINT_get);
7538         if (TAINT_get)
7539             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7540
7541         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7542             /* whoops, we have a non-utf8 pattern, whilst run-time code
7543              * got compiled as utf8. Try again with a utf8 pattern */
7544             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7545                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7546             goto redo_parse;
7547         }
7548     }
7549     assert(!pRExC_state->runtime_code_qr);
7550
7551     RExC_sawback = 0;
7552
7553     RExC_seen = 0;
7554     RExC_maxlen = 0;
7555     RExC_in_lookbehind = 0;
7556     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7557 #ifdef EBCDIC
7558     RExC_recode_x_to_native = 0;
7559 #endif
7560     RExC_in_multi_char_class = 0;
7561
7562     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7563     RExC_precomp_end = RExC_end = exp + plen;
7564     RExC_nestroot = 0;
7565     RExC_whilem_seen = 0;
7566     RExC_end_op = NULL;
7567     RExC_recurse = NULL;
7568     RExC_study_chunk_recursed = NULL;
7569     RExC_study_chunk_recursed_bytes= 0;
7570     RExC_recurse_count = 0;
7571     pRExC_state->code_index = 0;
7572
7573     /* Initialize the string in the compiled pattern.  This is so that there is
7574      * something to output if necessary */
7575     set_regex_pv(pRExC_state, Rx);
7576
7577     DEBUG_PARSE_r({
7578         Perl_re_printf( aTHX_
7579             "Starting parse and generation\n");
7580         RExC_lastnum=0;
7581         RExC_lastparse=NULL;
7582     });
7583
7584     /* Allocate space and zero-initialize. Note, the two step process
7585        of zeroing when in debug mode, thus anything assigned has to
7586        happen after that */
7587     if (!  RExC_size) {
7588
7589         /* On the first pass of the parse, we guess how big this will be.  Then
7590          * we grow in one operation to that amount and then give it back.  As
7591          * we go along, we re-allocate what we need.
7592          *
7593          * XXX Currently the guess is essentially that the pattern will be an
7594          * EXACT node with one byte input, one byte output.  This is crude, and
7595          * better heuristics are welcome.
7596          *
7597          * On any subsequent passes, we guess what we actually computed in the
7598          * latest earlier pass.  Such a pass probably didn't complete so is
7599          * missing stuff.  We could improve those guesses by knowing where the
7600          * parse stopped, and use the length so far plus apply the above
7601          * assumption to what's left. */
7602         RExC_size = STR_SZ(RExC_end - RExC_start);
7603     }
7604
7605     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7606     if ( RExC_rxi == NULL )
7607         FAIL("Regexp out of space");
7608
7609     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7610     RXi_SET( RExC_rx, RExC_rxi );
7611
7612     /* We start from 0 (over from 0 in the case this is a reparse.  The first
7613      * node parsed will give back any excess memory we have allocated so far).
7614      * */
7615     RExC_size = 0;
7616
7617     /* non-zero initialization begins here */
7618     RExC_rx->engine= eng;
7619     RExC_rx->extflags = rx_flags;
7620     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7621
7622     if (pm_flags & PMf_IS_QR) {
7623         RExC_rxi->code_blocks = pRExC_state->code_blocks;
7624         if (RExC_rxi->code_blocks) {
7625             RExC_rxi->code_blocks->refcnt++;
7626         }
7627     }
7628
7629     RExC_rx->intflags = 0;
7630
7631     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7632     RExC_parse = exp;
7633
7634     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7635      * code makes sure the final byte is an uncounted NUL.  But should this
7636      * ever not be the case, lots of things could read beyond the end of the
7637      * buffer: loops like
7638      *      while(isFOO(*RExC_parse)) RExC_parse++;
7639      *      strchr(RExC_parse, "foo");
7640      * etc.  So it is worth noting. */
7641     assert(*RExC_end == '\0');
7642
7643     RExC_naughty = 0;
7644     RExC_npar = 1;
7645     RExC_emit_start = RExC_rxi->program;
7646     pRExC_state->code_index = 0;
7647
7648     *((char*) RExC_emit_start) = (char) REG_MAGIC;
7649     RExC_emit = 1;
7650
7651     /* Do the parse */
7652     if (reg(pRExC_state, 0, &flags, 1)) {
7653
7654         /* Success!, But if RExC_total_parens < 0, we need to redo the parse
7655          * knowing how many parens there actually are */
7656         if (RExC_total_parens < 0) {
7657             flags |= RESTART_PARSE;
7658         }
7659
7660         /* We have that number in RExC_npar */
7661         RExC_total_parens = RExC_npar;
7662     }
7663     else if (! MUST_RESTART(flags)) {
7664         ReREFCNT_dec(Rx);
7665         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7666     }
7667
7668     /* Here, we either have success, or we have to redo the parse for some reason */
7669     if (MUST_RESTART(flags)) {
7670
7671         /* It's possible to write a regexp in ascii that represents Unicode
7672         codepoints outside of the byte range, such as via \x{100}. If we
7673         detect such a sequence we have to convert the entire pattern to utf8
7674         and then recompile, as our sizing calculation will have been based
7675         on 1 byte == 1 character, but we will need to use utf8 to encode
7676         at least some part of the pattern, and therefore must convert the whole
7677         thing.
7678         -- dmq */
7679         if (flags & NEED_UTF8) {
7680
7681             /* We have stored the offset of the final warning output so far.
7682              * That must be adjusted.  Any variant characters between the start
7683              * of the pattern and this warning count for 2 bytes in the final,
7684              * so just add them again */
7685             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7686                 RExC_latest_warn_offset +=
7687                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
7688                                                 + RExC_latest_warn_offset);
7689             }
7690             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7691             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7692             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7693         }
7694         else {
7695             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7696         }
7697
7698         if (RExC_total_parens > 0) {
7699             /* Make enough room for all the known parens, and zero it */
7700             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7701             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7702             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
7703
7704             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7705             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7706         }
7707         else { /* Parse did not complete.  Reinitialize the parentheses
7708                   structures */
7709             RExC_total_parens = 0;
7710             if (RExC_open_parens) {
7711                 Safefree(RExC_open_parens);
7712                 RExC_open_parens = NULL;
7713             }
7714             if (RExC_close_parens) {
7715                 Safefree(RExC_close_parens);
7716                 RExC_close_parens = NULL;
7717             }
7718         }
7719
7720         /* Clean up what we did in this parse */
7721         SvREFCNT_dec_NN(RExC_rx_sv);
7722
7723         goto redo_parse;
7724     }
7725
7726     /* Here, we have successfully parsed and generated the pattern's program
7727      * for the regex engine.  We are ready to finish things up and look for
7728      * optimizations. */
7729
7730     /* Update the string to compile, with correct modifiers, etc */
7731     set_regex_pv(pRExC_state, Rx);
7732
7733     RExC_rx->nparens = RExC_total_parens - 1;
7734
7735     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7736     if (RExC_whilem_seen > 15)
7737         RExC_whilem_seen = 15;
7738
7739     DEBUG_PARSE_r({
7740         Perl_re_printf( aTHX_
7741             "Required size %" IVdf " nodes\n", (IV)RExC_size);
7742         RExC_lastnum=0;
7743         RExC_lastparse=NULL;
7744     });
7745
7746 #ifdef RE_TRACK_PATTERN_OFFSETS
7747     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7748                           "%s %" UVuf " bytes for offset annotations.\n",
7749                           RExC_offsets ? "Got" : "Couldn't get",
7750                           (UV)((RExC_offsets[0] * 2 + 1))));
7751     DEBUG_OFFSETS_r(if (RExC_offsets) {
7752         const STRLEN len = RExC_offsets[0];
7753         STRLEN i;
7754         GET_RE_DEBUG_FLAGS_DECL;
7755         Perl_re_printf( aTHX_
7756                       "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7757         for (i = 1; i <= len; i++) {
7758             if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7759                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7760                 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7761         }
7762         Perl_re_printf( aTHX_  "\n");
7763     });
7764
7765 #else
7766     SetProgLen(RExC_rxi,RExC_size);
7767 #endif
7768
7769     DEBUG_OPTIMISE_r(
7770         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7771     );
7772
7773     /* XXXX To minimize changes to RE engine we always allocate
7774        3-units-long substrs field. */
7775     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7776     if (RExC_recurse_count) {
7777         Newx(RExC_recurse, RExC_recurse_count, regnode *);
7778         SAVEFREEPV(RExC_recurse);
7779     }
7780
7781     if (RExC_seen & REG_RECURSE_SEEN) {
7782         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
7783          * So its 1 if there are no parens. */
7784         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
7785                                          ((RExC_total_parens & 0x07) != 0);
7786         Newx(RExC_study_chunk_recursed,
7787              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7788         SAVEFREEPV(RExC_study_chunk_recursed);
7789     }
7790
7791   reStudy:
7792     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7793     DEBUG_r(
7794         RExC_study_chunk_recursed_count= 0;
7795     );
7796     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
7797     if (RExC_study_chunk_recursed) {
7798         Zero(RExC_study_chunk_recursed,
7799              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7800     }
7801
7802
7803 #ifdef TRIE_STUDY_OPT
7804     if (!restudied) {
7805         StructCopy(&zero_scan_data, &data, scan_data_t);
7806         copyRExC_state = RExC_state;
7807     } else {
7808         U32 seen=RExC_seen;
7809         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7810
7811         RExC_state = copyRExC_state;
7812         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7813             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7814         else
7815             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7816         StructCopy(&zero_scan_data, &data, scan_data_t);
7817     }
7818 #else
7819     StructCopy(&zero_scan_data, &data, scan_data_t);
7820 #endif
7821
7822     /* Dig out information for optimizations. */
7823     RExC_rx->extflags = RExC_flags; /* was pm_op */
7824     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7825
7826     if (UTF)
7827         SvUTF8_on(Rx);  /* Unicode in it? */
7828     RExC_rxi->regstclass = NULL;
7829     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
7830         RExC_rx->intflags |= PREGf_NAUGHTY;
7831     scan = RExC_rxi->program + 1;               /* First BRANCH. */
7832
7833     /* testing for BRANCH here tells us whether there is "must appear"
7834        data in the pattern. If there is then we can use it for optimisations */
7835     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7836                                                   */
7837         SSize_t fake;
7838         STRLEN longest_length[2];
7839         regnode_ssc ch_class; /* pointed to by data */
7840         int stclass_flag;
7841         SSize_t last_close = 0; /* pointed to by data */
7842         regnode *first= scan;
7843         regnode *first_next= regnext(first);
7844         int i;
7845
7846         /*
7847          * Skip introductions and multiplicators >= 1
7848          * so that we can extract the 'meat' of the pattern that must
7849          * match in the large if() sequence following.
7850          * NOTE that EXACT is NOT covered here, as it is normally
7851          * picked up by the optimiser separately.
7852          *
7853          * This is unfortunate as the optimiser isnt handling lookahead
7854          * properly currently.
7855          *
7856          */
7857         while ((OP(first) == OPEN && (sawopen = 1)) ||
7858                /* An OR of *one* alternative - should not happen now. */
7859             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7860             /* for now we can't handle lookbehind IFMATCH*/
7861             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7862             (OP(first) == PLUS) ||
7863             (OP(first) == MINMOD) ||
7864                /* An {n,m} with n>0 */
7865             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7866             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7867         {
7868                 /*
7869                  * the only op that could be a regnode is PLUS, all the rest
7870                  * will be regnode_1 or regnode_2.
7871                  *
7872                  * (yves doesn't think this is true)
7873                  */
7874                 if (OP(first) == PLUS)
7875                     sawplus = 1;
7876                 else {
7877                     if (OP(first) == MINMOD)
7878                         sawminmod = 1;
7879                     first += regarglen[OP(first)];
7880                 }
7881                 first = NEXTOPER(first);
7882                 first_next= regnext(first);
7883         }
7884
7885         /* Starting-point info. */
7886       again:
7887         DEBUG_PEEP("first:", first, 0, 0);
7888         /* Ignore EXACT as we deal with it later. */
7889         if (PL_regkind[OP(first)] == EXACT) {
7890             if (   OP(first) == EXACT
7891                 || OP(first) == EXACT_ONLY8
7892                 || OP(first) == EXACTL)
7893             {
7894                 NOOP;   /* Empty, get anchored substr later. */
7895             }
7896             else
7897                 RExC_rxi->regstclass = first;
7898         }
7899 #ifdef TRIE_STCLASS
7900         else if (PL_regkind[OP(first)] == TRIE &&
7901                 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
7902         {
7903             /* this can happen only on restudy */
7904             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7905         }
7906 #endif
7907         else if (REGNODE_SIMPLE(OP(first)))
7908             RExC_rxi->regstclass = first;
7909         else if (PL_regkind[OP(first)] == BOUND ||
7910                  PL_regkind[OP(first)] == NBOUND)
7911             RExC_rxi->regstclass = first;
7912         else if (PL_regkind[OP(first)] == BOL) {
7913             RExC_rx->intflags |= (OP(first) == MBOL
7914                            ? PREGf_ANCH_MBOL
7915                            : PREGf_ANCH_SBOL);
7916             first = NEXTOPER(first);
7917             goto again;
7918         }
7919         else if (OP(first) == GPOS) {
7920             RExC_rx->intflags |= PREGf_ANCH_GPOS;
7921             first = NEXTOPER(first);
7922             goto again;
7923         }
7924         else if ((!sawopen || !RExC_sawback) &&
7925             !sawlookahead &&
7926             (OP(first) == STAR &&
7927             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7928             !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
7929         {
7930             /* turn .* into ^.* with an implied $*=1 */
7931             const int type =
7932                 (OP(NEXTOPER(first)) == REG_ANY)
7933                     ? PREGf_ANCH_MBOL
7934                     : PREGf_ANCH_SBOL;
7935             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
7936             first = NEXTOPER(first);
7937             goto again;
7938         }
7939         if (sawplus && !sawminmod && !sawlookahead
7940             && (!sawopen || !RExC_sawback)
7941             && !pRExC_state->code_blocks) /* May examine pos and $& */
7942             /* x+ must match at the 1st pos of run of x's */
7943             RExC_rx->intflags |= PREGf_SKIP;
7944
7945         /* Scan is after the zeroth branch, first is atomic matcher. */
7946 #ifdef TRIE_STUDY_OPT
7947         DEBUG_PARSE_r(
7948             if (!restudied)
7949                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
7950                               (IV)(first - scan + 1))
7951         );
7952 #else
7953         DEBUG_PARSE_r(
7954             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
7955                 (IV)(first - scan + 1))
7956         );
7957 #endif
7958
7959
7960         /*
7961         * If there's something expensive in the r.e., find the
7962         * longest literal string that must appear and make it the
7963         * regmust.  Resolve ties in favor of later strings, since
7964         * the regstart check works with the beginning of the r.e.
7965         * and avoiding duplication strengthens checking.  Not a
7966         * strong reason, but sufficient in the absence of others.
7967         * [Now we resolve ties in favor of the earlier string if
7968         * it happens that c_offset_min has been invalidated, since the
7969         * earlier string may buy us something the later one won't.]
7970         */
7971
7972         data.substrs[0].str = newSVpvs("");
7973         data.substrs[1].str = newSVpvs("");
7974         data.last_found = newSVpvs("");
7975         data.cur_is_floating = 0; /* initially any found substring is fixed */
7976         ENTER_with_name("study_chunk");
7977         SAVEFREESV(data.substrs[0].str);
7978         SAVEFREESV(data.substrs[1].str);
7979         SAVEFREESV(data.last_found);
7980         first = scan;
7981         if (!RExC_rxi->regstclass) {
7982             ssc_init(pRExC_state, &ch_class);
7983             data.start_class = &ch_class;
7984             stclass_flag = SCF_DO_STCLASS_AND;
7985         } else                          /* XXXX Check for BOUND? */
7986             stclass_flag = 0;
7987         data.last_closep = &last_close;
7988
7989         DEBUG_RExC_seen();
7990         /*
7991          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
7992          * (NO top level branches)
7993          */
7994         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7995                              scan + RExC_size, /* Up to end */
7996             &data, -1, 0, NULL,
7997             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7998                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7999             0);
8000
8001
8002         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8003
8004
8005         if ( RExC_total_parens == 1 && !data.cur_is_floating
8006              && data.last_start_min == 0 && data.last_end > 0
8007              && !RExC_seen_zerolen
8008              && !(RExC_seen & REG_VERBARG_SEEN)
8009              && !(RExC_seen & REG_GPOS_SEEN)
8010         ){
8011             RExC_rx->extflags |= RXf_CHECK_ALL;
8012         }
8013         scan_commit(pRExC_state, &data,&minlen, 0);
8014
8015
8016         /* XXX this is done in reverse order because that's the way the
8017          * code was before it was parameterised. Don't know whether it
8018          * actually needs doing in reverse order. DAPM */
8019         for (i = 1; i >= 0; i--) {
8020             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8021
8022             if (   !(   i
8023                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
8024                      &&    data.substrs[0].min_offset
8025                         == data.substrs[1].min_offset
8026                      &&    SvCUR(data.substrs[0].str)
8027                         == SvCUR(data.substrs[1].str)
8028                     )
8029                 && S_setup_longest (aTHX_ pRExC_state,
8030                                         &(RExC_rx->substrs->data[i]),
8031                                         &(data.substrs[i]),
8032                                         longest_length[i]))
8033             {
8034                 RExC_rx->substrs->data[i].min_offset =
8035                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
8036
8037                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8038                 /* Don't offset infinity */
8039                 if (data.substrs[i].max_offset < SSize_t_MAX)
8040                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8041                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8042             }
8043             else {
8044                 RExC_rx->substrs->data[i].substr      = NULL;
8045                 RExC_rx->substrs->data[i].utf8_substr = NULL;
8046                 longest_length[i] = 0;
8047             }
8048         }
8049
8050         LEAVE_with_name("study_chunk");
8051
8052         if (RExC_rxi->regstclass
8053             && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8054             RExC_rxi->regstclass = NULL;
8055
8056         if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8057               || RExC_rx->substrs->data[0].min_offset)
8058             && stclass_flag
8059             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8060             && is_ssc_worth_it(pRExC_state, data.start_class))
8061         {
8062             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8063
8064             ssc_finalize(pRExC_state, data.start_class);
8065
8066             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8067             StructCopy(data.start_class,
8068                        (regnode_ssc*)RExC_rxi->data->data[n],
8069                        regnode_ssc);
8070             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8071             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8072             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8073                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8074                       Perl_re_printf( aTHX_
8075                                     "synthetic stclass \"%s\".\n",
8076                                     SvPVX_const(sv));});
8077             data.start_class = NULL;
8078         }
8079
8080         /* A temporary algorithm prefers floated substr to fixed one of
8081          * same length to dig more info. */
8082         i = (longest_length[0] <= longest_length[1]);
8083         RExC_rx->substrs->check_ix = i;
8084         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
8085         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
8086         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
8087         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8088         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8089         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8090             RExC_rx->intflags |= PREGf_NOSCAN;
8091
8092         if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8093             RExC_rx->extflags |= RXf_USE_INTUIT;
8094             if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8095                 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8096         }
8097
8098         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8099         if ( (STRLEN)minlen < longest_length[1] )
8100             minlen= longest_length[1];
8101         if ( (STRLEN)minlen < longest_length[0] )
8102             minlen= longest_length[0];
8103         */
8104     }
8105     else {
8106         /* Several toplevels. Best we can is to set minlen. */
8107         SSize_t fake;
8108         regnode_ssc ch_class;
8109         SSize_t last_close = 0;
8110
8111         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
8112
8113         scan = RExC_rxi->program + 1;
8114         ssc_init(pRExC_state, &ch_class);
8115         data.start_class = &ch_class;
8116         data.last_closep = &last_close;
8117
8118         DEBUG_RExC_seen();
8119         /*
8120          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8121          * (patterns WITH top level branches)
8122          */
8123         minlen = study_chunk(pRExC_state,
8124             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8125             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8126                                                       ? SCF_TRIE_DOING_RESTUDY
8127                                                       : 0),
8128             0);
8129
8130         CHECK_RESTUDY_GOTO_butfirst(NOOP);
8131
8132         RExC_rx->check_substr = NULL;
8133         RExC_rx->check_utf8 = NULL;
8134         RExC_rx->substrs->data[0].substr      = NULL;
8135         RExC_rx->substrs->data[0].utf8_substr = NULL;
8136         RExC_rx->substrs->data[1].substr      = NULL;
8137         RExC_rx->substrs->data[1].utf8_substr = NULL;
8138
8139         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8140             && is_ssc_worth_it(pRExC_state, data.start_class))
8141         {
8142             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8143
8144             ssc_finalize(pRExC_state, data.start_class);
8145
8146             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8147             StructCopy(data.start_class,
8148                        (regnode_ssc*)RExC_rxi->data->data[n],
8149                        regnode_ssc);
8150             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8151             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8152             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8153                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8154                       Perl_re_printf( aTHX_
8155                                     "synthetic stclass \"%s\".\n",
8156                                     SvPVX_const(sv));});
8157             data.start_class = NULL;
8158         }
8159     }
8160
8161     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8162         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8163         RExC_rx->maxlen = REG_INFTY;
8164     }
8165     else {
8166         RExC_rx->maxlen = RExC_maxlen;
8167     }
8168
8169     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8170        the "real" pattern. */
8171     DEBUG_OPTIMISE_r({
8172         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8173                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8174     });
8175     RExC_rx->minlenret = minlen;
8176     if (RExC_rx->minlen < minlen)
8177         RExC_rx->minlen = minlen;
8178
8179     if (RExC_seen & REG_RECURSE_SEEN ) {
8180         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8181         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8182     }
8183     if (RExC_seen & REG_GPOS_SEEN)
8184         RExC_rx->intflags |= PREGf_GPOS_SEEN;
8185     if (RExC_seen & REG_LOOKBEHIND_SEEN)
8186         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8187                                                 lookbehind */
8188     if (pRExC_state->code_blocks)
8189         RExC_rx->extflags |= RXf_EVAL_SEEN;
8190     if (RExC_seen & REG_VERBARG_SEEN)
8191     {
8192         RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8193         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8194     }
8195     if (RExC_seen & REG_CUTGROUP_SEEN)
8196         RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8197     if (pm_flags & PMf_USE_RE_EVAL)
8198         RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8199     if (RExC_paren_names)
8200         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8201     else
8202         RXp_PAREN_NAMES(RExC_rx) = NULL;
8203
8204     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8205      * so it can be used in pp.c */
8206     if (RExC_rx->intflags & PREGf_ANCH)
8207         RExC_rx->extflags |= RXf_IS_ANCHORED;
8208
8209
8210     {
8211         /* this is used to identify "special" patterns that might result
8212          * in Perl NOT calling the regex engine and instead doing the match "itself",
8213          * particularly special cases in split//. By having the regex compiler
8214          * do this pattern matching at a regop level (instead of by inspecting the pattern)
8215          * we avoid weird issues with equivalent patterns resulting in different behavior,
8216          * AND we allow non Perl engines to get the same optimizations by the setting the
8217          * flags appropriately - Yves */
8218         regnode *first = RExC_rxi->program + 1;
8219         U8 fop = OP(first);
8220         regnode *next = regnext(first);
8221         U8 nop = OP(next);
8222
8223         if (PL_regkind[fop] == NOTHING && nop == END)
8224             RExC_rx->extflags |= RXf_NULL;
8225         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8226             /* when fop is SBOL first->flags will be true only when it was
8227              * produced by parsing /\A/, and not when parsing /^/. This is
8228              * very important for the split code as there we want to
8229              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8230              * See rt #122761 for more details. -- Yves */
8231             RExC_rx->extflags |= RXf_START_ONLY;
8232         else if (fop == PLUS
8233                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8234                  && nop == END)
8235             RExC_rx->extflags |= RXf_WHITE;
8236         else if ( RExC_rx->extflags & RXf_SPLIT
8237                   && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
8238                   && STR_LEN(first) == 1
8239                   && *(STRING(first)) == ' '
8240                   && nop == END )
8241             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8242
8243     }
8244
8245     if (RExC_contains_locale) {
8246         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8247     }
8248
8249 #ifdef DEBUGGING
8250     if (RExC_paren_names) {
8251         RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8252         RExC_rxi->data->data[RExC_rxi->name_list_idx]
8253                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
8254     } else
8255 #endif
8256     RExC_rxi->name_list_idx = 0;
8257
8258     while ( RExC_recurse_count > 0 ) {
8259         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8260         /*
8261          * This data structure is set up in study_chunk() and is used
8262          * to calculate the distance between a GOSUB regopcode and
8263          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8264          * it refers to.
8265          *
8266          * If for some reason someone writes code that optimises
8267          * away a GOSUB opcode then the assert should be changed to
8268          * an if(scan) to guard the ARG2L_SET() - Yves
8269          *
8270          */
8271         assert(scan && OP(scan) == GOSUB);
8272         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8273     }
8274
8275     Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8276     /* assume we don't need to swap parens around before we match */
8277     DEBUG_TEST_r({
8278         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8279             (unsigned long)RExC_study_chunk_recursed_count);
8280     });
8281     DEBUG_DUMP_r({
8282         DEBUG_RExC_seen();
8283         Perl_re_printf( aTHX_ "Final program:\n");
8284         regdump(RExC_rx);
8285     });
8286
8287     if (RExC_open_parens) {
8288         Safefree(RExC_open_parens);
8289         RExC_open_parens = NULL;
8290     }
8291     if (RExC_close_parens) {
8292         Safefree(RExC_close_parens);
8293         RExC_close_parens = NULL;
8294     }
8295
8296 #ifdef USE_ITHREADS
8297     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8298      * by setting the regexp SV to readonly-only instead. If the
8299      * pattern's been recompiled, the USEDness should remain. */
8300     if (old_re && SvREADONLY(old_re))
8301         SvREADONLY_on(Rx);
8302 #endif
8303     return Rx;
8304 }
8305
8306
8307 SV*
8308 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8309                     const U32 flags)
8310 {
8311     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8312
8313     PERL_UNUSED_ARG(value);
8314
8315     if (flags & RXapif_FETCH) {
8316         return reg_named_buff_fetch(rx, key, flags);
8317     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8318         Perl_croak_no_modify();
8319         return NULL;
8320     } else if (flags & RXapif_EXISTS) {
8321         return reg_named_buff_exists(rx, key, flags)
8322             ? &PL_sv_yes
8323             : &PL_sv_no;
8324     } else if (flags & RXapif_REGNAMES) {
8325         return reg_named_buff_all(rx, flags);
8326     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8327         return reg_named_buff_scalar(rx, flags);
8328     } else {
8329         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8330         return NULL;
8331     }
8332 }
8333
8334 SV*
8335 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8336                          const U32 flags)
8337 {
8338     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8339     PERL_UNUSED_ARG(lastkey);
8340
8341     if (flags & RXapif_FIRSTKEY)
8342         return reg_named_buff_firstkey(rx, flags);
8343     else if (flags & RXapif_NEXTKEY)
8344         return reg_named_buff_nextkey(rx, flags);
8345     else {
8346         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8347                                             (int)flags);
8348         return NULL;
8349     }
8350 }
8351
8352 SV*
8353 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8354                           const U32 flags)
8355 {
8356     SV *ret;
8357     struct regexp *const rx = ReANY(r);
8358
8359     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8360
8361     if (rx && RXp_PAREN_NAMES(rx)) {
8362         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8363         if (he_str) {
8364             IV i;
8365             SV* sv_dat=HeVAL(he_str);
8366             I32 *nums=(I32*)SvPVX(sv_dat);
8367             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8368             for ( i=0; i<SvIVX(sv_dat); i++ ) {
8369                 if ((I32)(rx->nparens) >= nums[i]
8370                     && rx->offs[nums[i]].start != -1
8371                     && rx->offs[nums[i]].end != -1)
8372                 {
8373                     ret = newSVpvs("");
8374                     CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8375                     if (!retarray)
8376                         return ret;
8377                 } else {
8378                     if (retarray)
8379                         ret = newSVsv(&PL_sv_undef);
8380                 }
8381                 if (retarray)
8382                     av_push(retarray, ret);
8383             }
8384             if (retarray)
8385                 return newRV_noinc(MUTABLE_SV(retarray));
8386         }
8387     }
8388     return NULL;
8389 }
8390
8391 bool
8392 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8393                            const U32 flags)
8394 {
8395     struct regexp *const rx = ReANY(r);
8396
8397     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8398
8399     if (rx && RXp_PAREN_NAMES(rx)) {
8400         if (flags & RXapif_ALL) {
8401             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8402         } else {
8403             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8404             if (sv) {
8405                 SvREFCNT_dec_NN(sv);
8406                 return TRUE;
8407             } else {
8408                 return FALSE;
8409             }
8410         }
8411     } else {
8412         return FALSE;
8413     }
8414 }
8415
8416 SV*
8417 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8418 {
8419     struct regexp *const rx = ReANY(r);
8420
8421     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8422
8423     if ( rx && RXp_PAREN_NAMES(rx) ) {
8424         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8425
8426         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8427     } else {
8428         return FALSE;
8429     }
8430 }
8431
8432 SV*
8433 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8434 {
8435     struct regexp *const rx = ReANY(r);
8436     GET_RE_DEBUG_FLAGS_DECL;
8437
8438     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8439
8440     if (rx && RXp_PAREN_NAMES(rx)) {
8441         HV *hv = RXp_PAREN_NAMES(rx);
8442         HE *temphe;
8443         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8444             IV i;
8445             IV parno = 0;
8446             SV* sv_dat = HeVAL(temphe);
8447             I32 *nums = (I32*)SvPVX(sv_dat);
8448             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8449                 if ((I32)(rx->lastparen) >= nums[i] &&
8450                     rx->offs[nums[i]].start != -1 &&
8451                     rx->offs[nums[i]].end != -1)
8452                 {
8453                     parno = nums[i];
8454                     break;
8455                 }
8456             }
8457             if (parno || flags & RXapif_ALL) {
8458                 return newSVhek(HeKEY_hek(temphe));
8459             }
8460         }
8461     }
8462     return NULL;
8463 }
8464
8465 SV*
8466 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8467 {
8468     SV *ret;
8469     AV *av;
8470     SSize_t length;
8471     struct regexp *const rx = ReANY(r);
8472
8473     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8474
8475     if (rx && RXp_PAREN_NAMES(rx)) {
8476         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8477             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8478         } else if (flags & RXapif_ONE) {
8479             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8480             av = MUTABLE_AV(SvRV(ret));
8481             length = av_tindex(av);
8482             SvREFCNT_dec_NN(ret);
8483             return newSViv(length + 1);
8484         } else {
8485             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8486                                                 (int)flags);
8487             return NULL;
8488         }
8489     }
8490     return &PL_sv_undef;
8491 }
8492
8493 SV*
8494 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8495 {
8496     struct regexp *const rx = ReANY(r);
8497     AV *av = newAV();
8498
8499     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8500
8501     if (rx && RXp_PAREN_NAMES(rx)) {
8502         HV *hv= RXp_PAREN_NAMES(rx);
8503         HE *temphe;
8504         (void)hv_iterinit(hv);
8505         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8506             IV i;
8507             IV parno = 0;
8508             SV* sv_dat = HeVAL(temphe);
8509             I32 *nums = (I32*)SvPVX(sv_dat);
8510             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8511                 if ((I32)(rx->lastparen) >= nums[i] &&
8512                     rx->offs[nums[i]].start != -1 &&
8513                     rx->offs[nums[i]].end != -1)
8514                 {
8515                     parno = nums[i];
8516                     break;
8517                 }
8518             }
8519             if (parno || flags & RXapif_ALL) {
8520                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8521             }
8522         }
8523     }
8524
8525     return newRV_noinc(MUTABLE_SV(av));
8526 }
8527
8528 void
8529 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8530                              SV * const sv)
8531 {
8532     struct regexp *const rx = ReANY(r);
8533     char *s = NULL;
8534     SSize_t i = 0;
8535     SSize_t s1, t1;
8536     I32 n = paren;
8537
8538     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8539
8540     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8541            || n == RX_BUFF_IDX_CARET_FULLMATCH
8542            || n == RX_BUFF_IDX_CARET_POSTMATCH
8543        )
8544     {
8545         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8546         if (!keepcopy) {
8547             /* on something like
8548              *    $r = qr/.../;
8549              *    /$qr/p;
8550              * the KEEPCOPY is set on the PMOP rather than the regex */
8551             if (PL_curpm && r == PM_GETRE(PL_curpm))
8552                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8553         }
8554         if (!keepcopy)
8555             goto ret_undef;
8556     }
8557
8558     if (!rx->subbeg)
8559         goto ret_undef;
8560
8561     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8562         /* no need to distinguish between them any more */
8563         n = RX_BUFF_IDX_FULLMATCH;
8564
8565     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8566         && rx->offs[0].start != -1)
8567     {
8568         /* $`, ${^PREMATCH} */
8569         i = rx->offs[0].start;
8570         s = rx->subbeg;
8571     }
8572     else
8573     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8574         && rx->offs[0].end != -1)
8575     {
8576         /* $', ${^POSTMATCH} */
8577         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8578         i = rx->sublen + rx->suboffset - rx->offs[0].end;
8579     }
8580     else
8581     if ( 0 <= n && n <= (I32)rx->nparens &&
8582         (s1 = rx->offs[n].start) != -1 &&
8583         (t1 = rx->offs[n].end) != -1)
8584     {
8585         /* $&, ${^MATCH},  $1 ... */
8586         i = t1 - s1;
8587         s = rx->subbeg + s1 - rx->suboffset;
8588     } else {
8589         goto ret_undef;
8590     }
8591
8592     assert(s >= rx->subbeg);
8593     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8594     if (i >= 0) {
8595 #ifdef NO_TAINT_SUPPORT
8596         sv_setpvn(sv, s, i);
8597 #else
8598         const int oldtainted = TAINT_get;
8599         TAINT_NOT;
8600         sv_setpvn(sv, s, i);
8601         TAINT_set(oldtainted);
8602 #endif
8603         if (RXp_MATCH_UTF8(rx))
8604             SvUTF8_on(sv);
8605         else
8606             SvUTF8_off(sv);
8607         if (TAINTING_get) {
8608             if (RXp_MATCH_TAINTED(rx)) {
8609                 if (SvTYPE(sv) >= SVt_PVMG) {
8610                     MAGIC* const mg = SvMAGIC(sv);
8611                     MAGIC* mgt;
8612                     TAINT;
8613                     SvMAGIC_set(sv, mg->mg_moremagic);
8614                     SvTAINT(sv);
8615                     if ((mgt = SvMAGIC(sv))) {
8616                         mg->mg_moremagic = mgt;
8617                         SvMAGIC_set(sv, mg);
8618                     }
8619                 } else {
8620                     TAINT;
8621                     SvTAINT(sv);
8622                 }
8623             } else
8624                 SvTAINTED_off(sv);
8625         }
8626     } else {
8627       ret_undef:
8628         sv_set_undef(sv);
8629         return;
8630     }
8631 }
8632
8633 void
8634 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8635                                                          SV const * const value)
8636 {
8637     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8638
8639     PERL_UNUSED_ARG(rx);
8640     PERL_UNUSED_ARG(paren);
8641     PERL_UNUSED_ARG(value);
8642
8643     if (!PL_localizing)
8644         Perl_croak_no_modify();
8645 }
8646
8647 I32
8648 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8649                               const I32 paren)
8650 {
8651     struct regexp *const rx = ReANY(r);
8652     I32 i;
8653     I32 s1, t1;
8654
8655     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8656
8657     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8658         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8659         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8660     )
8661     {
8662         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8663         if (!keepcopy) {
8664             /* on something like
8665              *    $r = qr/.../;
8666              *    /$qr/p;
8667              * the KEEPCOPY is set on the PMOP rather than the regex */
8668             if (PL_curpm && r == PM_GETRE(PL_curpm))
8669                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8670         }
8671         if (!keepcopy)
8672             goto warn_undef;
8673     }
8674
8675     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8676     switch (paren) {
8677       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8678       case RX_BUFF_IDX_PREMATCH:       /* $` */
8679         if (rx->offs[0].start != -1) {
8680                         i = rx->offs[0].start;
8681                         if (i > 0) {
8682                                 s1 = 0;
8683                                 t1 = i;
8684                                 goto getlen;
8685                         }
8686             }
8687         return 0;
8688
8689       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8690       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8691             if (rx->offs[0].end != -1) {
8692                         i = rx->sublen - rx->offs[0].end;
8693                         if (i > 0) {
8694                                 s1 = rx->offs[0].end;
8695                                 t1 = rx->sublen;
8696                                 goto getlen;
8697                         }
8698             }
8699         return 0;
8700
8701       default: /* $& / ${^MATCH}, $1, $2, ... */
8702             if (paren <= (I32)rx->nparens &&
8703             (s1 = rx->offs[paren].start) != -1 &&
8704             (t1 = rx->offs[paren].end) != -1)
8705             {
8706             i = t1 - s1;
8707             goto getlen;
8708         } else {
8709           warn_undef:
8710             if (ckWARN(WARN_UNINITIALIZED))
8711                 report_uninit((const SV *)sv);
8712             return 0;
8713         }
8714     }
8715   getlen:
8716     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8717         const char * const s = rx->subbeg - rx->suboffset + s1;
8718         const U8 *ep;
8719         STRLEN el;
8720
8721         i = t1 - s1;
8722         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8723                         i = el;
8724     }
8725     return i;
8726 }
8727
8728 SV*
8729 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8730 {
8731     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8732         PERL_UNUSED_ARG(rx);
8733         if (0)
8734             return NULL;
8735         else
8736             return newSVpvs("Regexp");
8737 }
8738
8739 /* Scans the name of a named buffer from the pattern.
8740  * If flags is REG_RSN_RETURN_NULL returns null.
8741  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8742  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8743  * to the parsed name as looked up in the RExC_paren_names hash.
8744  * If there is an error throws a vFAIL().. type exception.
8745  */
8746
8747 #define REG_RSN_RETURN_NULL    0
8748 #define REG_RSN_RETURN_NAME    1
8749 #define REG_RSN_RETURN_DATA    2
8750
8751 STATIC SV*
8752 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8753 {
8754     char *name_start = RExC_parse;
8755     SV* sv_name;
8756
8757     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8758
8759     assert (RExC_parse <= RExC_end);
8760     if (RExC_parse == RExC_end) NOOP;
8761     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8762          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8763           * using do...while */
8764         if (UTF)
8765             do {
8766                 RExC_parse += UTF8SKIP(RExC_parse);
8767             } while (   RExC_parse < RExC_end
8768                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8769         else
8770             do {
8771                 RExC_parse++;
8772             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8773     } else {
8774         RExC_parse++; /* so the <- from the vFAIL is after the offending
8775                          character */
8776         vFAIL("Group name must start with a non-digit word character");
8777     }
8778     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8779                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8780     if ( flags == REG_RSN_RETURN_NAME)
8781         return sv_name;
8782     else if (flags==REG_RSN_RETURN_DATA) {
8783         HE *he_str = NULL;
8784         SV *sv_dat = NULL;
8785         if ( ! sv_name )      /* should not happen*/
8786             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8787         if (RExC_paren_names)
8788             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8789         if ( he_str )
8790             sv_dat = HeVAL(he_str);
8791         if ( ! sv_dat ) {   /* Didn't find group */
8792
8793             /* It might be a forward reference; we can't fail until we
8794                 * know, by completing the parse to get all the groups, and
8795                 * then reparsing */
8796             if (RExC_total_parens > 0)  {
8797                 vFAIL("Reference to nonexistent named group");
8798             }
8799             else {
8800                 REQUIRE_PARENS_PASS;
8801             }
8802         }
8803         return sv_dat;
8804     }
8805
8806     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8807                      (unsigned long) flags);
8808 }
8809
8810 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8811     if (RExC_lastparse!=RExC_parse) {                           \
8812         Perl_re_printf( aTHX_  "%s",                            \
8813             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8814                 RExC_end - RExC_parse, 16,                      \
8815                 "", "",                                         \
8816                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8817                 PERL_PV_PRETTY_ELLIPSES   |                     \
8818                 PERL_PV_PRETTY_LTGT       |                     \
8819                 PERL_PV_ESCAPE_RE         |                     \
8820                 PERL_PV_PRETTY_EXACTSIZE                        \
8821             )                                                   \
8822         );                                                      \
8823     } else                                                      \
8824         Perl_re_printf( aTHX_ "%16s","");                       \
8825                                                                 \
8826     if (RExC_lastnum!=RExC_emit)                                \
8827        Perl_re_printf( aTHX_ "|%4d", RExC_emit);                \
8828     else                                                        \
8829        Perl_re_printf( aTHX_ "|%4s","");                        \
8830     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
8831         (int)((depth*2)), "",                                   \
8832         (funcname)                                              \
8833     );                                                          \
8834     RExC_lastnum=RExC_emit;                                     \
8835     RExC_lastparse=RExC_parse;                                  \
8836 })
8837
8838
8839
8840 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8841     DEBUG_PARSE_MSG((funcname));                            \
8842     Perl_re_printf( aTHX_ "%4s","\n");                                  \
8843 })
8844 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
8845     DEBUG_PARSE_MSG((funcname));                            \
8846     Perl_re_printf( aTHX_ fmt "\n",args);                               \
8847 })
8848
8849 /* This section of code defines the inversion list object and its methods.  The
8850  * interfaces are highly subject to change, so as much as possible is static to
8851  * this file.  An inversion list is here implemented as a malloc'd C UV array
8852  * as an SVt_INVLIST scalar.
8853  *
8854  * An inversion list for Unicode is an array of code points, sorted by ordinal
8855  * number.  Each element gives the code point that begins a range that extends
8856  * up-to but not including the code point given by the next element.  The final
8857  * element gives the first code point of a range that extends to the platform's
8858  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
8859  * ...) give ranges whose code points are all in the inversion list.  We say
8860  * that those ranges are in the set.  The odd-numbered elements give ranges
8861  * whose code points are not in the inversion list, and hence not in the set.
8862  * Thus, element [0] is the first code point in the list.  Element [1]
8863  * is the first code point beyond that not in the list; and element [2] is the
8864  * first code point beyond that that is in the list.  In other words, the first
8865  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8866  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
8867  * all code points in that range are not in the inversion list.  The third
8868  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8869  * list, and so forth.  Thus every element whose index is divisible by two
8870  * gives the beginning of a range that is in the list, and every element whose
8871  * index is not divisible by two gives the beginning of a range not in the
8872  * list.  If the final element's index is divisible by two, the inversion list
8873  * extends to the platform's infinity; otherwise the highest code point in the
8874  * inversion list is the contents of that element minus 1.
8875  *
8876  * A range that contains just a single code point N will look like
8877  *  invlist[i]   == N
8878  *  invlist[i+1] == N+1
8879  *
8880  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8881  * impossible to represent, so element [i+1] is omitted.  The single element
8882  * inversion list
8883  *  invlist[0] == UV_MAX
8884  * contains just UV_MAX, but is interpreted as matching to infinity.
8885  *
8886  * Taking the complement (inverting) an inversion list is quite simple, if the
8887  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8888  * This implementation reserves an element at the beginning of each inversion
8889  * list to always contain 0; there is an additional flag in the header which
8890  * indicates if the list begins at the 0, or is offset to begin at the next
8891  * element.  This means that the inversion list can be inverted without any
8892  * copying; just flip the flag.
8893  *
8894  * More about inversion lists can be found in "Unicode Demystified"
8895  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8896  *
8897  * The inversion list data structure is currently implemented as an SV pointing
8898  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8899  * array of UV whose memory management is automatically handled by the existing
8900  * facilities for SV's.
8901  *
8902  * Some of the methods should always be private to the implementation, and some
8903  * should eventually be made public */
8904
8905 /* The header definitions are in F<invlist_inline.h> */
8906
8907 #ifndef PERL_IN_XSUB_RE
8908
8909 PERL_STATIC_INLINE UV*
8910 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8911 {
8912     /* Returns a pointer to the first element in the inversion list's array.
8913      * This is called upon initialization of an inversion list.  Where the
8914      * array begins depends on whether the list has the code point U+0000 in it
8915      * or not.  The other parameter tells it whether the code that follows this
8916      * call is about to put a 0 in the inversion list or not.  The first
8917      * element is either the element reserved for 0, if TRUE, or the element
8918      * after it, if FALSE */
8919
8920     bool* offset = get_invlist_offset_addr(invlist);
8921     UV* zero_addr = (UV *) SvPVX(invlist);
8922
8923     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8924
8925     /* Must be empty */
8926     assert(! _invlist_len(invlist));
8927
8928     *zero_addr = 0;
8929
8930     /* 1^1 = 0; 1^0 = 1 */
8931     *offset = 1 ^ will_have_0;
8932     return zero_addr + *offset;
8933 }
8934
8935 PERL_STATIC_INLINE void
8936 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8937 {
8938     /* Sets the current number of elements stored in the inversion list.
8939      * Updates SvCUR correspondingly */
8940     PERL_UNUSED_CONTEXT;
8941     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8942
8943     assert(is_invlist(invlist));
8944
8945     SvCUR_set(invlist,
8946               (len == 0)
8947                ? 0
8948                : TO_INTERNAL_SIZE(len + offset));
8949     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8950 }
8951
8952 STATIC void
8953 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
8954 {
8955     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
8956      * steals the list from 'src', so 'src' is made to have a NULL list.  This
8957      * is similar to what SvSetMagicSV() would do, if it were implemented on
8958      * inversion lists, though this routine avoids a copy */
8959
8960     const UV src_len          = _invlist_len(src);
8961     const bool src_offset     = *get_invlist_offset_addr(src);
8962     const STRLEN src_byte_len = SvLEN(src);
8963     char * array              = SvPVX(src);
8964
8965     const int oldtainted = TAINT_get;
8966
8967     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
8968
8969     assert(is_invlist(src));
8970     assert(is_invlist(dest));
8971     assert(! invlist_is_iterating(src));
8972     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
8973
8974     /* Make sure it ends in the right place with a NUL, as our inversion list
8975      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
8976      * asserts it */
8977     array[src_byte_len - 1] = '\0';
8978
8979     TAINT_NOT;      /* Otherwise it breaks */
8980     sv_usepvn_flags(dest,
8981                     (char *) array,
8982                     src_byte_len - 1,
8983
8984                     /* This flag is documented to cause a copy to be avoided */
8985                     SV_HAS_TRAILING_NUL);
8986     TAINT_set(oldtainted);
8987     SvPV_set(src, 0);
8988     SvLEN_set(src, 0);
8989     SvCUR_set(src, 0);
8990
8991     /* Finish up copying over the other fields in an inversion list */
8992     *get_invlist_offset_addr(dest) = src_offset;
8993     invlist_set_len(dest, src_len, src_offset);
8994     *get_invlist_previous_index_addr(dest) = 0;
8995     invlist_iterfinish(dest);
8996 }
8997
8998 PERL_STATIC_INLINE IV*
8999 S_get_invlist_previous_index_addr(SV* invlist)
9000 {
9001     /* Return the address of the IV that is reserved to hold the cached index
9002      * */
9003     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9004
9005     assert(is_invlist(invlist));
9006
9007     return &(((XINVLIST*) SvANY(invlist))->prev_index);
9008 }
9009
9010 PERL_STATIC_INLINE IV
9011 S_invlist_previous_index(SV* const invlist)
9012 {
9013     /* Returns cached index of previous search */
9014
9015     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9016
9017     return *get_invlist_previous_index_addr(invlist);
9018 }
9019
9020 PERL_STATIC_INLINE void
9021 S_invlist_set_previous_index(SV* const invlist, const IV index)
9022 {
9023     /* Caches <index> for later retrieval */
9024
9025     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9026
9027     assert(index == 0 || index < (int) _invlist_len(invlist));
9028
9029     *get_invlist_previous_index_addr(invlist) = index;
9030 }
9031
9032 PERL_STATIC_INLINE void
9033 S_invlist_trim(SV* invlist)
9034 {
9035     /* Free the not currently-being-used space in an inversion list */
9036
9037     /* But don't free up the space needed for the 0 UV that is always at the
9038      * beginning of the list, nor the trailing NUL */
9039     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9040
9041     PERL_ARGS_ASSERT_INVLIST_TRIM;
9042
9043     assert(is_invlist(invlist));
9044
9045     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9046 }
9047
9048 PERL_STATIC_INLINE void
9049 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
9050 {
9051     PERL_ARGS_ASSERT_INVLIST_CLEAR;
9052
9053     assert(is_invlist(invlist));
9054
9055     invlist_set_len(invlist, 0, 0);
9056     invlist_trim(invlist);
9057 }
9058
9059 #endif /* ifndef PERL_IN_XSUB_RE */
9060
9061 PERL_STATIC_INLINE bool
9062 S_invlist_is_iterating(SV* const invlist)
9063 {
9064     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9065
9066     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9067 }
9068
9069 #ifndef PERL_IN_XSUB_RE
9070
9071 PERL_STATIC_INLINE UV
9072 S_invlist_max(SV* const invlist)
9073 {
9074     /* Returns the maximum number of elements storable in the inversion list's
9075      * array, without having to realloc() */
9076
9077     PERL_ARGS_ASSERT_INVLIST_MAX;
9078
9079     assert(is_invlist(invlist));
9080
9081     /* Assumes worst case, in which the 0 element is not counted in the
9082      * inversion list, so subtracts 1 for that */
9083     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
9084            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9085            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9086 }
9087
9088 STATIC void
9089 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9090 {
9091     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9092
9093     /* First 1 is in case the zero element isn't in the list; second 1 is for
9094      * trailing NUL */
9095     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9096     invlist_set_len(invlist, 0, 0);
9097
9098     /* Force iterinit() to be used to get iteration to work */
9099     invlist_iterfinish(invlist);
9100
9101     *get_invlist_previous_index_addr(invlist) = 0;
9102 }
9103
9104 SV*
9105 Perl__new_invlist(pTHX_ IV initial_size)
9106 {
9107
9108     /* Return a pointer to a newly constructed inversion list, with enough
9109      * space to store 'initial_size' elements.  If that number is negative, a
9110      * system default is used instead */
9111
9112     SV* new_list;
9113
9114     if (initial_size < 0) {
9115         initial_size = 10;
9116     }
9117
9118     /* Allocate the initial space */
9119     new_list = newSV_type(SVt_INVLIST);
9120
9121     initialize_invlist_guts(new_list, initial_size);
9122
9123     return new_list;
9124 }
9125
9126 SV*
9127 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9128 {
9129     /* Return a pointer to a newly constructed inversion list, initialized to
9130      * point to <list>, which has to be in the exact correct inversion list
9131      * form, including internal fields.  Thus this is a dangerous routine that
9132      * should not be used in the wrong hands.  The passed in 'list' contains
9133      * several header fields at the beginning that are not part of the
9134      * inversion list body proper */
9135
9136     const STRLEN length = (STRLEN) list[0];
9137     const UV version_id =          list[1];
9138     const bool offset   =    cBOOL(list[2]);
9139 #define HEADER_LENGTH 3
9140     /* If any of the above changes in any way, you must change HEADER_LENGTH
9141      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9142      *      perl -E 'say int(rand 2**31-1)'
9143      */
9144 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9145                                         data structure type, so that one being
9146                                         passed in can be validated to be an
9147                                         inversion list of the correct vintage.
9148                                        */
9149
9150     SV* invlist = newSV_type(SVt_INVLIST);
9151
9152     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9153
9154     if (version_id != INVLIST_VERSION_ID) {
9155         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9156     }
9157
9158     /* The generated array passed in includes header elements that aren't part
9159      * of the list proper, so start it just after them */
9160     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9161
9162     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
9163                                shouldn't touch it */
9164
9165     *(get_invlist_offset_addr(invlist)) = offset;
9166
9167     /* The 'length' passed to us is the physical number of elements in the
9168      * inversion list.  But if there is an offset the logical number is one
9169      * less than that */
9170     invlist_set_len(invlist, length  - offset, offset);
9171
9172     invlist_set_previous_index(invlist, 0);
9173
9174     /* Initialize the iteration pointer. */
9175     invlist_iterfinish(invlist);
9176
9177     SvREADONLY_on(invlist);
9178
9179     return invlist;
9180 }
9181
9182 STATIC void
9183 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
9184 {
9185     /* Grow the maximum size of an inversion list */
9186
9187     PERL_ARGS_ASSERT_INVLIST_EXTEND;
9188
9189     assert(is_invlist(invlist));
9190
9191     /* Add one to account for the zero element at the beginning which may not
9192      * be counted by the calling parameters */
9193     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
9194 }
9195
9196 STATIC void
9197 S__append_range_to_invlist(pTHX_ SV* const invlist,
9198                                  const UV start, const UV end)
9199 {
9200    /* Subject to change or removal.  Append the range from 'start' to 'end' at
9201     * the end of the inversion list.  The range must be above any existing
9202     * ones. */
9203
9204     UV* array;
9205     UV max = invlist_max(invlist);
9206     UV len = _invlist_len(invlist);
9207     bool offset;
9208
9209     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9210
9211     if (len == 0) { /* Empty lists must be initialized */
9212         offset = start != 0;
9213         array = _invlist_array_init(invlist, ! offset);
9214     }
9215     else {
9216         /* Here, the existing list is non-empty. The current max entry in the
9217          * list is generally the first value not in the set, except when the
9218          * set extends to the end of permissible values, in which case it is
9219          * the first entry in that final set, and so this call is an attempt to
9220          * append out-of-order */
9221
9222         UV final_element = len - 1;
9223         array = invlist_array(invlist);
9224         if (   array[final_element] > start
9225             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9226         {
9227             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9228                      array[final_element], start,
9229                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9230         }
9231
9232         /* Here, it is a legal append.  If the new range begins 1 above the end
9233          * of the range below it, it is extending the range below it, so the
9234          * new first value not in the set is one greater than the newly
9235          * extended range.  */
9236         offset = *get_invlist_offset_addr(invlist);
9237         if (array[final_element] == start) {
9238             if (end != UV_MAX) {
9239                 array[final_element] = end + 1;
9240             }
9241             else {
9242                 /* But if the end is the maximum representable on the machine,
9243                  * assume that infinity was actually what was meant.  Just let
9244                  * the range that this would extend to have no end */
9245                 invlist_set_len(invlist, len - 1, offset);
9246             }
9247             return;
9248         }
9249     }
9250
9251     /* Here the new range doesn't extend any existing set.  Add it */
9252
9253     len += 2;   /* Includes an element each for the start and end of range */
9254
9255     /* If wll overflow the existing space, extend, which may cause the array to
9256      * be moved */
9257     if (max < len) {
9258         invlist_extend(invlist, len);
9259
9260         /* Have to set len here to avoid assert failure in invlist_array() */
9261         invlist_set_len(invlist, len, offset);
9262
9263         array = invlist_array(invlist);
9264     }
9265     else {
9266         invlist_set_len(invlist, len, offset);
9267     }
9268
9269     /* The next item on the list starts the range, the one after that is
9270      * one past the new range.  */
9271     array[len - 2] = start;
9272     if (end != UV_MAX) {
9273         array[len - 1] = end + 1;
9274     }
9275     else {
9276         /* But if the end is the maximum representable on the machine, just let
9277          * the range have no end */
9278         invlist_set_len(invlist, len - 1, offset);
9279     }
9280 }
9281
9282 SSize_t
9283 Perl__invlist_search(SV* const invlist, const UV cp)
9284 {
9285     /* Searches the inversion list for the entry that contains the input code
9286      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
9287      * return value is the index into the list's array of the range that
9288      * contains <cp>, that is, 'i' such that
9289      *  array[i] <= cp < array[i+1]
9290      */
9291
9292     IV low = 0;
9293     IV mid;
9294     IV high = _invlist_len(invlist);
9295     const IV highest_element = high - 1;
9296     const UV* array;
9297
9298     PERL_ARGS_ASSERT__INVLIST_SEARCH;
9299
9300     /* If list is empty, return failure. */
9301     if (high == 0) {
9302         return -1;
9303     }
9304
9305     /* (We can't get the array unless we know the list is non-empty) */
9306     array = invlist_array(invlist);
9307
9308     mid = invlist_previous_index(invlist);
9309     assert(mid >=0);
9310     if (mid > highest_element) {
9311         mid = highest_element;
9312     }
9313
9314     /* <mid> contains the cache of the result of the previous call to this
9315      * function (0 the first time).  See if this call is for the same result,
9316      * or if it is for mid-1.  This is under the theory that calls to this
9317      * function will often be for related code points that are near each other.
9318      * And benchmarks show that caching gives better results.  We also test
9319      * here if the code point is within the bounds of the list.  These tests
9320      * replace others that would have had to be made anyway to make sure that
9321      * the array bounds were not exceeded, and these give us extra information
9322      * at the same time */
9323     if (cp >= array[mid]) {
9324         if (cp >= array[highest_element]) {
9325             return highest_element;
9326         }
9327
9328         /* Here, array[mid] <= cp < array[highest_element].  This means that
9329          * the final element is not the answer, so can exclude it; it also
9330          * means that <mid> is not the final element, so can refer to 'mid + 1'
9331          * safely */
9332         if (cp < array[mid + 1]) {
9333             return mid;
9334         }
9335         high--;
9336         low = mid + 1;
9337     }
9338     else { /* cp < aray[mid] */
9339         if (cp < array[0]) { /* Fail if outside the array */
9340             return -1;
9341         }
9342         high = mid;
9343         if (cp >= array[mid - 1]) {
9344             goto found_entry;
9345         }
9346     }
9347
9348     /* Binary search.  What we are looking for is <i> such that
9349      *  array[i] <= cp < array[i+1]
9350      * The loop below converges on the i+1.  Note that there may not be an
9351      * (i+1)th element in the array, and things work nonetheless */
9352     while (low < high) {
9353         mid = (low + high) / 2;
9354         assert(mid <= highest_element);
9355         if (array[mid] <= cp) { /* cp >= array[mid] */
9356             low = mid + 1;
9357
9358             /* We could do this extra test to exit the loop early.
9359             if (cp < array[low]) {
9360                 return mid;
9361             }
9362             */
9363         }
9364         else { /* cp < array[mid] */
9365             high = mid;
9366         }
9367     }
9368
9369   found_entry:
9370     high--;
9371     invlist_set_previous_index(invlist, high);
9372     return high;
9373 }
9374
9375 void
9376 Perl__invlist_populate_swatch(SV* const invlist,
9377                               const UV start, const UV end, U8* swatch)
9378 {
9379     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
9380      * but is used when the swash has an inversion list.  This makes this much
9381      * faster, as it uses a binary search instead of a linear one.  This is
9382      * intimately tied to that function, and perhaps should be in utf8.c,
9383      * except it is intimately tied to inversion lists as well.  It assumes
9384      * that <swatch> is all 0's on input */
9385
9386     UV current = start;
9387     const IV len = _invlist_len(invlist);
9388     IV i;
9389     const UV * array;
9390
9391     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
9392
9393     if (len == 0) { /* Empty inversion list */
9394         return;
9395     }
9396
9397     array = invlist_array(invlist);
9398
9399     /* Find which element it is */
9400     i = _invlist_search(invlist, start);
9401
9402     /* We populate from <start> to <end> */
9403     while (current < end) {
9404         UV upper;
9405
9406         /* The inversion list gives the results for every possible code point
9407          * after the first one in the list.  Only those ranges whose index is
9408          * even are ones that the inversion list matches.  For the odd ones,
9409          * and if the initial code point is not in the list, we have to skip
9410          * forward to the next element */
9411         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
9412             i++;
9413             if (i >= len) { /* Finished if beyond the end of the array */
9414                 return;
9415             }
9416             current = array[i];
9417             if (current >= end) {   /* Finished if beyond the end of what we
9418                                        are populating */
9419                 if (LIKELY(end < UV_MAX)) {
9420                     return;
9421                 }
9422
9423                 /* We get here when the upper bound is the maximum
9424                  * representable on the machine, and we are looking for just
9425                  * that code point.  Have to special case it */
9426                 i = len;
9427                 goto join_end_of_list;
9428             }
9429         }
9430         assert(current >= start);
9431
9432         /* The current range ends one below the next one, except don't go past
9433          * <end> */
9434         i++;
9435         upper = (i < len && array[i] < end) ? array[i] : end;
9436
9437         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
9438          * for each code point in it */
9439         for (; current < upper; current++) {
9440             const STRLEN offset = (STRLEN)(current - start);
9441             swatch[offset >> 3] |= 1 << (offset & 7);
9442         }
9443
9444       join_end_of_list:
9445
9446         /* Quit if at the end of the list */
9447         if (i >= len) {
9448
9449             /* But first, have to deal with the highest possible code point on
9450              * the platform.  The previous code assumes that <end> is one
9451              * beyond where we want to populate, but that is impossible at the
9452              * platform's infinity, so have to handle it specially */
9453             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
9454             {
9455                 const STRLEN offset = (STRLEN)(end - start);
9456                 swatch[offset >> 3] |= 1 << (offset & 7);
9457             }
9458             return;
9459         }
9460
9461         /* Advance to the next range, which will be for code points not in the
9462          * inversion list */
9463         current = array[i];
9464     }
9465
9466     return;
9467 }
9468
9469 void
9470 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9471                                          const bool complement_b, SV** output)
9472 {
9473     /* Take the union of two inversion lists and point '*output' to it.  On
9474      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9475      * even 'a' or 'b').  If to an inversion list, the contents of the original
9476      * list will be replaced by the union.  The first list, 'a', may be
9477      * NULL, in which case a copy of the second list is placed in '*output'.
9478      * If 'complement_b' is TRUE, the union is taken of the complement
9479      * (inversion) of 'b' instead of b itself.
9480      *
9481      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9482      * Richard Gillam, published by Addison-Wesley, and explained at some
9483      * length there.  The preface says to incorporate its examples into your
9484      * code at your own risk.
9485      *
9486      * The algorithm is like a merge sort. */
9487
9488     const UV* array_a;    /* a's array */
9489     const UV* array_b;
9490     UV len_a;       /* length of a's array */
9491     UV len_b;
9492
9493     SV* u;                      /* the resulting union */
9494     UV* array_u;
9495     UV len_u = 0;
9496
9497     UV i_a = 0;             /* current index into a's array */
9498     UV i_b = 0;
9499     UV i_u = 0;
9500
9501     /* running count, as explained in the algorithm source book; items are
9502      * stopped accumulating and are output when the count changes to/from 0.
9503      * The count is incremented when we start a range that's in an input's set,
9504      * and decremented when we start a range that's not in a set.  So this
9505      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9506      * and hence nothing goes into the union; 1, just one of the inputs is in
9507      * its set (and its current range gets added to the union); and 2 when both
9508      * inputs are in their sets.  */
9509     UV count = 0;
9510
9511     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9512     assert(a != b);
9513     assert(*output == NULL || is_invlist(*output));
9514
9515     len_b = _invlist_len(b);
9516     if (len_b == 0) {
9517
9518         /* Here, 'b' is empty, hence it's complement is all possible code
9519          * points.  So if the union includes the complement of 'b', it includes
9520          * everything, and we need not even look at 'a'.  It's easiest to
9521          * create a new inversion list that matches everything.  */
9522         if (complement_b) {
9523             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9524
9525             if (*output == NULL) { /* If the output didn't exist, just point it
9526                                       at the new list */
9527                 *output = everything;
9528             }
9529             else { /* Otherwise, replace its contents with the new list */
9530                 invlist_replace_list_destroys_src(*output, everything);
9531                 SvREFCNT_dec_NN(everything);
9532             }
9533
9534             return;
9535         }
9536
9537         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9538          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9539          * output will be empty */
9540
9541         if (a == NULL || _invlist_len(a) == 0) {
9542             if (*output == NULL) {
9543                 *output = _new_invlist(0);
9544             }
9545             else {
9546                 invlist_clear(*output);
9547             }
9548             return;
9549         }
9550
9551         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9552          * union.  We can just return a copy of 'a' if '*output' doesn't point
9553          * to an existing list */
9554         if (*output == NULL) {
9555             *output = invlist_clone(a, NULL);
9556             return;
9557         }
9558
9559         /* If the output is to overwrite 'a', we have a no-op, as it's
9560          * already in 'a' */
9561         if (*output == a) {
9562             return;
9563         }
9564
9565         /* Here, '*output' is to be overwritten by 'a' */
9566         u = invlist_clone(a, NULL);
9567         invlist_replace_list_destroys_src(*output, u);
9568         SvREFCNT_dec_NN(u);
9569
9570         return;
9571     }
9572
9573     /* Here 'b' is not empty.  See about 'a' */
9574
9575     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9576
9577         /* Here, 'a' is empty (and b is not).  That means the union will come
9578          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9579          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9580          * the clone */
9581
9582         SV ** dest = (*output == NULL) ? output : &u;
9583         *dest = invlist_clone(b, NULL);
9584         if (complement_b) {
9585             _invlist_invert(*dest);
9586         }
9587
9588         if (dest == &u) {
9589             invlist_replace_list_destroys_src(*output, u);
9590             SvREFCNT_dec_NN(u);
9591         }
9592
9593         return;
9594     }
9595
9596     /* Here both lists exist and are non-empty */
9597     array_a = invlist_array(a);
9598     array_b = invlist_array(b);
9599
9600     /* If are to take the union of 'a' with the complement of b, set it
9601      * up so are looking at b's complement. */
9602     if (complement_b) {
9603
9604         /* To complement, we invert: if the first element is 0, remove it.  To
9605          * do this, we just pretend the array starts one later */
9606         if (array_b[0] == 0) {
9607             array_b++;
9608             len_b--;
9609         }
9610         else {
9611
9612             /* But if the first element is not zero, we pretend the list starts
9613              * at the 0 that is always stored immediately before the array. */
9614             array_b--;
9615             len_b++;
9616         }
9617     }
9618
9619     /* Size the union for the worst case: that the sets are completely
9620      * disjoint */
9621     u = _new_invlist(len_a + len_b);
9622
9623     /* Will contain U+0000 if either component does */
9624     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9625                                       || (len_b > 0 && array_b[0] == 0));
9626
9627     /* Go through each input list item by item, stopping when have exhausted
9628      * one of them */
9629     while (i_a < len_a && i_b < len_b) {
9630         UV cp;      /* The element to potentially add to the union's array */
9631         bool cp_in_set;   /* is it in the the input list's set or not */
9632
9633         /* We need to take one or the other of the two inputs for the union.
9634          * Since we are merging two sorted lists, we take the smaller of the
9635          * next items.  In case of a tie, we take first the one that is in its
9636          * set.  If we first took the one not in its set, it would decrement
9637          * the count, possibly to 0 which would cause it to be output as ending
9638          * the range, and the next time through we would take the same number,
9639          * and output it again as beginning the next range.  By doing it the
9640          * opposite way, there is no possibility that the count will be
9641          * momentarily decremented to 0, and thus the two adjoining ranges will
9642          * be seamlessly merged.  (In a tie and both are in the set or both not
9643          * in the set, it doesn't matter which we take first.) */
9644         if (       array_a[i_a] < array_b[i_b]
9645             || (   array_a[i_a] == array_b[i_b]
9646                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9647         {
9648             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9649             cp = array_a[i_a++];
9650         }
9651         else {
9652             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9653             cp = array_b[i_b++];
9654         }
9655
9656         /* Here, have chosen which of the two inputs to look at.  Only output
9657          * if the running count changes to/from 0, which marks the
9658          * beginning/end of a range that's in the set */
9659         if (cp_in_set) {
9660             if (count == 0) {
9661                 array_u[i_u++] = cp;
9662             }
9663             count++;
9664         }
9665         else {
9666             count--;
9667             if (count == 0) {
9668                 array_u[i_u++] = cp;
9669             }
9670         }
9671     }
9672
9673
9674     /* The loop above increments the index into exactly one of the input lists
9675      * each iteration, and ends when either index gets to its list end.  That
9676      * means the other index is lower than its end, and so something is
9677      * remaining in that one.  We decrement 'count', as explained below, if
9678      * that list is in its set.  (i_a and i_b each currently index the element
9679      * beyond the one we care about.) */
9680     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9681         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9682     {
9683         count--;
9684     }
9685
9686     /* Above we decremented 'count' if the list that had unexamined elements in
9687      * it was in its set.  This has made it so that 'count' being non-zero
9688      * means there isn't anything left to output; and 'count' equal to 0 means
9689      * that what is left to output is precisely that which is left in the
9690      * non-exhausted input list.
9691      *
9692      * To see why, note first that the exhausted input obviously has nothing
9693      * left to add to the union.  If it was in its set at its end, that means
9694      * the set extends from here to the platform's infinity, and hence so does
9695      * the union and the non-exhausted set is irrelevant.  The exhausted set
9696      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9697      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9698      * 'count' remains at 1.  This is consistent with the decremented 'count'
9699      * != 0 meaning there's nothing left to add to the union.
9700      *
9701      * But if the exhausted input wasn't in its set, it contributed 0 to
9702      * 'count', and the rest of the union will be whatever the other input is.
9703      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9704      * otherwise it gets decremented to 0.  This is consistent with 'count'
9705      * == 0 meaning the remainder of the union is whatever is left in the
9706      * non-exhausted list. */
9707     if (count != 0) {
9708         len_u = i_u;
9709     }
9710     else {
9711         IV copy_count = len_a - i_a;
9712         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9713             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9714         }
9715         else { /* The non-exhausted input is b */
9716             copy_count = len_b - i_b;
9717             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9718         }
9719         len_u = i_u + copy_count;
9720     }
9721
9722     /* Set the result to the final length, which can change the pointer to
9723      * array_u, so re-find it.  (Note that it is unlikely that this will
9724      * change, as we are shrinking the space, not enlarging it) */
9725     if (len_u != _invlist_len(u)) {
9726         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9727         invlist_trim(u);
9728         array_u = invlist_array(u);
9729     }
9730
9731     if (*output == NULL) {  /* Simply return the new inversion list */
9732         *output = u;
9733     }
9734     else {
9735         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9736          * could instead free '*output', and then set it to 'u', but experience
9737          * has shown [perl #127392] that if the input is a mortal, we can get a
9738          * huge build-up of these during regex compilation before they get
9739          * freed. */
9740         invlist_replace_list_destroys_src(*output, u);
9741         SvREFCNT_dec_NN(u);
9742     }
9743
9744     return;
9745 }
9746
9747 void
9748 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9749                                                const bool complement_b, SV** i)
9750 {
9751     /* Take the intersection of two inversion lists and point '*i' to it.  On
9752      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9753      * even 'a' or 'b').  If to an inversion list, the contents of the original
9754      * list will be replaced by the intersection.  The first list, 'a', may be
9755      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9756      * TRUE, the result will be the intersection of 'a' and the complement (or
9757      * inversion) of 'b' instead of 'b' directly.
9758      *
9759      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9760      * Richard Gillam, published by Addison-Wesley, and explained at some
9761      * length there.  The preface says to incorporate its examples into your
9762      * code at your own risk.  In fact, it had bugs
9763      *
9764      * The algorithm is like a merge sort, and is essentially the same as the
9765      * union above
9766      */
9767
9768     const UV* array_a;          /* a's array */
9769     const UV* array_b;
9770     UV len_a;   /* length of a's array */
9771     UV len_b;
9772
9773     SV* r;                   /* the resulting intersection */
9774     UV* array_r;
9775     UV len_r = 0;
9776
9777     UV i_a = 0;             /* current index into a's array */
9778     UV i_b = 0;
9779     UV i_r = 0;
9780
9781     /* running count of how many of the two inputs are postitioned at ranges
9782      * that are in their sets.  As explained in the algorithm source book,
9783      * items are stopped accumulating and are output when the count changes
9784      * to/from 2.  The count is incremented when we start a range that's in an
9785      * input's set, and decremented when we start a range that's not in a set.
9786      * Only when it is 2 are we in the intersection. */
9787     UV count = 0;
9788
9789     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9790     assert(a != b);
9791     assert(*i == NULL || is_invlist(*i));
9792
9793     /* Special case if either one is empty */
9794     len_a = (a == NULL) ? 0 : _invlist_len(a);
9795     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9796         if (len_a != 0 && complement_b) {
9797
9798             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9799              * must be empty.  Here, also we are using 'b's complement, which
9800              * hence must be every possible code point.  Thus the intersection
9801              * is simply 'a'. */
9802
9803             if (*i == a) {  /* No-op */
9804                 return;
9805             }
9806
9807             if (*i == NULL) {
9808                 *i = invlist_clone(a, NULL);
9809                 return;
9810             }
9811
9812             r = invlist_clone(a, NULL);
9813             invlist_replace_list_destroys_src(*i, r);
9814             SvREFCNT_dec_NN(r);
9815             return;
9816         }
9817
9818         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9819          * intersection must be empty */
9820         if (*i == NULL) {
9821             *i = _new_invlist(0);
9822             return;
9823         }
9824
9825         invlist_clear(*i);
9826         return;
9827     }
9828
9829     /* Here both lists exist and are non-empty */
9830     array_a = invlist_array(a);
9831     array_b = invlist_array(b);
9832
9833     /* If are to take the intersection of 'a' with the complement of b, set it
9834      * up so are looking at b's complement. */
9835     if (complement_b) {
9836
9837         /* To complement, we invert: if the first element is 0, remove it.  To
9838          * do this, we just pretend the array starts one later */
9839         if (array_b[0] == 0) {
9840             array_b++;
9841             len_b--;
9842         }
9843         else {
9844
9845             /* But if the first element is not zero, we pretend the list starts
9846              * at the 0 that is always stored immediately before the array. */
9847             array_b--;
9848             len_b++;
9849         }
9850     }
9851
9852     /* Size the intersection for the worst case: that the intersection ends up
9853      * fragmenting everything to be completely disjoint */
9854     r= _new_invlist(len_a + len_b);
9855
9856     /* Will contain U+0000 iff both components do */
9857     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9858                                      && len_b > 0 && array_b[0] == 0);
9859
9860     /* Go through each list item by item, stopping when have exhausted one of
9861      * them */
9862     while (i_a < len_a && i_b < len_b) {
9863         UV cp;      /* The element to potentially add to the intersection's
9864                        array */
9865         bool cp_in_set; /* Is it in the input list's set or not */
9866
9867         /* We need to take one or the other of the two inputs for the
9868          * intersection.  Since we are merging two sorted lists, we take the
9869          * smaller of the next items.  In case of a tie, we take first the one
9870          * that is not in its set (a difference from the union algorithm).  If
9871          * we first took the one in its set, it would increment the count,
9872          * possibly to 2 which would cause it to be output as starting a range
9873          * in the intersection, and the next time through we would take that
9874          * same number, and output it again as ending the set.  By doing the
9875          * opposite of this, there is no possibility that the count will be
9876          * momentarily incremented to 2.  (In a tie and both are in the set or
9877          * both not in the set, it doesn't matter which we take first.) */
9878         if (       array_a[i_a] < array_b[i_b]
9879             || (   array_a[i_a] == array_b[i_b]
9880                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9881         {
9882             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9883             cp = array_a[i_a++];
9884         }
9885         else {
9886             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9887             cp= array_b[i_b++];
9888         }
9889
9890         /* Here, have chosen which of the two inputs to look at.  Only output
9891          * if the running count changes to/from 2, which marks the
9892          * beginning/end of a range that's in the intersection */
9893         if (cp_in_set) {
9894             count++;
9895             if (count == 2) {
9896                 array_r[i_r++] = cp;
9897             }
9898         }
9899         else {
9900             if (count == 2) {
9901                 array_r[i_r++] = cp;
9902             }
9903             count--;
9904         }
9905
9906     }
9907
9908     /* The loop above increments the index into exactly one of the input lists
9909      * each iteration, and ends when either index gets to its list end.  That
9910      * means the other index is lower than its end, and so something is
9911      * remaining in that one.  We increment 'count', as explained below, if the
9912      * exhausted list was in its set.  (i_a and i_b each currently index the
9913      * element beyond the one we care about.) */
9914     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9915         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9916     {
9917         count++;
9918     }
9919
9920     /* Above we incremented 'count' if the exhausted list was in its set.  This
9921      * has made it so that 'count' being below 2 means there is nothing left to
9922      * output; otheriwse what's left to add to the intersection is precisely
9923      * that which is left in the non-exhausted input list.
9924      *
9925      * To see why, note first that the exhausted input obviously has nothing
9926      * left to affect the intersection.  If it was in its set at its end, that
9927      * means the set extends from here to the platform's infinity, and hence
9928      * anything in the non-exhausted's list will be in the intersection, and
9929      * anything not in it won't be.  Hence, the rest of the intersection is
9930      * precisely what's in the non-exhausted list  The exhausted set also
9931      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
9932      * it means 'count' is now at least 2.  This is consistent with the
9933      * incremented 'count' being >= 2 means to add the non-exhausted list to
9934      * the intersection.
9935      *
9936      * But if the exhausted input wasn't in its set, it contributed 0 to
9937      * 'count', and the intersection can't include anything further; the
9938      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
9939      * incremented.  This is consistent with 'count' being < 2 meaning nothing
9940      * further to add to the intersection. */
9941     if (count < 2) { /* Nothing left to put in the intersection. */
9942         len_r = i_r;
9943     }
9944     else { /* copy the non-exhausted list, unchanged. */
9945         IV copy_count = len_a - i_a;
9946         if (copy_count > 0) {   /* a is the one with stuff left */
9947             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9948         }
9949         else {  /* b is the one with stuff left */
9950             copy_count = len_b - i_b;
9951             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9952         }
9953         len_r = i_r + copy_count;
9954     }
9955
9956     /* Set the result to the final length, which can change the pointer to
9957      * array_r, so re-find it.  (Note that it is unlikely that this will
9958      * change, as we are shrinking the space, not enlarging it) */
9959     if (len_r != _invlist_len(r)) {
9960         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9961         invlist_trim(r);
9962         array_r = invlist_array(r);
9963     }
9964
9965     if (*i == NULL) { /* Simply return the calculated intersection */
9966         *i = r;
9967     }
9968     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
9969               instead free '*i', and then set it to 'r', but experience has
9970               shown [perl #127392] that if the input is a mortal, we can get a
9971               huge build-up of these during regex compilation before they get
9972               freed. */
9973         if (len_r) {
9974             invlist_replace_list_destroys_src(*i, r);
9975         }
9976         else {
9977             invlist_clear(*i);
9978         }
9979         SvREFCNT_dec_NN(r);
9980     }
9981
9982     return;
9983 }
9984
9985 SV*
9986 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9987 {
9988     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9989      * set.  A pointer to the inversion list is returned.  This may actually be
9990      * a new list, in which case the passed in one has been destroyed.  The
9991      * passed-in inversion list can be NULL, in which case a new one is created
9992      * with just the one range in it.  The new list is not necessarily
9993      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
9994      * result of this function.  The gain would not be large, and in many
9995      * cases, this is called multiple times on a single inversion list, so
9996      * anything freed may almost immediately be needed again.
9997      *
9998      * This used to mostly call the 'union' routine, but that is much more
9999      * heavyweight than really needed for a single range addition */
10000
10001     UV* array;              /* The array implementing the inversion list */
10002     UV len;                 /* How many elements in 'array' */
10003     SSize_t i_s;            /* index into the invlist array where 'start'
10004                                should go */
10005     SSize_t i_e = 0;        /* And the index where 'end' should go */
10006     UV cur_highest;         /* The highest code point in the inversion list
10007                                upon entry to this function */
10008
10009     /* This range becomes the whole inversion list if none already existed */
10010     if (invlist == NULL) {
10011         invlist = _new_invlist(2);
10012         _append_range_to_invlist(invlist, start, end);
10013         return invlist;
10014     }
10015
10016     /* Likewise, if the inversion list is currently empty */
10017     len = _invlist_len(invlist);
10018     if (len == 0) {
10019         _append_range_to_invlist(invlist, start, end);
10020         return invlist;
10021     }
10022
10023     /* Starting here, we have to know the internals of the list */
10024     array = invlist_array(invlist);
10025
10026     /* If the new range ends higher than the current highest ... */
10027     cur_highest = invlist_highest(invlist);
10028     if (end > cur_highest) {
10029
10030         /* If the whole range is higher, we can just append it */
10031         if (start > cur_highest) {
10032             _append_range_to_invlist(invlist, start, end);
10033             return invlist;
10034         }
10035
10036         /* Otherwise, add the portion that is higher ... */
10037         _append_range_to_invlist(invlist, cur_highest + 1, end);
10038
10039         /* ... and continue on below to handle the rest.  As a result of the
10040          * above append, we know that the index of the end of the range is the
10041          * final even numbered one of the array.  Recall that the final element
10042          * always starts a range that extends to infinity.  If that range is in
10043          * the set (meaning the set goes from here to infinity), it will be an
10044          * even index, but if it isn't in the set, it's odd, and the final
10045          * range in the set is one less, which is even. */
10046         if (end == UV_MAX) {
10047             i_e = len;
10048         }
10049         else {
10050             i_e = len - 2;
10051         }
10052     }
10053
10054     /* We have dealt with appending, now see about prepending.  If the new
10055      * range starts lower than the current lowest ... */
10056     if (start < array[0]) {
10057
10058         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10059          * Let the union code handle it, rather than having to know the
10060          * trickiness in two code places.  */
10061         if (UNLIKELY(start == 0)) {
10062             SV* range_invlist;
10063
10064             range_invlist = _new_invlist(2);
10065             _append_range_to_invlist(range_invlist, start, end);
10066
10067             _invlist_union(invlist, range_invlist, &invlist);
10068
10069             SvREFCNT_dec_NN(range_invlist);
10070
10071             return invlist;
10072         }
10073
10074         /* If the whole new range comes before the first entry, and doesn't
10075          * extend it, we have to insert it as an additional range */
10076         if (end < array[0] - 1) {
10077             i_s = i_e = -1;
10078             goto splice_in_new_range;
10079         }
10080
10081         /* Here the new range adjoins the existing first range, extending it
10082          * downwards. */
10083         array[0] = start;
10084
10085         /* And continue on below to handle the rest.  We know that the index of
10086          * the beginning of the range is the first one of the array */
10087         i_s = 0;
10088     }
10089     else { /* Not prepending any part of the new range to the existing list.
10090             * Find where in the list it should go.  This finds i_s, such that:
10091             *     invlist[i_s] <= start < array[i_s+1]
10092             */
10093         i_s = _invlist_search(invlist, start);
10094     }
10095
10096     /* At this point, any extending before the beginning of the inversion list
10097      * and/or after the end has been done.  This has made it so that, in the
10098      * code below, each endpoint of the new range is either in a range that is
10099      * in the set, or is in a gap between two ranges that are.  This means we
10100      * don't have to worry about exceeding the array bounds.
10101      *
10102      * Find where in the list the new range ends (but we can skip this if we
10103      * have already determined what it is, or if it will be the same as i_s,
10104      * which we already have computed) */
10105     if (i_e == 0) {
10106         i_e = (start == end)
10107               ? i_s
10108               : _invlist_search(invlist, end);
10109     }
10110
10111     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
10112      * is a range that goes to infinity there is no element at invlist[i_e+1],
10113      * so only the first relation holds. */
10114
10115     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10116
10117         /* Here, the ranges on either side of the beginning of the new range
10118          * are in the set, and this range starts in the gap between them.
10119          *
10120          * The new range extends the range above it downwards if the new range
10121          * ends at or above that range's start */
10122         const bool extends_the_range_above = (   end == UV_MAX
10123                                               || end + 1 >= array[i_s+1]);
10124
10125         /* The new range extends the range below it upwards if it begins just
10126          * after where that range ends */
10127         if (start == array[i_s]) {
10128
10129             /* If the new range fills the entire gap between the other ranges,
10130              * they will get merged together.  Other ranges may also get
10131              * merged, depending on how many of them the new range spans.  In
10132              * the general case, we do the merge later, just once, after we
10133              * figure out how many to merge.  But in the case where the new
10134              * range exactly spans just this one gap (possibly extending into
10135              * the one above), we do the merge here, and an early exit.  This
10136              * is done here to avoid having to special case later. */
10137             if (i_e - i_s <= 1) {
10138
10139                 /* If i_e - i_s == 1, it means that the new range terminates
10140                  * within the range above, and hence 'extends_the_range_above'
10141                  * must be true.  (If the range above it extends to infinity,
10142                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10143                  * will be 0, so no harm done.) */
10144                 if (extends_the_range_above) {
10145                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10146                     invlist_set_len(invlist,
10147                                     len - 2,
10148                                     *(get_invlist_offset_addr(invlist)));
10149                     return invlist;
10150                 }
10151
10152                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
10153                  * to the same range, and below we are about to decrement i_s
10154                  * */
10155                 i_e--;
10156             }
10157
10158             /* Here, the new range is adjacent to the one below.  (It may also
10159              * span beyond the range above, but that will get resolved later.)
10160              * Extend the range below to include this one. */
10161             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10162             i_s--;
10163             start = array[i_s];
10164         }
10165         else if (extends_the_range_above) {
10166
10167             /* Here the new range only extends the range above it, but not the
10168              * one below.  It merges with the one above.  Again, we keep i_e
10169              * and i_s in sync if they point to the same range */
10170             if (i_e == i_s) {
10171                 i_e++;
10172             }
10173             i_s++;
10174             array[i_s] = start;
10175         }
10176     }
10177
10178     /* Here, we've dealt with the new range start extending any adjoining
10179      * existing ranges.
10180      *
10181      * If the new range extends to infinity, it is now the final one,
10182      * regardless of what was there before */
10183     if (UNLIKELY(end == UV_MAX)) {
10184         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10185         return invlist;
10186     }
10187
10188     /* If i_e started as == i_s, it has also been dealt with,
10189      * and been updated to the new i_s, which will fail the following if */
10190     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10191
10192         /* Here, the ranges on either side of the end of the new range are in
10193          * the set, and this range ends in the gap between them.
10194          *
10195          * If this range is adjacent to (hence extends) the range above it, it
10196          * becomes part of that range; likewise if it extends the range below,
10197          * it becomes part of that range */
10198         if (end + 1 == array[i_e+1]) {
10199             i_e++;
10200             array[i_e] = start;
10201         }
10202         else if (start <= array[i_e]) {
10203             array[i_e] = end + 1;
10204             i_e--;
10205         }
10206     }
10207
10208     if (i_s == i_e) {
10209
10210         /* If the range fits entirely in an existing range (as possibly already
10211          * extended above), it doesn't add anything new */
10212         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10213             return invlist;
10214         }
10215
10216         /* Here, no part of the range is in the list.  Must add it.  It will
10217          * occupy 2 more slots */
10218       splice_in_new_range:
10219
10220         invlist_extend(invlist, len + 2);
10221         array = invlist_array(invlist);
10222         /* Move the rest of the array down two slots. Don't include any
10223          * trailing NUL */
10224         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10225
10226         /* Do the actual splice */
10227         array[i_e+1] = start;
10228         array[i_e+2] = end + 1;
10229         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10230         return invlist;
10231     }
10232
10233     /* Here the new range crossed the boundaries of a pre-existing range.  The
10234      * code above has adjusted things so that both ends are in ranges that are
10235      * in the set.  This means everything in between must also be in the set.
10236      * Just squash things together */
10237     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10238     invlist_set_len(invlist,
10239                     len - i_e + i_s,
10240                     *(get_invlist_offset_addr(invlist)));
10241
10242     return invlist;
10243 }
10244
10245 SV*
10246 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10247                                  UV** other_elements_ptr)
10248 {
10249     /* Create and return an inversion list whose contents are to be populated
10250      * by the caller.  The caller gives the number of elements (in 'size') and
10251      * the very first element ('element0').  This function will set
10252      * '*other_elements_ptr' to an array of UVs, where the remaining elements
10253      * are to be placed.
10254      *
10255      * Obviously there is some trust involved that the caller will properly
10256      * fill in the other elements of the array.
10257      *
10258      * (The first element needs to be passed in, as the underlying code does
10259      * things differently depending on whether it is zero or non-zero) */
10260
10261     SV* invlist = _new_invlist(size);
10262     bool offset;
10263
10264     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10265
10266     invlist = add_cp_to_invlist(invlist, element0);
10267     offset = *get_invlist_offset_addr(invlist);
10268
10269     invlist_set_len(invlist, size, offset);
10270     *other_elements_ptr = invlist_array(invlist) + 1;
10271     return invlist;
10272 }
10273
10274 #endif
10275
10276 PERL_STATIC_INLINE SV*
10277 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
10278     return _add_range_to_invlist(invlist, cp, cp);
10279 }
10280
10281 #ifndef PERL_IN_XSUB_RE
10282 void
10283 Perl__invlist_invert(pTHX_ SV* const invlist)
10284 {
10285     /* Complement the input inversion list.  This adds a 0 if the list didn't
10286      * have a zero; removes it otherwise.  As described above, the data
10287      * structure is set up so that this is very efficient */
10288
10289     PERL_ARGS_ASSERT__INVLIST_INVERT;
10290
10291     assert(! invlist_is_iterating(invlist));
10292
10293     /* The inverse of matching nothing is matching everything */
10294     if (_invlist_len(invlist) == 0) {
10295         _append_range_to_invlist(invlist, 0, UV_MAX);
10296         return;
10297     }
10298
10299     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10300 }
10301
10302 SV*
10303 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10304 {
10305
10306     /* Return a new inversion list that is a copy of the input one, which is
10307      * unchanged.  The new list will not be mortal even if the old one was. */
10308
10309     const STRLEN nominal_length = _invlist_len(invlist);    /* Why not +1 XXX */
10310     const STRLEN physical_length = SvCUR(invlist);
10311     const bool offset = *(get_invlist_offset_addr(invlist));
10312
10313     PERL_ARGS_ASSERT_INVLIST_CLONE;
10314
10315     /* Need to allocate extra space to accommodate Perl's addition of a
10316      * trailing NUL to SvPV's, since it thinks they are always strings */
10317     if (new_invlist == NULL) {
10318         new_invlist = _new_invlist(nominal_length);
10319     }
10320     else {
10321         sv_upgrade(new_invlist, SVt_INVLIST);
10322         initialize_invlist_guts(new_invlist, nominal_length);
10323     }
10324
10325     *(get_invlist_offset_addr(new_invlist)) = offset;
10326     invlist_set_len(new_invlist, nominal_length, offset);
10327     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10328
10329     return new_invlist;
10330 }
10331
10332 #endif
10333
10334 PERL_STATIC_INLINE STRLEN*
10335 S_get_invlist_iter_addr(SV* invlist)
10336 {
10337     /* Return the address of the UV that contains the current iteration
10338      * position */
10339
10340     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
10341
10342     assert(is_invlist(invlist));
10343
10344     return &(((XINVLIST*) SvANY(invlist))->iterator);
10345 }
10346
10347 PERL_STATIC_INLINE void
10348 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
10349 {
10350     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
10351
10352     *get_invlist_iter_addr(invlist) = 0;
10353 }
10354
10355 PERL_STATIC_INLINE void
10356 S_invlist_iterfinish(SV* invlist)
10357 {
10358     /* Terminate iterator for invlist.  This is to catch development errors.
10359      * Any iteration that is interrupted before completed should call this
10360      * function.  Functions that add code points anywhere else but to the end
10361      * of an inversion list assert that they are not in the middle of an
10362      * iteration.  If they were, the addition would make the iteration
10363      * problematical: if the iteration hadn't reached the place where things
10364      * were being added, it would be ok */
10365
10366     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
10367
10368     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
10369 }
10370
10371 STATIC bool
10372 S_invlist_iternext(SV* invlist, UV* start, UV* end)
10373 {
10374     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
10375      * This call sets in <*start> and <*end>, the next range in <invlist>.
10376      * Returns <TRUE> if successful and the next call will return the next
10377      * range; <FALSE> if was already at the end of the list.  If the latter,
10378      * <*start> and <*end> are unchanged, and the next call to this function
10379      * will start over at the beginning of the list */
10380
10381     STRLEN* pos = get_invlist_iter_addr(invlist);
10382     UV len = _invlist_len(invlist);
10383     UV *array;
10384
10385     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
10386
10387     if (*pos >= len) {
10388         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
10389         return FALSE;
10390     }
10391
10392     array = invlist_array(invlist);
10393
10394     *start = array[(*pos)++];
10395
10396     if (*pos >= len) {
10397         *end = UV_MAX;
10398     }
10399     else {
10400         *end = array[(*pos)++] - 1;
10401     }
10402
10403     return TRUE;
10404 }
10405
10406 PERL_STATIC_INLINE UV
10407 S_invlist_highest(SV* const invlist)
10408 {
10409     /* Returns the highest code point that matches an inversion list.  This API
10410      * has an ambiguity, as it returns 0 under either the highest is actually
10411      * 0, or if the list is empty.  If this distinction matters to you, check
10412      * for emptiness before calling this function */
10413
10414     UV len = _invlist_len(invlist);
10415     UV *array;
10416
10417     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
10418
10419     if (len == 0) {
10420         return 0;
10421     }
10422
10423     array = invlist_array(invlist);
10424
10425     /* The last element in the array in the inversion list always starts a
10426      * range that goes to infinity.  That range may be for code points that are
10427      * matched in the inversion list, or it may be for ones that aren't
10428      * matched.  In the latter case, the highest code point in the set is one
10429      * less than the beginning of this range; otherwise it is the final element
10430      * of this range: infinity */
10431     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
10432            ? UV_MAX
10433            : array[len - 1] - 1;
10434 }
10435
10436 STATIC SV *
10437 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10438 {
10439     /* Get the contents of an inversion list into a string SV so that they can
10440      * be printed out.  If 'traditional_style' is TRUE, it uses the format
10441      * traditionally done for debug tracing; otherwise it uses a format
10442      * suitable for just copying to the output, with blanks between ranges and
10443      * a dash between range components */
10444
10445     UV start, end;
10446     SV* output;
10447     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10448     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10449
10450     if (traditional_style) {
10451         output = newSVpvs("\n");
10452     }
10453     else {
10454         output = newSVpvs("");
10455     }
10456
10457     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10458
10459     assert(! invlist_is_iterating(invlist));
10460
10461     invlist_iterinit(invlist);
10462     while (invlist_iternext(invlist, &start, &end)) {
10463         if (end == UV_MAX) {
10464             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10465                                           start, intra_range_delimiter,
10466                                                  inter_range_delimiter);
10467         }
10468         else if (end != start) {
10469             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10470                                           start,
10471                                                    intra_range_delimiter,
10472                                                   end, inter_range_delimiter);
10473         }
10474         else {
10475             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10476                                           start, inter_range_delimiter);
10477         }
10478     }
10479
10480     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10481         SvCUR_set(output, SvCUR(output) - 1);
10482     }
10483
10484     return output;
10485 }
10486
10487 #ifndef PERL_IN_XSUB_RE
10488 void
10489 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10490                          const char * const indent, SV* const invlist)
10491 {
10492     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10493      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10494      * the string 'indent'.  The output looks like this:
10495          [0] 0x000A .. 0x000D
10496          [2] 0x0085
10497          [4] 0x2028 .. 0x2029
10498          [6] 0x3104 .. INFTY
10499      * This means that the first range of code points matched by the list are
10500      * 0xA through 0xD; the second range contains only the single code point
10501      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10502      * are used to define each range (except if the final range extends to
10503      * infinity, only a single element is needed).  The array index of the
10504      * first element for the corresponding range is given in brackets. */
10505
10506     UV start, end;
10507     STRLEN count = 0;
10508
10509     PERL_ARGS_ASSERT__INVLIST_DUMP;
10510
10511     if (invlist_is_iterating(invlist)) {
10512         Perl_dump_indent(aTHX_ level, file,
10513              "%sCan't dump inversion list because is in middle of iterating\n",
10514              indent);
10515         return;
10516     }
10517
10518     invlist_iterinit(invlist);
10519     while (invlist_iternext(invlist, &start, &end)) {
10520         if (end == UV_MAX) {
10521             Perl_dump_indent(aTHX_ level, file,
10522                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10523                                    indent, (UV)count, start);
10524         }
10525         else if (end != start) {
10526             Perl_dump_indent(aTHX_ level, file,
10527                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10528                                 indent, (UV)count, start,         end);
10529         }
10530         else {
10531             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10532                                             indent, (UV)count, start);
10533         }
10534         count += 2;
10535     }
10536 }
10537
10538 #endif
10539
10540 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10541 bool
10542 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10543 {
10544     /* Return a boolean as to if the two passed in inversion lists are
10545      * identical.  The final argument, if TRUE, says to take the complement of
10546      * the second inversion list before doing the comparison */
10547
10548     const UV len_a = _invlist_len(a);
10549     UV len_b = _invlist_len(b);
10550
10551     const UV* array_a = NULL;
10552     const UV* array_b = NULL;
10553
10554     PERL_ARGS_ASSERT__INVLISTEQ;
10555
10556     /* This code avoids accessing the arrays unless it knows the length is
10557      * non-zero */
10558
10559     if (len_a == 0) {
10560         if (len_b == 0) {
10561             return ! complement_b;
10562         }
10563     }
10564     else {
10565         array_a = invlist_array(a);
10566     }
10567
10568     if (len_b != 0) {
10569         array_b = invlist_array(b);
10570     }
10571
10572     /* If are to compare 'a' with the complement of b, set it
10573      * up so are looking at b's complement. */
10574     if (complement_b) {
10575
10576         /* The complement of nothing is everything, so <a> would have to have
10577          * just one element, starting at zero (ending at infinity) */
10578         if (len_b == 0) {
10579             return (len_a == 1 && array_a[0] == 0);
10580         }
10581         if (array_b[0] == 0) {
10582
10583             /* Otherwise, to complement, we invert.  Here, the first element is
10584              * 0, just remove it.  To do this, we just pretend the array starts
10585              * one later */
10586
10587             array_b++;
10588             len_b--;
10589         }
10590         else {
10591
10592             /* But if the first element is not zero, we pretend the list starts
10593              * at the 0 that is always stored immediately before the array. */
10594             array_b--;
10595             len_b++;
10596         }
10597     }
10598
10599     return    len_a == len_b
10600            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10601
10602 }
10603 #endif
10604
10605 /*
10606  * As best we can, determine the characters that can match the start of
10607  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
10608  * can be false positive matches
10609  *
10610  * Returns the invlist as a new SV*; it is the caller's responsibility to
10611  * call SvREFCNT_dec() when done with it.
10612  */
10613 STATIC SV*
10614 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10615 {
10616     const U8 * s = (U8*)STRING(node);
10617     SSize_t bytelen = STR_LEN(node);
10618     UV uc;
10619     /* Start out big enough for 2 separate code points */
10620     SV* invlist = _new_invlist(4);
10621
10622     PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
10623
10624     if (! UTF) {
10625         uc = *s;
10626
10627         /* We punt and assume can match anything if the node begins
10628          * with a multi-character fold.  Things are complicated.  For
10629          * example, /ffi/i could match any of:
10630          *  "\N{LATIN SMALL LIGATURE FFI}"
10631          *  "\N{LATIN SMALL LIGATURE FF}I"
10632          *  "F\N{LATIN SMALL LIGATURE FI}"
10633          *  plus several other things; and making sure we have all the
10634          *  possibilities is hard. */
10635         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10636             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10637         }
10638         else {
10639             /* Any Latin1 range character can potentially match any
10640              * other depending on the locale */
10641             if (OP(node) == EXACTFL) {
10642                 _invlist_union(invlist, PL_Latin1, &invlist);
10643             }
10644             else {
10645                 /* But otherwise, it matches at least itself.  We can
10646                  * quickly tell if it has a distinct fold, and if so,
10647                  * it matches that as well */
10648                 invlist = add_cp_to_invlist(invlist, uc);
10649                 if (IS_IN_SOME_FOLD_L1(uc))
10650                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10651             }
10652
10653             /* Some characters match above-Latin1 ones under /i.  This
10654              * is true of EXACTFL ones when the locale is UTF-8 */
10655             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10656                 && (! isASCII(uc) || (OP(node) != EXACTFAA
10657                                     && OP(node) != EXACTFAA_NO_TRIE)))
10658             {
10659                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10660             }
10661         }
10662     }
10663     else {  /* Pattern is UTF-8 */
10664         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10665         const U8* e = s + bytelen;
10666         IV fc;
10667
10668         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10669
10670         /* The only code points that aren't folded in a UTF EXACTFish
10671          * node are are the problematic ones in EXACTFL nodes */
10672         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10673             /* We need to check for the possibility that this EXACTFL
10674              * node begins with a multi-char fold.  Therefore we fold
10675              * the first few characters of it so that we can make that
10676              * check */
10677             U8 *d = folded;
10678             int i;
10679
10680             fc = -1;
10681             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10682                 if (isASCII(*s)) {
10683                     *(d++) = (U8) toFOLD(*s);
10684                     if (fc < 0) {       /* Save the first fold */
10685                         fc = *(d-1);
10686                     }
10687                     s++;
10688                 }
10689                 else {
10690                     STRLEN len;
10691                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
10692                     if (fc < 0) {       /* Save the first fold */
10693                         fc = fold;
10694                     }
10695                     d += len;
10696                     s += UTF8SKIP(s);
10697                 }
10698             }
10699
10700             /* And set up so the code below that looks in this folded
10701              * buffer instead of the node's string */
10702             e = d;
10703             s = folded;
10704         }
10705
10706         /* When we reach here 's' points to the fold of the first
10707          * character(s) of the node; and 'e' points to far enough along
10708          * the folded string to be just past any possible multi-char
10709          * fold.
10710          *
10711          * Unlike the non-UTF-8 case, the macro for determining if a
10712          * string is a multi-char fold requires all the characters to
10713          * already be folded.  This is because of all the complications
10714          * if not.  Note that they are folded anyway, except in EXACTFL
10715          * nodes.  Like the non-UTF case above, we punt if the node
10716          * begins with a multi-char fold  */
10717
10718         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10719             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10720         }
10721         else {  /* Single char fold */
10722             unsigned int k;
10723             unsigned int first_fold;
10724             const unsigned int * remaining_folds;
10725             Size_t folds_count;
10726
10727             /* It matches itself */
10728             invlist = add_cp_to_invlist(invlist, fc);
10729
10730             /* ... plus all the things that fold to it, which are found in
10731              * PL_utf8_foldclosures */
10732             folds_count = _inverse_folds(fc, &first_fold,
10733                                                 &remaining_folds);
10734             for (k = 0; k < folds_count; k++) {
10735                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10736
10737                 /* /aa doesn't allow folds between ASCII and non- */
10738                 if (   (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10739                     && isASCII(c) != isASCII(fc))
10740                 {
10741                     continue;
10742                 }
10743
10744                 invlist = add_cp_to_invlist(invlist, c);
10745             }
10746         }
10747     }
10748
10749     return invlist;
10750 }
10751
10752 #undef HEADER_LENGTH
10753 #undef TO_INTERNAL_SIZE
10754 #undef FROM_INTERNAL_SIZE
10755 #undef INVLIST_VERSION_ID
10756
10757 /* End of inversion list object */
10758
10759 STATIC void
10760 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10761 {
10762     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10763      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10764      * should point to the first flag; it is updated on output to point to the
10765      * final ')' or ':'.  There needs to be at least one flag, or this will
10766      * abort */
10767
10768     /* for (?g), (?gc), and (?o) warnings; warning
10769        about (?c) will warn about (?g) -- japhy    */
10770
10771 #define WASTED_O  0x01
10772 #define WASTED_G  0x02
10773 #define WASTED_C  0x04
10774 #define WASTED_GC (WASTED_G|WASTED_C)
10775     I32 wastedflags = 0x00;
10776     U32 posflags = 0, negflags = 0;
10777     U32 *flagsp = &posflags;
10778     char has_charset_modifier = '\0';
10779     regex_charset cs;
10780     bool has_use_defaults = FALSE;
10781     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10782     int x_mod_count = 0;
10783
10784     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10785
10786     /* '^' as an initial flag sets certain defaults */
10787     if (UCHARAT(RExC_parse) == '^') {
10788         RExC_parse++;
10789         has_use_defaults = TRUE;
10790         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10791         cs = (RExC_uni_semantics)
10792              ? REGEX_UNICODE_CHARSET
10793              : REGEX_DEPENDS_CHARSET;
10794         set_regex_charset(&RExC_flags, cs);
10795     }
10796     else {
10797         cs = get_regex_charset(RExC_flags);
10798         if (   cs == REGEX_DEPENDS_CHARSET
10799             && RExC_uni_semantics)
10800         {
10801             cs = REGEX_UNICODE_CHARSET;
10802         }
10803     }
10804
10805     while (RExC_parse < RExC_end) {
10806         /* && strchr("iogcmsx", *RExC_parse) */
10807         /* (?g), (?gc) and (?o) are useless here
10808            and must be globally applied -- japhy */
10809         switch (*RExC_parse) {
10810
10811             /* Code for the imsxn flags */
10812             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10813
10814             case LOCALE_PAT_MOD:
10815                 if (has_charset_modifier) {
10816                     goto excess_modifier;
10817                 }
10818                 else if (flagsp == &negflags) {
10819                     goto neg_modifier;
10820                 }
10821                 cs = REGEX_LOCALE_CHARSET;
10822                 has_charset_modifier = LOCALE_PAT_MOD;
10823                 break;
10824             case UNICODE_PAT_MOD:
10825                 if (has_charset_modifier) {
10826                     goto excess_modifier;
10827                 }
10828                 else if (flagsp == &negflags) {
10829                     goto neg_modifier;
10830                 }
10831                 cs = REGEX_UNICODE_CHARSET;
10832                 has_charset_modifier = UNICODE_PAT_MOD;
10833                 break;
10834             case ASCII_RESTRICT_PAT_MOD:
10835                 if (flagsp == &negflags) {
10836                     goto neg_modifier;
10837                 }
10838                 if (has_charset_modifier) {
10839                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10840                         goto excess_modifier;
10841                     }
10842                     /* Doubled modifier implies more restricted */
10843                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10844                 }
10845                 else {
10846                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10847                 }
10848                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10849                 break;
10850             case DEPENDS_PAT_MOD:
10851                 if (has_use_defaults) {
10852                     goto fail_modifiers;
10853                 }
10854                 else if (flagsp == &negflags) {
10855                     goto neg_modifier;
10856                 }
10857                 else if (has_charset_modifier) {
10858                     goto excess_modifier;
10859                 }
10860
10861                 /* The dual charset means unicode semantics if the
10862                  * pattern (or target, not known until runtime) are
10863                  * utf8, or something in the pattern indicates unicode
10864                  * semantics */
10865                 cs = (RExC_uni_semantics)
10866                      ? REGEX_UNICODE_CHARSET
10867                      : REGEX_DEPENDS_CHARSET;
10868                 has_charset_modifier = DEPENDS_PAT_MOD;
10869                 break;
10870               excess_modifier:
10871                 RExC_parse++;
10872                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10873                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10874                 }
10875                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10876                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10877                                         *(RExC_parse - 1));
10878                 }
10879                 else {
10880                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10881                 }
10882                 NOT_REACHED; /*NOTREACHED*/
10883               neg_modifier:
10884                 RExC_parse++;
10885                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10886                                     *(RExC_parse - 1));
10887                 NOT_REACHED; /*NOTREACHED*/
10888             case ONCE_PAT_MOD: /* 'o' */
10889             case GLOBAL_PAT_MOD: /* 'g' */
10890                 if (ckWARN(WARN_REGEXP)) {
10891                     const I32 wflagbit = *RExC_parse == 'o'
10892                                          ? WASTED_O
10893                                          : WASTED_G;
10894                     if (! (wastedflags & wflagbit) ) {
10895                         wastedflags |= wflagbit;
10896                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10897                         vWARN5(
10898                             RExC_parse + 1,
10899                             "Useless (%s%c) - %suse /%c modifier",
10900                             flagsp == &negflags ? "?-" : "?",
10901                             *RExC_parse,
10902                             flagsp == &negflags ? "don't " : "",
10903                             *RExC_parse
10904                         );
10905                     }
10906                 }
10907                 break;
10908
10909             case CONTINUE_PAT_MOD: /* 'c' */
10910                 if (ckWARN(WARN_REGEXP)) {
10911                     if (! (wastedflags & WASTED_C) ) {
10912                         wastedflags |= WASTED_GC;
10913                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10914                         vWARN3(
10915                             RExC_parse + 1,
10916                             "Useless (%sc) - %suse /gc modifier",
10917                             flagsp == &negflags ? "?-" : "?",
10918                             flagsp == &negflags ? "don't " : ""
10919                         );
10920                     }
10921                 }
10922                 break;
10923             case KEEPCOPY_PAT_MOD: /* 'p' */
10924                 if (flagsp == &negflags) {
10925                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10926                 } else {
10927                     *flagsp |= RXf_PMf_KEEPCOPY;
10928                 }
10929                 break;
10930             case '-':
10931                 /* A flag is a default iff it is following a minus, so
10932                  * if there is a minus, it means will be trying to
10933                  * re-specify a default which is an error */
10934                 if (has_use_defaults || flagsp == &negflags) {
10935                     goto fail_modifiers;
10936                 }
10937                 flagsp = &negflags;
10938                 wastedflags = 0;  /* reset so (?g-c) warns twice */
10939                 x_mod_count = 0;
10940                 break;
10941             case ':':
10942             case ')':
10943
10944                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10945                     negflags |= RXf_PMf_EXTENDED_MORE;
10946                 }
10947                 RExC_flags |= posflags;
10948
10949                 if (negflags & RXf_PMf_EXTENDED) {
10950                     negflags |= RXf_PMf_EXTENDED_MORE;
10951                 }
10952                 RExC_flags &= ~negflags;
10953                 set_regex_charset(&RExC_flags, cs);
10954
10955                 return;
10956             default:
10957               fail_modifiers:
10958                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10959                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10960                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10961                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10962                 NOT_REACHED; /*NOTREACHED*/
10963         }
10964
10965         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10966     }
10967
10968     vFAIL("Sequence (?... not terminated");
10969 }
10970
10971 /*
10972  - reg - regular expression, i.e. main body or parenthesized thing
10973  *
10974  * Caller must absorb opening parenthesis.
10975  *
10976  * Combining parenthesis handling with the base level of regular expression
10977  * is a trifle forced, but the need to tie the tails of the branches to what
10978  * follows makes it hard to avoid.
10979  */
10980 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10981 #ifdef DEBUGGING
10982 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10983 #else
10984 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10985 #endif
10986
10987 PERL_STATIC_INLINE regnode_offset
10988 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10989                              I32 *flagp,
10990                              char * parse_start,
10991                              char ch
10992                       )
10993 {
10994     regnode_offset ret;
10995     char* name_start = RExC_parse;
10996     U32 num = 0;
10997     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
10998     GET_RE_DEBUG_FLAGS_DECL;
10999
11000     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11001
11002     if (RExC_parse == name_start || *RExC_parse != ch) {
11003         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11004         vFAIL2("Sequence %.3s... not terminated", parse_start);
11005     }
11006
11007     if (sv_dat) {
11008         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11009         RExC_rxi->data->data[num]=(void*)sv_dat;
11010         SvREFCNT_inc_simple_void_NN(sv_dat);
11011     }
11012     RExC_sawback = 1;
11013     ret = reganode(pRExC_state,
11014                    ((! FOLD)
11015                      ? NREF
11016                      : (ASCII_FOLD_RESTRICTED)
11017                        ? NREFFA
11018                        : (AT_LEAST_UNI_SEMANTICS)
11019                          ? NREFFU
11020                          : (LOC)
11021                            ? NREFFL
11022                            : NREFF),
11023                     num);
11024     *flagp |= HASWIDTH;
11025
11026     Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11027     Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11028
11029     nextchar(pRExC_state);
11030     return ret;
11031 }
11032
11033 /* On success, returns the offset at which any next node should be placed into
11034  * the regex engine program being compiled.
11035  *
11036  * Returns 0 otherwise, with *flagp set to indicate why:
11037  *  TRYAGAIN        at the end of (?) that only sets flags.
11038  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
11039  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11040  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
11041  *  happen.  */
11042 STATIC regnode_offset
11043 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11044     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11045      * 2 is like 1, but indicates that nextchar() has been called to advance
11046      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
11047      * this flag alerts us to the need to check for that */
11048 {
11049     regnode_offset ret = 0;    /* Will be the head of the group. */
11050     regnode_offset br;
11051     regnode_offset lastbr;
11052     regnode_offset ender = 0;
11053     I32 parno = 0;
11054     I32 flags;
11055     U32 oregflags = RExC_flags;
11056     bool have_branch = 0;
11057     bool is_open = 0;
11058     I32 freeze_paren = 0;
11059     I32 after_freeze = 0;
11060     I32 num; /* numeric backreferences */
11061
11062     char * parse_start = RExC_parse; /* MJD */
11063     char * const oregcomp_parse = RExC_parse;
11064
11065     GET_RE_DEBUG_FLAGS_DECL;
11066
11067     PERL_ARGS_ASSERT_REG;
11068     DEBUG_PARSE("reg ");
11069
11070     *flagp = 0;                         /* Tentatively. */
11071
11072     /* Having this true makes it feasible to have a lot fewer tests for the
11073      * parse pointer being in scope.  For example, we can write
11074      *      while(isFOO(*RExC_parse)) RExC_parse++;
11075      * instead of
11076      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11077      */
11078     assert(*RExC_end == '\0');
11079
11080     /* Make an OPEN node, if parenthesized. */
11081     if (paren) {
11082
11083         /* Under /x, space and comments can be gobbled up between the '(' and
11084          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
11085          * intervening space, as the sequence is a token, and a token should be
11086          * indivisible */
11087         bool has_intervening_patws = (paren == 2)
11088                                   && *(RExC_parse - 1) != '(';
11089
11090         if (RExC_parse >= RExC_end) {
11091             vFAIL("Unmatched (");
11092         }
11093
11094         if (paren == 'r') {     /* Atomic script run */
11095             paren = '>';
11096             goto parse_rest;
11097         }
11098         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11099             char *start_verb = RExC_parse + 1;
11100             STRLEN verb_len;
11101             char *start_arg = NULL;
11102             unsigned char op = 0;
11103             int arg_required = 0;
11104             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11105             bool has_upper = FALSE;
11106
11107             if (has_intervening_patws) {
11108                 RExC_parse++;   /* past the '*' */
11109
11110                 /* For strict backwards compatibility, don't change the message
11111                  * now that we also have lowercase operands */
11112                 if (isUPPER(*RExC_parse)) {
11113                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11114                 }
11115                 else {
11116                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11117                 }
11118             }
11119             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11120                 if ( *RExC_parse == ':' ) {
11121                     start_arg = RExC_parse + 1;
11122                     break;
11123                 }
11124                 else if (! UTF) {
11125                     if (isUPPER(*RExC_parse)) {
11126                         has_upper = TRUE;
11127                     }
11128                     RExC_parse++;
11129                 }
11130                 else {
11131                     RExC_parse += UTF8SKIP(RExC_parse);
11132                 }
11133             }
11134             verb_len = RExC_parse - start_verb;
11135             if ( start_arg ) {
11136                 if (RExC_parse >= RExC_end) {
11137                     goto unterminated_verb_pattern;
11138                 }
11139
11140                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11141                 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11142                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11143                 }
11144                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11145                   unterminated_verb_pattern:
11146                     if (has_upper) {
11147                         vFAIL("Unterminated verb pattern argument");
11148                     }
11149                     else {
11150                         vFAIL("Unterminated '(*...' argument");
11151                     }
11152                 }
11153             } else {
11154                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11155                     if (has_upper) {
11156                         vFAIL("Unterminated verb pattern");
11157                     }
11158                     else {
11159                         vFAIL("Unterminated '(*...' construct");
11160                     }
11161                 }
11162             }
11163
11164             /* Here, we know that RExC_parse < RExC_end */
11165
11166             switch ( *start_verb ) {
11167             case 'A':  /* (*ACCEPT) */
11168                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11169                     op = ACCEPT;
11170                     internal_argval = RExC_nestroot;
11171                 }
11172                 break;
11173             case 'C':  /* (*COMMIT) */
11174                 if ( memEQs(start_verb, verb_len,"COMMIT") )
11175                     op = COMMIT;
11176                 break;
11177             case 'F':  /* (*FAIL) */
11178                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11179                     op = OPFAIL;
11180                 }
11181                 break;
11182             case ':':  /* (*:NAME) */
11183             case 'M':  /* (*MARK:NAME) */
11184                 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11185                     op = MARKPOINT;
11186                     arg_required = 1;
11187                 }
11188                 break;
11189             case 'P':  /* (*PRUNE) */
11190                 if ( memEQs(start_verb, verb_len,"PRUNE") )
11191                     op = PRUNE;
11192                 break;
11193             case 'S':   /* (*SKIP) */
11194                 if ( memEQs(start_verb, verb_len,"SKIP") )
11195                     op = SKIP;
11196                 break;
11197             case 'T':  /* (*THEN) */
11198                 /* [19:06] <TimToady> :: is then */
11199                 if ( memEQs(start_verb, verb_len,"THEN") ) {
11200                     op = CUTGROUP;
11201                     RExC_seen |= REG_CUTGROUP_SEEN;
11202                 }
11203                 break;
11204             case 'a':
11205                 if (   memEQs(start_verb, verb_len, "asr")
11206                     || memEQs(start_verb, verb_len, "atomic_script_run"))
11207                 {
11208                     paren = 'r';        /* Mnemonic: recursed run */
11209                     goto script_run;
11210                 }
11211                 else if (memEQs(start_verb, verb_len, "atomic")) {
11212                     paren = 't';    /* AtOMIC */
11213                     goto alpha_assertions;
11214                 }
11215                 break;
11216             case 'p':
11217                 if (   memEQs(start_verb, verb_len, "plb")
11218                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
11219                 {
11220                     paren = 'b';
11221                     goto lookbehind_alpha_assertions;
11222                 }
11223                 else if (   memEQs(start_verb, verb_len, "pla")
11224                          || memEQs(start_verb, verb_len, "positive_lookahead"))
11225                 {
11226                     paren = 'a';
11227                     goto alpha_assertions;
11228                 }
11229                 break;
11230             case 'n':
11231                 if (   memEQs(start_verb, verb_len, "nlb")
11232                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
11233                 {
11234                     paren = 'B';
11235                     goto lookbehind_alpha_assertions;
11236                 }
11237                 else if (   memEQs(start_verb, verb_len, "nla")
11238                          || memEQs(start_verb, verb_len, "negative_lookahead"))
11239                 {
11240                     paren = 'A';
11241                     goto alpha_assertions;
11242                 }
11243                 break;
11244             case 's':
11245                 if (   memEQs(start_verb, verb_len, "sr")
11246                     || memEQs(start_verb, verb_len, "script_run"))
11247                 {
11248                     regnode_offset atomic;
11249
11250                     paren = 's';
11251
11252                    script_run:
11253
11254                     /* This indicates Unicode rules. */
11255                     REQUIRE_UNI_RULES(flagp, 0);
11256
11257                     if (! start_arg) {
11258                         goto no_colon;
11259                     }
11260
11261                     RExC_parse = start_arg;
11262
11263                     if (RExC_in_script_run) {
11264
11265                         /*  Nested script runs are treated as no-ops, because
11266                          *  if the nested one fails, the outer one must as
11267                          *  well.  It could fail sooner, and avoid (??{} with
11268                          *  side effects, but that is explicitly documented as
11269                          *  undefined behavior. */
11270
11271                         ret = 0;
11272
11273                         if (paren == 's') {
11274                             paren = ':';
11275                             goto parse_rest;
11276                         }
11277
11278                         /* But, the atomic part of a nested atomic script run
11279                          * isn't a no-op, but can be treated just like a '(?>'
11280                          * */
11281                         paren = '>';
11282                         goto parse_rest;
11283                     }
11284
11285                     /* By doing this here, we avoid extra warnings for nested
11286                      * script runs */
11287                     ckWARNexperimental(RExC_parse,
11288                         WARN_EXPERIMENTAL__SCRIPT_RUN,
11289                         "The script_run feature is experimental");
11290
11291                     if (paren == 's') {
11292                         /* Here, we're starting a new regular script run */
11293                         ret = reg_node(pRExC_state, SROPEN);
11294                         RExC_in_script_run = 1;
11295                         is_open = 1;
11296                         goto parse_rest;
11297                     }
11298
11299                     /* Here, we are starting an atomic script run.  This is
11300                      * handled by recursing to deal with the atomic portion
11301                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
11302
11303                     ret = reg_node(pRExC_state, SROPEN);
11304
11305                     RExC_in_script_run = 1;
11306
11307                     atomic = reg(pRExC_state, 'r', &flags, depth);
11308                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
11309                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11310                         return 0;
11311                     }
11312
11313                     REGTAIL(pRExC_state, ret, atomic);
11314
11315                     REGTAIL(pRExC_state, atomic,
11316                            reg_node(pRExC_state, SRCLOSE));
11317
11318                     RExC_in_script_run = 0;
11319                     return ret;
11320                 }
11321
11322                 break;
11323
11324             lookbehind_alpha_assertions:
11325                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11326                 RExC_in_lookbehind++;
11327                 /*FALLTHROUGH*/
11328
11329             alpha_assertions:
11330                 ckWARNexperimental(RExC_parse,
11331                         WARN_EXPERIMENTAL__ALPHA_ASSERTIONS,
11332                         "The alpha_assertions feature is experimental");
11333
11334                 RExC_seen_zerolen++;
11335
11336                 if (! start_arg) {
11337                     goto no_colon;
11338                 }
11339
11340                 /* An empty negative lookahead assertion simply is failure */
11341                 if (paren == 'A' && RExC_parse == start_arg) {
11342                     ret=reganode(pRExC_state, OPFAIL, 0);
11343                     nextchar(pRExC_state);
11344                     return ret;
11345                 }
11346
11347                 RExC_parse = start_arg;
11348                 goto parse_rest;
11349
11350               no_colon:
11351                 vFAIL2utf8f(
11352                 "'(*%" UTF8f "' requires a terminating ':'",
11353                 UTF8fARG(UTF, verb_len, start_verb));
11354                 NOT_REACHED; /*NOTREACHED*/
11355
11356             } /* End of switch */
11357             if ( ! op ) {
11358                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11359                 if (has_upper || verb_len == 0) {
11360                     vFAIL2utf8f(
11361                     "Unknown verb pattern '%" UTF8f "'",
11362                     UTF8fARG(UTF, verb_len, start_verb));
11363                 }
11364                 else {
11365                     vFAIL2utf8f(
11366                     "Unknown '(*...)' construct '%" UTF8f "'",
11367                     UTF8fARG(UTF, verb_len, start_verb));
11368                 }
11369             }
11370             if ( RExC_parse == start_arg ) {
11371                 start_arg = NULL;
11372             }
11373             if ( arg_required && !start_arg ) {
11374                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11375                     verb_len, start_verb);
11376             }
11377             if (internal_argval == -1) {
11378                 ret = reganode(pRExC_state, op, 0);
11379             } else {
11380                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11381             }
11382             RExC_seen |= REG_VERBARG_SEEN;
11383             if (start_arg) {
11384                 SV *sv = newSVpvn( start_arg,
11385                                     RExC_parse - start_arg);
11386                 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11387                                         STR_WITH_LEN("S"));
11388                 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11389                 FLAGS(REGNODE_p(ret)) = 1;
11390             } else {
11391                 FLAGS(REGNODE_p(ret)) = 0;
11392             }
11393             if ( internal_argval != -1 )
11394                 ARG2L_SET(REGNODE_p(ret), internal_argval);
11395             nextchar(pRExC_state);
11396             return ret;
11397         }
11398         else if (*RExC_parse == '?') { /* (?...) */
11399             bool is_logical = 0;
11400             const char * const seqstart = RExC_parse;
11401             const char * endptr;
11402             if (has_intervening_patws) {
11403                 RExC_parse++;
11404                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11405             }
11406
11407             RExC_parse++;           /* past the '?' */
11408             paren = *RExC_parse;    /* might be a trailing NUL, if not
11409                                        well-formed */
11410             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11411             if (RExC_parse > RExC_end) {
11412                 paren = '\0';
11413             }
11414             ret = 0;                    /* For look-ahead/behind. */
11415             switch (paren) {
11416
11417             case 'P':   /* (?P...) variants for those used to PCRE/Python */
11418                 paren = *RExC_parse;
11419                 if ( paren == '<') {    /* (?P<...>) named capture */
11420                     RExC_parse++;
11421                     if (RExC_parse >= RExC_end) {
11422                         vFAIL("Sequence (?P<... not terminated");
11423                     }
11424                     goto named_capture;
11425                 }
11426                 else if (paren == '>') {   /* (?P>name) named recursion */
11427                     RExC_parse++;
11428                     if (RExC_parse >= RExC_end) {
11429                         vFAIL("Sequence (?P>... not terminated");
11430                     }
11431                     goto named_recursion;
11432                 }
11433                 else if (paren == '=') {   /* (?P=...)  named backref */
11434                     RExC_parse++;
11435                     return handle_named_backref(pRExC_state, flagp,
11436                                                 parse_start, ')');
11437                 }
11438                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
11439                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11440                 vFAIL3("Sequence (%.*s...) not recognized",
11441                                 RExC_parse-seqstart, seqstart);
11442                 NOT_REACHED; /*NOTREACHED*/
11443             case '<':           /* (?<...) */
11444                 if (*RExC_parse == '!')
11445                     paren = ',';
11446                 else if (*RExC_parse != '=')
11447               named_capture:
11448                 {               /* (?<...>) */
11449                     char *name_start;
11450                     SV *svname;
11451                     paren= '>';
11452                 /* FALLTHROUGH */
11453             case '\'':          /* (?'...') */
11454                     name_start = RExC_parse;
11455                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11456                     if (   RExC_parse == name_start
11457                         || RExC_parse >= RExC_end
11458                         || *RExC_parse != paren)
11459                     {
11460                         vFAIL2("Sequence (?%c... not terminated",
11461                             paren=='>' ? '<' : paren);
11462                     }
11463                     {
11464                         HE *he_str;
11465                         SV *sv_dat = NULL;
11466                         if (!svname) /* shouldn't happen */
11467                             Perl_croak(aTHX_
11468                                 "panic: reg_scan_name returned NULL");
11469                         if (!RExC_paren_names) {
11470                             RExC_paren_names= newHV();
11471                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
11472 #ifdef DEBUGGING
11473                             RExC_paren_name_list= newAV();
11474                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11475 #endif
11476                         }
11477                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11478                         if ( he_str )
11479                             sv_dat = HeVAL(he_str);
11480                         if ( ! sv_dat ) {
11481                             /* croak baby croak */
11482                             Perl_croak(aTHX_
11483                                 "panic: paren_name hash element allocation failed");
11484                         } else if ( SvPOK(sv_dat) ) {
11485                             /* (?|...) can mean we have dupes so scan to check
11486                                its already been stored. Maybe a flag indicating
11487                                we are inside such a construct would be useful,
11488                                but the arrays are likely to be quite small, so
11489                                for now we punt -- dmq */
11490                             IV count = SvIV(sv_dat);
11491                             I32 *pv = (I32*)SvPVX(sv_dat);
11492                             IV i;
11493                             for ( i = 0 ; i < count ; i++ ) {
11494                                 if ( pv[i] == RExC_npar ) {
11495                                     count = 0;
11496                                     break;
11497                                 }
11498                             }
11499                             if ( count ) {
11500                                 pv = (I32*)SvGROW(sv_dat,
11501                                                 SvCUR(sv_dat) + sizeof(I32)+1);
11502                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11503                                 pv[count] = RExC_npar;
11504                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11505                             }
11506                         } else {
11507                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
11508                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
11509                                                                 sizeof(I32));
11510                             SvIOK_on(sv_dat);
11511                             SvIV_set(sv_dat, 1);
11512                         }
11513 #ifdef DEBUGGING
11514                         /* Yes this does cause a memory leak in debugging Perls
11515                          * */
11516                         if (!av_store(RExC_paren_name_list,
11517                                       RExC_npar, SvREFCNT_inc_NN(svname)))
11518                             SvREFCNT_dec_NN(svname);
11519 #endif
11520
11521                         /*sv_dump(sv_dat);*/
11522                     }
11523                     nextchar(pRExC_state);
11524                     paren = 1;
11525                     goto capturing_parens;
11526                 }
11527
11528                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11529                 RExC_in_lookbehind++;
11530                 RExC_parse++;
11531                 if (RExC_parse >= RExC_end) {
11532                     vFAIL("Sequence (?... not terminated");
11533                 }
11534
11535                 /* FALLTHROUGH */
11536             case '=':           /* (?=...) */
11537                 RExC_seen_zerolen++;
11538                 break;
11539             case '!':           /* (?!...) */
11540                 RExC_seen_zerolen++;
11541                 /* check if we're really just a "FAIL" assertion */
11542                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11543                                         FALSE /* Don't force to /x */ );
11544                 if (*RExC_parse == ')') {
11545                     ret=reganode(pRExC_state, OPFAIL, 0);
11546                     nextchar(pRExC_state);
11547                     return ret;
11548                 }
11549                 break;
11550             case '|':           /* (?|...) */
11551                 /* branch reset, behave like a (?:...) except that
11552                    buffers in alternations share the same numbers */
11553                 paren = ':';
11554                 after_freeze = freeze_paren = RExC_npar;
11555
11556                 /* XXX This construct currently requires an extra pass.
11557                  * Investigation would be required to see if that could be
11558                  * changed */
11559                 REQUIRE_PARENS_PASS;
11560                 break;
11561             case ':':           /* (?:...) */
11562             case '>':           /* (?>...) */
11563                 break;
11564             case '$':           /* (?$...) */
11565             case '@':           /* (?@...) */
11566                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11567                 break;
11568             case '0' :           /* (?0) */
11569             case 'R' :           /* (?R) */
11570                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11571                     FAIL("Sequence (?R) not terminated");
11572                 num = 0;
11573                 RExC_seen |= REG_RECURSE_SEEN;
11574
11575                 /* XXX These constructs currently require an extra pass.
11576                  * It probably could be changed */
11577                 REQUIRE_PARENS_PASS;
11578
11579                 *flagp |= POSTPONED;
11580                 goto gen_recurse_regop;
11581                 /*notreached*/
11582             /* named and numeric backreferences */
11583             case '&':            /* (?&NAME) */
11584                 parse_start = RExC_parse - 1;
11585               named_recursion:
11586                 {
11587                     SV *sv_dat = reg_scan_name(pRExC_state,
11588                                                REG_RSN_RETURN_DATA);
11589                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11590                 }
11591                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11592                     vFAIL("Sequence (?&... not terminated");
11593                 goto gen_recurse_regop;
11594                 /* NOTREACHED */
11595             case '+':
11596                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11597                     RExC_parse++;
11598                     vFAIL("Illegal pattern");
11599                 }
11600                 goto parse_recursion;
11601                 /* NOTREACHED*/
11602             case '-': /* (?-1) */
11603                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11604                     RExC_parse--; /* rewind to let it be handled later */
11605                     goto parse_flags;
11606                 }
11607                 /* FALLTHROUGH */
11608             case '1': case '2': case '3': case '4': /* (?1) */
11609             case '5': case '6': case '7': case '8': case '9':
11610                 RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11611               parse_recursion:
11612                 {
11613                     bool is_neg = FALSE;
11614                     UV unum;
11615                     parse_start = RExC_parse - 1; /* MJD */
11616                     if (*RExC_parse == '-') {
11617                         RExC_parse++;
11618                         is_neg = TRUE;
11619                     }
11620                     endptr = RExC_end;
11621                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11622                         && unum <= I32_MAX
11623                     ) {
11624                         num = (I32)unum;
11625                         RExC_parse = (char*)endptr;
11626                     } else
11627                         num = I32_MAX;
11628                     if (is_neg) {
11629                         /* Some limit for num? */
11630                         num = -num;
11631                     }
11632                 }
11633                 if (*RExC_parse!=')')
11634                     vFAIL("Expecting close bracket");
11635
11636               gen_recurse_regop:
11637                 if ( paren == '-' ) {
11638                     /*
11639                     Diagram of capture buffer numbering.
11640                     Top line is the normal capture buffer numbers
11641                     Bottom line is the negative indexing as from
11642                     the X (the (?-2))
11643
11644                     +   1 2    3 4 5 X          6 7
11645                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11646                     -   5 4    3 2 1 X          x x
11647
11648                     */
11649                     num = RExC_npar + num;
11650                     if (num < 1)  {
11651
11652                         /* It might be a forward reference; we can't fail until
11653                          * we know, by completing the parse to get all the
11654                          * groups, and then reparsing */
11655                         if (RExC_total_parens > 0)  {
11656                             RExC_parse++;
11657                             vFAIL("Reference to nonexistent group");
11658                         }
11659                         else {
11660                             REQUIRE_PARENS_PASS;
11661                         }
11662                     }
11663                 } else if ( paren == '+' ) {
11664                     num = RExC_npar + num - 1;
11665                 }
11666                 /* We keep track how many GOSUB items we have produced.
11667                    To start off the ARG2L() of the GOSUB holds its "id",
11668                    which is used later in conjunction with RExC_recurse
11669                    to calculate the offset we need to jump for the GOSUB,
11670                    which it will store in the final representation.
11671                    We have to defer the actual calculation until much later
11672                    as the regop may move.
11673                  */
11674
11675                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11676                 if (num >= RExC_npar) {
11677
11678                     /* It might be a forward reference; we can't fail until we
11679                      * know, by completing the parse to get all the groups, and
11680                      * then reparsing */
11681                     if (RExC_total_parens > 0)  {
11682                         if (num >= RExC_total_parens) {
11683                             RExC_parse++;
11684                             vFAIL("Reference to nonexistent group");
11685                         }
11686                     }
11687                     else {
11688                         REQUIRE_PARENS_PASS;
11689                     }
11690                 }
11691                 RExC_recurse_count++;
11692                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11693                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11694                             22, "|    |", (int)(depth * 2 + 1), "",
11695                             (UV)ARG(REGNODE_p(ret)),
11696                             (IV)ARG2L(REGNODE_p(ret))));
11697                 RExC_seen |= REG_RECURSE_SEEN;
11698
11699                 Set_Node_Length(REGNODE_p(ret),
11700                                 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11701                 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11702
11703                 *flagp |= POSTPONED;
11704                 assert(*RExC_parse == ')');
11705                 nextchar(pRExC_state);
11706                 return ret;
11707
11708             /* NOTREACHED */
11709
11710             case '?':           /* (??...) */
11711                 is_logical = 1;
11712                 if (*RExC_parse != '{') {
11713                     RExC_parse += SKIP_IF_CHAR(RExC_parse);
11714                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11715                     vFAIL2utf8f(
11716                         "Sequence (%" UTF8f "...) not recognized",
11717                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11718                     NOT_REACHED; /*NOTREACHED*/
11719                 }
11720                 *flagp |= POSTPONED;
11721                 paren = '{';
11722                 RExC_parse++;
11723                 /* FALLTHROUGH */
11724             case '{':           /* (?{...}) */
11725             {
11726                 U32 n = 0;
11727                 struct reg_code_block *cb;
11728                 OP * o;
11729
11730                 RExC_seen_zerolen++;
11731
11732                 if (   !pRExC_state->code_blocks
11733                     || pRExC_state->code_index
11734                                         >= pRExC_state->code_blocks->count
11735                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11736                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11737                             - RExC_start)
11738                 ) {
11739                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
11740                         FAIL("panic: Sequence (?{...}): no code block found\n");
11741                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
11742                 }
11743                 /* this is a pre-compiled code block (?{...}) */
11744                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11745                 RExC_parse = RExC_start + cb->end;
11746                 o = cb->block;
11747                 if (cb->src_regex) {
11748                     n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11749                     RExC_rxi->data->data[n] =
11750                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
11751                     RExC_rxi->data->data[n+1] = (void*)o;
11752                 }
11753                 else {
11754                     n = add_data(pRExC_state,
11755                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11756                     RExC_rxi->data->data[n] = (void*)o;
11757                 }
11758                 pRExC_state->code_index++;
11759                 nextchar(pRExC_state);
11760
11761                 if (is_logical) {
11762                     regnode_offset eval;
11763                     ret = reg_node(pRExC_state, LOGICAL);
11764
11765                     eval = reg2Lanode(pRExC_state, EVAL,
11766                                        n,
11767
11768                                        /* for later propagation into (??{})
11769                                         * return value */
11770                                        RExC_flags & RXf_PMf_COMPILETIME
11771                                       );
11772                     FLAGS(REGNODE_p(ret)) = 2;
11773                     REGTAIL(pRExC_state, ret, eval);
11774                     /* deal with the length of this later - MJD */
11775                     return ret;
11776                 }
11777                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11778                 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11779                 Set_Node_Offset(REGNODE_p(ret), parse_start);
11780                 return ret;
11781             }
11782             case '(':           /* (?(?{...})...) and (?(?=...)...) */
11783             {
11784                 int is_define= 0;
11785                 const int DEFINE_len = sizeof("DEFINE") - 1;
11786                 if (    RExC_parse < RExC_end - 1
11787                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
11788                             && (   RExC_parse[1] == '='
11789                                 || RExC_parse[1] == '!'
11790                                 || RExC_parse[1] == '<'
11791                                 || RExC_parse[1] == '{'))
11792                         || (       RExC_parse[0] == '*'        /* (?(*...)) */
11793                             && (   memBEGINs(RExC_parse + 1,
11794                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11795                                          "pla:")
11796                                 || memBEGINs(RExC_parse + 1,
11797                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11798                                          "plb:")
11799                                 || memBEGINs(RExC_parse + 1,
11800                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11801                                          "nla:")
11802                                 || memBEGINs(RExC_parse + 1,
11803                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11804                                          "nlb:")
11805                                 || memBEGINs(RExC_parse + 1,
11806                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11807                                          "positive_lookahead:")
11808                                 || memBEGINs(RExC_parse + 1,
11809                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11810                                          "positive_lookbehind:")
11811                                 || memBEGINs(RExC_parse + 1,
11812                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11813                                          "negative_lookahead:")
11814                                 || memBEGINs(RExC_parse + 1,
11815                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11816                                          "negative_lookbehind:"))))
11817                 ) { /* Lookahead or eval. */
11818                     I32 flag;
11819                     regnode_offset tail;
11820
11821                     ret = reg_node(pRExC_state, LOGICAL);
11822                     FLAGS(REGNODE_p(ret)) = 1;
11823
11824                     tail = reg(pRExC_state, 1, &flag, depth+1);
11825                     RETURN_FAIL_ON_RESTART(flag, flagp);
11826                     REGTAIL(pRExC_state, ret, tail);
11827                     goto insert_if;
11828                 }
11829                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11830                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11831                 {
11832                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
11833                     char *name_start= RExC_parse++;
11834                     U32 num = 0;
11835                     SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11836                     if (   RExC_parse == name_start
11837                         || RExC_parse >= RExC_end
11838                         || *RExC_parse != ch)
11839                     {
11840                         vFAIL2("Sequence (?(%c... not terminated",
11841                             (ch == '>' ? '<' : ch));
11842                     }
11843                     RExC_parse++;
11844                     if (sv_dat) {
11845                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11846                         RExC_rxi->data->data[num]=(void*)sv_dat;
11847                         SvREFCNT_inc_simple_void_NN(sv_dat);
11848                     }
11849                     ret = reganode(pRExC_state, NGROUPP, num);
11850                     goto insert_if_check_paren;
11851                 }
11852                 else if (memBEGINs(RExC_parse,
11853                                    (STRLEN) (RExC_end - RExC_parse),
11854                                    "DEFINE"))
11855                 {
11856                     ret = reganode(pRExC_state, DEFINEP, 0);
11857                     RExC_parse += DEFINE_len;
11858                     is_define = 1;
11859                     goto insert_if_check_paren;
11860                 }
11861                 else if (RExC_parse[0] == 'R') {
11862                     RExC_parse++;
11863                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
11864                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11865                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11866                      */
11867                     parno = 0;
11868                     if (RExC_parse[0] == '0') {
11869                         parno = 1;
11870                         RExC_parse++;
11871                     }
11872                     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11873                         UV uv;
11874                         endptr = RExC_end;
11875                         if (grok_atoUV(RExC_parse, &uv, &endptr)
11876                             && uv <= I32_MAX
11877                         ) {
11878                             parno = (I32)uv + 1;
11879                             RExC_parse = (char*)endptr;
11880                         }
11881                         /* else "Switch condition not recognized" below */
11882                     } else if (RExC_parse[0] == '&') {
11883                         SV *sv_dat;
11884                         RExC_parse++;
11885                         sv_dat = reg_scan_name(pRExC_state,
11886                                                REG_RSN_RETURN_DATA);
11887                         if (sv_dat)
11888                             parno = 1 + *((I32 *)SvPVX(sv_dat));
11889                     }
11890                     ret = reganode(pRExC_state, INSUBP, parno);
11891                     goto insert_if_check_paren;
11892                 }
11893                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11894                     /* (?(1)...) */
11895                     char c;
11896                     UV uv;
11897                     endptr = RExC_end;
11898                     if (grok_atoUV(RExC_parse, &uv, &endptr)
11899                         && uv <= I32_MAX
11900                     ) {
11901                         parno = (I32)uv;
11902                         RExC_parse = (char*)endptr;
11903                     }
11904                     else {
11905                         vFAIL("panic: grok_atoUV returned FALSE");
11906                     }
11907                     ret = reganode(pRExC_state, GROUPP, parno);
11908
11909                  insert_if_check_paren:
11910                     if (UCHARAT(RExC_parse) != ')') {
11911                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11912                         vFAIL("Switch condition not recognized");
11913                     }
11914                     nextchar(pRExC_state);
11915                   insert_if:
11916                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
11917                     br = regbranch(pRExC_state, &flags, 1, depth+1);
11918                     if (br == 0) {
11919                         RETURN_FAIL_ON_RESTART(flags,flagp);
11920                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11921                               (UV) flags);
11922                     } else
11923                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
11924                                                           LONGJMP, 0));
11925                     c = UCHARAT(RExC_parse);
11926                     nextchar(pRExC_state);
11927                     if (flags&HASWIDTH)
11928                         *flagp |= HASWIDTH;
11929                     if (c == '|') {
11930                         if (is_define)
11931                             vFAIL("(?(DEFINE)....) does not allow branches");
11932
11933                         /* Fake one for optimizer.  */
11934                         lastbr = reganode(pRExC_state, IFTHEN, 0);
11935
11936                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
11937                             RETURN_FAIL_ON_RESTART(flags, flagp);
11938                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11939                                   (UV) flags);
11940                         }
11941                         REGTAIL(pRExC_state, ret, lastbr);
11942                         if (flags&HASWIDTH)
11943                             *flagp |= HASWIDTH;
11944                         c = UCHARAT(RExC_parse);
11945                         nextchar(pRExC_state);
11946                     }
11947                     else
11948                         lastbr = 0;
11949                     if (c != ')') {
11950                         if (RExC_parse >= RExC_end)
11951                             vFAIL("Switch (?(condition)... not terminated");
11952                         else
11953                             vFAIL("Switch (?(condition)... contains too many branches");
11954                     }
11955                     ender = reg_node(pRExC_state, TAIL);
11956                     REGTAIL(pRExC_state, br, ender);
11957                     if (lastbr) {
11958                         REGTAIL(pRExC_state, lastbr, ender);
11959                         REGTAIL(pRExC_state, REGNODE_OFFSET(
11960                                                 NEXTOPER(
11961                                                 NEXTOPER(REGNODE_p(lastbr)))),
11962                                              ender);
11963                     }
11964                     else
11965                         REGTAIL(pRExC_state, ret, ender);
11966 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
11967                     RExC_size++; /* XXX WHY do we need this?!!
11968                                     For large programs it seems to be required
11969                                     but I can't figure out why. -- dmq*/
11970 #endif
11971                     return ret;
11972                 }
11973                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11974                 vFAIL("Unknown switch condition (?(...))");
11975             }
11976             case '[':           /* (?[ ... ]) */
11977                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
11978                                          oregcomp_parse);
11979             case 0: /* A NUL */
11980                 RExC_parse--; /* for vFAIL to print correctly */
11981                 vFAIL("Sequence (? incomplete");
11982                 break;
11983             default: /* e.g., (?i) */
11984                 RExC_parse = (char *) seqstart + 1;
11985               parse_flags:
11986                 parse_lparen_question_flags(pRExC_state);
11987                 if (UCHARAT(RExC_parse) != ':') {
11988                     if (RExC_parse < RExC_end)
11989                         nextchar(pRExC_state);
11990                     *flagp = TRYAGAIN;
11991                     return 0;
11992                 }
11993                 paren = ':';
11994                 nextchar(pRExC_state);
11995                 ret = 0;
11996                 goto parse_rest;
11997             } /* end switch */
11998         }
11999         else {
12000             if (*RExC_parse == '{') {
12001                 ckWARNregdep(RExC_parse + 1,
12002                             "Unescaped left brace in regex is "
12003                             "deprecated here (and will be fatal "
12004                             "in Perl 5.32), passed through");
12005             }
12006             /* Not bothering to indent here, as the above 'else' is temporary
12007              * */
12008         if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
12009           capturing_parens:
12010             parno = RExC_npar;
12011             RExC_npar++;
12012             if (RExC_total_parens <= 0) {
12013                 /* If we are in our first pass through (and maybe only pass),
12014                  * we  need to allocate memory for the capturing parentheses
12015                  * data structures.  Since we start at npar=1, when it reaches
12016                  * 2, for the first time it has something to put in it.  Above
12017                  * 2 means we extend what we already have */
12018                 if (RExC_npar == 2) {
12019                     /* setup RExC_open_parens, which holds the address of each
12020                      * OPEN tag, and to make things simpler for the 0 index the
12021                      * start of the program - this is used later for offsets */
12022                     Newxz(RExC_open_parens, RExC_npar, regnode_offset);
12023                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
12024
12025                     /* setup RExC_close_parens, which holds the address of each
12026                      * CLOSE tag, and to make things simpler for the 0 index
12027                      * the end of the program - this is used later for offsets
12028                      * */
12029                     Newxz(RExC_close_parens, RExC_npar, regnode_offset);
12030                     /* we dont know where end op starts yet, so we dont need to
12031                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
12032                      * above */
12033                 }
12034                 else {
12035                     Renew(RExC_open_parens, RExC_npar, regnode_offset);
12036                     Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset);
12037
12038                     Renew(RExC_close_parens, RExC_npar, regnode_offset);
12039                     Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset);
12040                 }
12041             }
12042
12043             ret = reganode(pRExC_state, OPEN, parno);
12044             if (!RExC_nestroot)
12045                 RExC_nestroot = parno;
12046             if (RExC_open_parens && !RExC_open_parens[parno])
12047             {
12048                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12049                     "%*s%*s Setting open paren #%" IVdf " to %d\n",
12050                     22, "|    |", (int)(depth * 2 + 1), "",
12051                     (IV)parno, ret));
12052                 RExC_open_parens[parno]= ret;
12053             }
12054
12055             Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12056             Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12057             is_open = 1;
12058         } else {
12059             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12060             paren = ':';
12061             ret = 0;
12062         }
12063         }
12064     }
12065     else                        /* ! paren */
12066         ret = 0;
12067
12068    parse_rest:
12069     /* Pick up the branches, linking them together. */
12070     parse_start = RExC_parse;   /* MJD */
12071     br = regbranch(pRExC_state, &flags, 1, depth+1);
12072
12073     /*     branch_len = (paren != 0); */
12074
12075     if (br == 0) {
12076         RETURN_FAIL_ON_RESTART(flags, flagp);
12077         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12078     }
12079     if (*RExC_parse == '|') {
12080         if (RExC_use_BRANCHJ) {
12081             reginsert(pRExC_state, BRANCHJ, br, depth+1);
12082         }
12083         else {                  /* MJD */
12084             reginsert(pRExC_state, BRANCH, br, depth+1);
12085             Set_Node_Length(REGNODE_p(br), paren != 0);
12086             Set_Node_Offset_To_R(br, parse_start-RExC_start);
12087         }
12088         have_branch = 1;
12089     }
12090     else if (paren == ':') {
12091         *flagp |= flags&SIMPLE;
12092     }
12093     if (is_open) {                              /* Starts with OPEN. */
12094         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
12095     }
12096     else if (paren != '?')              /* Not Conditional */
12097         ret = br;
12098     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12099     lastbr = br;
12100     while (*RExC_parse == '|') {
12101         if (RExC_use_BRANCHJ) {
12102             ender = reganode(pRExC_state, LONGJMP, 0);
12103
12104             /* Append to the previous. */
12105             REGTAIL(pRExC_state,
12106                     REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12107                     ender);
12108         }
12109         nextchar(pRExC_state);
12110         if (freeze_paren) {
12111             if (RExC_npar > after_freeze)
12112                 after_freeze = RExC_npar;
12113             RExC_npar = freeze_paren;
12114         }
12115         br = regbranch(pRExC_state, &flags, 0, depth+1);
12116
12117         if (br == 0) {
12118             RETURN_FAIL_ON_RESTART(flags, flagp);
12119             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12120         }
12121         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
12122         lastbr = br;
12123         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12124     }
12125
12126     if (have_branch || paren != ':') {
12127         regnode * br;
12128
12129         /* Make a closing node, and hook it on the end. */
12130         switch (paren) {
12131         case ':':
12132             ender = reg_node(pRExC_state, TAIL);
12133             break;
12134         case 1: case 2:
12135             ender = reganode(pRExC_state, CLOSE, parno);
12136             if ( RExC_close_parens ) {
12137                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12138                         "%*s%*s Setting close paren #%" IVdf " to %d\n",
12139                         22, "|    |", (int)(depth * 2 + 1), "",
12140                         (IV)parno, ender));
12141                 RExC_close_parens[parno]= ender;
12142                 if (RExC_nestroot == parno)
12143                     RExC_nestroot = 0;
12144             }
12145             Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12146             Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12147             break;
12148         case 's':
12149             ender = reg_node(pRExC_state, SRCLOSE);
12150             RExC_in_script_run = 0;
12151             break;
12152         case '<':
12153         case 'a':
12154         case 'A':
12155         case 'b':
12156         case 'B':
12157         case ',':
12158         case '=':
12159         case '!':
12160             *flagp &= ~HASWIDTH;
12161             /* FALLTHROUGH */
12162         case 't':   /* aTomic */
12163         case '>':
12164             ender = reg_node(pRExC_state, SUCCEED);
12165             break;
12166         case 0:
12167             ender = reg_node(pRExC_state, END);
12168             assert(!RExC_end_op); /* there can only be one! */
12169             RExC_end_op = REGNODE_p(ender);
12170             if (RExC_close_parens) {
12171                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12172                     "%*s%*s Setting close paren #0 (END) to %d\n",
12173                     22, "|    |", (int)(depth * 2 + 1), "",
12174                     ender));
12175
12176                 RExC_close_parens[0]= ender;
12177             }
12178             break;
12179         }
12180         DEBUG_PARSE_r(
12181             DEBUG_PARSE_MSG("lsbr");
12182             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12183             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12184             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12185                           SvPV_nolen_const(RExC_mysv1),
12186                           (IV)lastbr,
12187                           SvPV_nolen_const(RExC_mysv2),
12188                           (IV)ender,
12189                           (IV)(ender - lastbr)
12190             );
12191         );
12192         REGTAIL(pRExC_state, lastbr, ender);
12193
12194         if (have_branch) {
12195             char is_nothing= 1;
12196             if (depth==1)
12197                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12198
12199             /* Hook the tails of the branches to the closing node. */
12200             for (br = REGNODE_p(ret); br; br = regnext(br)) {
12201                 const U8 op = PL_regkind[OP(br)];
12202                 if (op == BRANCH) {
12203                     REGTAIL_STUDY(pRExC_state,
12204                                   REGNODE_OFFSET(NEXTOPER(br)),
12205                                   ender);
12206                     if ( OP(NEXTOPER(br)) != NOTHING
12207                          || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12208                         is_nothing= 0;
12209                 }
12210                 else if (op == BRANCHJ) {
12211                     REGTAIL_STUDY(pRExC_state,
12212                                   REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12213                                   ender);
12214                     /* for now we always disable this optimisation * /
12215                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12216                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12217                     */
12218                         is_nothing= 0;
12219                 }
12220             }
12221             if (is_nothing) {
12222                 regnode * ret_as_regnode = REGNODE_p(ret);
12223                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12224                                ? regnext(ret_as_regnode)
12225                                : ret_as_regnode;
12226                 DEBUG_PARSE_r(
12227                     DEBUG_PARSE_MSG("NADA");
12228                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12229                                      NULL, pRExC_state);
12230                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12231                                      NULL, pRExC_state);
12232                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12233                                   SvPV_nolen_const(RExC_mysv1),
12234                                   (IV)REG_NODE_NUM(ret_as_regnode),
12235                                   SvPV_nolen_const(RExC_mysv2),
12236                                   (IV)ender,
12237                                   (IV)(ender - ret)
12238                     );
12239                 );
12240                 OP(br)= NOTHING;
12241                 if (OP(REGNODE_p(ender)) == TAIL) {
12242                     NEXT_OFF(br)= 0;
12243                     RExC_emit= REGNODE_OFFSET(br) + 1;
12244                 } else {
12245                     regnode *opt;
12246                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12247                         OP(opt)= OPTIMIZED;
12248                     NEXT_OFF(br)= REGNODE_p(ender) - br;
12249                 }
12250             }
12251         }
12252     }
12253
12254     {
12255         const char *p;
12256          /* Even/odd or x=don't care: 010101x10x */
12257         static const char parens[] = "=!aA<,>Bbt";
12258          /* flag below is set to 0 up through 'A'; 1 for larger */
12259
12260         if (paren && (p = strchr(parens, paren))) {
12261             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12262             int flag = (p - parens) > 3;
12263
12264             if (paren == '>' || paren == 't') {
12265                 node = SUSPEND, flag = 0;
12266             }
12267
12268             reginsert(pRExC_state, node, ret, depth+1);
12269             Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12270             Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12271             FLAGS(REGNODE_p(ret)) = flag;
12272             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
12273         }
12274     }
12275
12276     /* Check for proper termination. */
12277     if (paren) {
12278         /* restore original flags, but keep (?p) and, if we've encountered
12279          * something in the parse that changes /d rules into /u, keep the /u */
12280         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12281         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12282             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12283         }
12284         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12285             RExC_parse = oregcomp_parse;
12286             vFAIL("Unmatched (");
12287         }
12288         nextchar(pRExC_state);
12289     }
12290     else if (!paren && RExC_parse < RExC_end) {
12291         if (*RExC_parse == ')') {
12292             RExC_parse++;
12293             vFAIL("Unmatched )");
12294         }
12295         else
12296             FAIL("Junk on end of regexp");      /* "Can't happen". */
12297         NOT_REACHED; /* NOTREACHED */
12298     }
12299
12300     if (RExC_in_lookbehind) {
12301         RExC_in_lookbehind--;
12302     }
12303     if (after_freeze > RExC_npar)
12304         RExC_npar = after_freeze;
12305     return(ret);
12306 }
12307
12308 /*
12309  - regbranch - one alternative of an | operator
12310  *
12311  * Implements the concatenation operator.
12312  *
12313  * On success, returns the offset at which any next node should be placed into
12314  * the regex engine program being compiled.
12315  *
12316  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12317  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12318  * UTF-8
12319  */
12320 STATIC regnode_offset
12321 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12322 {
12323     regnode_offset ret;
12324     regnode_offset chain = 0;
12325     regnode_offset latest;
12326     I32 flags = 0, c = 0;
12327     GET_RE_DEBUG_FLAGS_DECL;
12328
12329     PERL_ARGS_ASSERT_REGBRANCH;
12330
12331     DEBUG_PARSE("brnc");
12332
12333     if (first)
12334         ret = 0;
12335     else {
12336         if (RExC_use_BRANCHJ)
12337             ret = reganode(pRExC_state, BRANCHJ, 0);
12338         else {
12339             ret = reg_node(pRExC_state, BRANCH);
12340             Set_Node_Length(REGNODE_p(ret), 1);
12341         }
12342     }
12343
12344     *flagp = WORST;                     /* Tentatively. */
12345
12346     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12347                             FALSE /* Don't force to /x */ );
12348     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12349         flags &= ~TRYAGAIN;
12350         latest = regpiece(pRExC_state, &flags, depth+1);
12351         if (latest == 0) {
12352             if (flags & TRYAGAIN)
12353                 continue;
12354             RETURN_FAIL_ON_RESTART(flags, flagp);
12355             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12356         }
12357         else if (ret == 0)
12358             ret = latest;
12359         *flagp |= flags&(HASWIDTH|POSTPONED);
12360         if (chain == 0)         /* First piece. */
12361             *flagp |= flags&SPSTART;
12362         else {
12363             /* FIXME adding one for every branch after the first is probably
12364              * excessive now we have TRIE support. (hv) */
12365             MARK_NAUGHTY(1);
12366             if (     chain > (SSize_t) BRANCH_MAX_OFFSET
12367                 && ! RExC_use_BRANCHJ)
12368             {
12369                 /* XXX We could just redo this branch, but figuring out what
12370                  * bookkeeping needs to be reset is a pain */
12371                 REQUIRE_BRANCHJ(flagp, 0);
12372             }
12373             REGTAIL(pRExC_state, chain, latest);
12374         }
12375         chain = latest;
12376         c++;
12377     }
12378     if (chain == 0) {   /* Loop ran zero times. */
12379         chain = reg_node(pRExC_state, NOTHING);
12380         if (ret == 0)
12381             ret = chain;
12382     }
12383     if (c == 1) {
12384         *flagp |= flags&SIMPLE;
12385     }
12386
12387     return ret;
12388 }
12389
12390 /*
12391  - regpiece - something followed by possible quantifier * + ? {n,m}
12392  *
12393  * Note that the branching code sequences used for ? and the general cases
12394  * of * and + are somewhat optimized:  they use the same NOTHING node as
12395  * both the endmarker for their branch list and the body of the last branch.
12396  * It might seem that this node could be dispensed with entirely, but the
12397  * endmarker role is not redundant.
12398  *
12399  * On success, returns the offset at which any next node should be placed into
12400  * the regex engine program being compiled.
12401  *
12402  * Returns 0 otherwise, with *flagp set to indicate why:
12403  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
12404  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
12405  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12406  */
12407 STATIC regnode_offset
12408 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12409 {
12410     regnode_offset ret;
12411     char op;
12412     char *next;
12413     I32 flags;
12414     const char * const origparse = RExC_parse;
12415     I32 min;
12416     I32 max = REG_INFTY;
12417 #ifdef RE_TRACK_PATTERN_OFFSETS
12418     char *parse_start;
12419 #endif
12420     const char *maxpos = NULL;
12421     UV uv;
12422
12423     /* Save the original in case we change the emitted regop to a FAIL. */
12424     const regnode_offset orig_emit = RExC_emit;
12425
12426     GET_RE_DEBUG_FLAGS_DECL;
12427
12428     PERL_ARGS_ASSERT_REGPIECE;
12429
12430     DEBUG_PARSE("piec");
12431
12432     ret = regatom(pRExC_state, &flags, depth+1);
12433     if (ret == 0) {
12434         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12435         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12436     }
12437
12438     op = *RExC_parse;
12439
12440     if (op == '{' && regcurly(RExC_parse)) {
12441         maxpos = NULL;
12442 #ifdef RE_TRACK_PATTERN_OFFSETS
12443         parse_start = RExC_parse; /* MJD */
12444 #endif
12445         next = RExC_parse + 1;
12446         while (isDIGIT(*next) || *next == ',') {
12447             if (*next == ',') {
12448                 if (maxpos)
12449                     break;
12450                 else
12451                     maxpos = next;
12452             }
12453             next++;
12454         }
12455         if (*next == '}') {             /* got one */
12456             const char* endptr;
12457             if (!maxpos)
12458                 maxpos = next;
12459             RExC_parse++;
12460             if (isDIGIT(*RExC_parse)) {
12461                 endptr = RExC_end;
12462                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12463                     vFAIL("Invalid quantifier in {,}");
12464                 if (uv >= REG_INFTY)
12465                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12466                 min = (I32)uv;
12467             } else {
12468                 min = 0;
12469             }
12470             if (*maxpos == ',')
12471                 maxpos++;
12472             else
12473                 maxpos = RExC_parse;
12474             if (isDIGIT(*maxpos)) {
12475                 endptr = RExC_end;
12476                 if (!grok_atoUV(maxpos, &uv, &endptr))
12477                     vFAIL("Invalid quantifier in {,}");
12478                 if (uv >= REG_INFTY)
12479                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12480                 max = (I32)uv;
12481             } else {
12482                 max = REG_INFTY;                /* meaning "infinity" */
12483             }
12484             RExC_parse = next;
12485             nextchar(pRExC_state);
12486             if (max < min) {    /* If can't match, warn and optimize to fail
12487                                    unconditionally */
12488                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12489                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12490                 NEXT_OFF(REGNODE_p(orig_emit)) =
12491                                     regarglen[OPFAIL] + NODE_STEP_REGNODE;
12492                 return ret;
12493             }
12494             else if (min == max && *RExC_parse == '?')
12495             {
12496                 ckWARN2reg(RExC_parse + 1,
12497                            "Useless use of greediness modifier '%c'",
12498                            *RExC_parse);
12499             }
12500
12501           do_curly:
12502             if ((flags&SIMPLE)) {
12503                 if (min == 0 && max == REG_INFTY) {
12504                     reginsert(pRExC_state, STAR, ret, depth+1);
12505                     MARK_NAUGHTY(4);
12506                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12507                     goto nest_check;
12508                 }
12509                 if (min == 1 && max == REG_INFTY) {
12510                     reginsert(pRExC_state, PLUS, ret, depth+1);
12511                     MARK_NAUGHTY(3);
12512                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12513                     goto nest_check;
12514                 }
12515                 MARK_NAUGHTY_EXP(2, 2);
12516                 reginsert(pRExC_state, CURLY, ret, depth+1);
12517                 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12518                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12519             }
12520             else {
12521                 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12522
12523                 FLAGS(REGNODE_p(w)) = 0;
12524                 REGTAIL(pRExC_state, ret, w);
12525                 if (RExC_use_BRANCHJ) {
12526                     reginsert(pRExC_state, LONGJMP, ret, depth+1);
12527                     reginsert(pRExC_state, NOTHING, ret, depth+1);
12528                     NEXT_OFF(REGNODE_p(ret)) = 3;       /* Go over LONGJMP. */
12529                 }
12530                 reginsert(pRExC_state, CURLYX, ret, depth+1);
12531                                 /* MJD hk */
12532                 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12533                 Set_Node_Length(REGNODE_p(ret),
12534                                 op == '{' ? (RExC_parse - parse_start) : 1);
12535
12536                 if (RExC_use_BRANCHJ)
12537                     NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
12538                                                        LONGJMP. */
12539                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
12540                 RExC_whilem_seen++;
12541                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
12542             }
12543             FLAGS(REGNODE_p(ret)) = 0;
12544
12545             if (min > 0)
12546                 *flagp = WORST;
12547             if (max > 0)
12548                 *flagp |= HASWIDTH;
12549             ARG1_SET(REGNODE_p(ret), (U16)min);
12550             ARG2_SET(REGNODE_p(ret), (U16)max);
12551             if (max == REG_INFTY)
12552                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12553
12554             goto nest_check;
12555         }
12556     }
12557
12558     if (!ISMULT1(op)) {
12559         *flagp = flags;
12560         return(ret);
12561     }
12562
12563 #if 0                           /* Now runtime fix should be reliable. */
12564
12565     /* if this is reinstated, don't forget to put this back into perldiag:
12566
12567             =item Regexp *+ operand could be empty at {#} in regex m/%s/
12568
12569            (F) The part of the regexp subject to either the * or + quantifier
12570            could match an empty string. The {#} shows in the regular
12571            expression about where the problem was discovered.
12572
12573     */
12574
12575     if (!(flags&HASWIDTH) && op != '?')
12576       vFAIL("Regexp *+ operand could be empty");
12577 #endif
12578
12579 #ifdef RE_TRACK_PATTERN_OFFSETS
12580     parse_start = RExC_parse;
12581 #endif
12582     nextchar(pRExC_state);
12583
12584     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12585
12586     if (op == '*') {
12587         min = 0;
12588         goto do_curly;
12589     }
12590     else if (op == '+') {
12591         min = 1;
12592         goto do_curly;
12593     }
12594     else if (op == '?') {
12595         min = 0; max = 1;
12596         goto do_curly;
12597     }
12598   nest_check:
12599     if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12600         ckWARN2reg(RExC_parse,
12601                    "%" UTF8f " matches null string many times",
12602                    UTF8fARG(UTF, (RExC_parse >= origparse
12603                                  ? RExC_parse - origparse
12604                                  : 0),
12605                    origparse));
12606     }
12607
12608     if (*RExC_parse == '?') {
12609         nextchar(pRExC_state);
12610         reginsert(pRExC_state, MINMOD, ret, depth+1);
12611         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
12612     }
12613     else if (*RExC_parse == '+') {
12614         regnode_offset ender;
12615         nextchar(pRExC_state);
12616         ender = reg_node(pRExC_state, SUCCEED);
12617         REGTAIL(pRExC_state, ret, ender);
12618         reginsert(pRExC_state, SUSPEND, ret, depth+1);
12619         ender = reg_node(pRExC_state, TAIL);
12620         REGTAIL(pRExC_state, ret, ender);
12621     }
12622
12623     if (ISMULT2(RExC_parse)) {
12624         RExC_parse++;
12625         vFAIL("Nested quantifiers");
12626     }
12627
12628     return(ret);
12629 }
12630
12631 STATIC bool
12632 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12633                 regnode_offset * node_p,
12634                 UV * code_point_p,
12635                 int * cp_count,
12636                 I32 * flagp,
12637                 const bool strict,
12638                 const U32 depth
12639     )
12640 {
12641  /* This routine teases apart the various meanings of \N and returns
12642   * accordingly.  The input parameters constrain which meaning(s) is/are valid
12643   * in the current context.
12644   *
12645   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12646   *
12647   * If <code_point_p> is not NULL, the context is expecting the result to be a
12648   * single code point.  If this \N instance turns out to a single code point,
12649   * the function returns TRUE and sets *code_point_p to that code point.
12650   *
12651   * If <node_p> is not NULL, the context is expecting the result to be one of
12652   * the things representable by a regnode.  If this \N instance turns out to be
12653   * one such, the function generates the regnode, returns TRUE and sets *node_p
12654   * to point to the offset of that regnode into the regex engine program being
12655   * compiled.
12656   *
12657   * If this instance of \N isn't legal in any context, this function will
12658   * generate a fatal error and not return.
12659   *
12660   * On input, RExC_parse should point to the first char following the \N at the
12661   * time of the call.  On successful return, RExC_parse will have been updated
12662   * to point to just after the sequence identified by this routine.  Also
12663   * *flagp has been updated as needed.
12664   *
12665   * When there is some problem with the current context and this \N instance,
12666   * the function returns FALSE, without advancing RExC_parse, nor setting
12667   * *node_p, nor *code_point_p, nor *flagp.
12668   *
12669   * If <cp_count> is not NULL, the caller wants to know the length (in code
12670   * points) that this \N sequence matches.  This is set, and the input is
12671   * parsed for errors, even if the function returns FALSE, as detailed below.
12672   *
12673   * There are 5 possibilities here, as detailed in the next 5 paragraphs.
12674   *
12675   * Probably the most common case is for the \N to specify a single code point.
12676   * *cp_count will be set to 1, and *code_point_p will be set to that code
12677   * point.
12678   *
12679   * Another possibility is for the input to be an empty \N{}, which for
12680   * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
12681   * will be set to a generated NOTHING node.
12682   *
12683   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12684   * set to 0. *node_p will be set to a generated REG_ANY node.
12685   *
12686   * The fourth possibility is that \N resolves to a sequence of more than one
12687   * code points.  *cp_count will be set to the number of code points in the
12688   * sequence. *node_p will be set to a generated node returned by this
12689   * function calling S_reg().
12690   *
12691   * The final possibility is that it is premature to be calling this function;
12692   * the parse needs to be restarted.  This can happen when this changes from
12693   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
12694   * latter occurs only when the fourth possibility would otherwise be in
12695   * effect, and is because one of those code points requires the pattern to be
12696   * recompiled as UTF-8.  The function returns FALSE, and sets the
12697   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
12698   * happens, the caller needs to desist from continuing parsing, and return
12699   * this information to its caller.  This is not set for when there is only one
12700   * code point, as this can be called as part of an ANYOF node, and they can
12701   * store above-Latin1 code points without the pattern having to be in UTF-8.
12702   *
12703   * For non-single-quoted regexes, the tokenizer has resolved character and
12704   * sequence names inside \N{...} into their Unicode values, normalizing the
12705   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12706   * hex-represented code points in the sequence.  This is done there because
12707   * the names can vary based on what charnames pragma is in scope at the time,
12708   * so we need a way to take a snapshot of what they resolve to at the time of
12709   * the original parse. [perl #56444].
12710   *
12711   * That parsing is skipped for single-quoted regexes, so we may here get
12712   * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
12713   * parser.  But if the single-quoted regex is something like '\N{U+41}', that
12714   * is legal and handled here.  The code point is Unicode, and has to be
12715   * translated into the native character set for non-ASCII platforms.
12716   */
12717
12718     char * endbrace;    /* points to '}' following the name */
12719     char* p = RExC_parse; /* Temporary */
12720
12721     SV * substitute_parse = NULL;
12722     char *orig_end;
12723     char *save_start;
12724     I32 flags;
12725     Size_t count = 0;   /* code point count kept internally by this function */
12726
12727     GET_RE_DEBUG_FLAGS_DECL;
12728
12729     PERL_ARGS_ASSERT_GROK_BSLASH_N;
12730
12731     GET_RE_DEBUG_FLAGS;
12732
12733     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
12734     assert(! (node_p && cp_count));               /* At most 1 should be set */
12735
12736     if (cp_count) {     /* Initialize return for the most common case */
12737         *cp_count = 1;
12738     }
12739
12740     /* The [^\n] meaning of \N ignores spaces and comments under the /x
12741      * modifier.  The other meanings do not, so use a temporary until we find
12742      * out which we are being called with */
12743     skip_to_be_ignored_text(pRExC_state, &p,
12744                             FALSE /* Don't force to /x */ );
12745
12746     /* Disambiguate between \N meaning a named character versus \N meaning
12747      * [^\n].  The latter is assumed when the {...} following the \N is a legal
12748      * quantifier, or there is no '{' at all */
12749     if (*p != '{' || regcurly(p)) {
12750         RExC_parse = p;
12751         if (cp_count) {
12752             *cp_count = -1;
12753         }
12754
12755         if (! node_p) {
12756             return FALSE;
12757         }
12758
12759         *node_p = reg_node(pRExC_state, REG_ANY);
12760         *flagp |= HASWIDTH|SIMPLE;
12761         MARK_NAUGHTY(1);
12762         Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12763         return TRUE;
12764     }
12765
12766     /* The test above made sure that the next real character is a '{', but
12767      * under the /x modifier, it could be separated by space (or a comment and
12768      * \n) and this is not allowed (for consistency with \x{...} and the
12769      * tokenizer handling of \N{NAME}). */
12770     if (*RExC_parse != '{') {
12771         vFAIL("Missing braces on \\N{}");
12772     }
12773
12774     RExC_parse++;       /* Skip past the '{' */
12775
12776     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12777     if (! endbrace) { /* no trailing brace */
12778         vFAIL2("Missing right brace on \\%c{}", 'N');
12779     }
12780
12781     /* Here, we have decided it should be a named character or sequence */
12782     REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
12783                                         semantics */
12784
12785     if (endbrace == RExC_parse) {   /* empty: \N{} */
12786         if (strict) {
12787             RExC_parse++;   /* Position after the "}" */
12788             vFAIL("Zero length \\N{}");
12789         }
12790         if (cp_count) {
12791             *cp_count = 0;
12792         }
12793         nextchar(pRExC_state);
12794         if (! node_p) {
12795             return FALSE;
12796         }
12797
12798         *node_p = reg_node(pRExC_state, NOTHING);
12799         return TRUE;
12800     }
12801
12802     /* If we haven't got something that begins with 'U+', then it didn't get lexed. */
12803     if (   endbrace - RExC_parse < 2
12804         || strnNE(RExC_parse, "U+", 2))
12805     {
12806         RExC_parse = endbrace;  /* position msg's '<--HERE' */
12807         vFAIL("\\N{NAME} must be resolved by the lexer");
12808     }
12809
12810         /* This code purposely indented below because of future changes coming */
12811
12812         /* We can get to here when the input is \N{U+...} or when toke.c has
12813          * converted a name to the \N{U+...} form.  This include changing a
12814          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
12815
12816         RExC_parse += 2;    /* Skip past the 'U+' */
12817
12818         /* Code points are separated by dots.  The '}' terminates the whole
12819          * thing. */
12820
12821         do {    /* Loop until the ending brace */
12822             UV cp = 0;
12823             char * start_digit;     /* The first of the current code point */
12824             if (! isXDIGIT(*RExC_parse)) {
12825                 RExC_parse++;
12826                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12827             }
12828
12829             start_digit = RExC_parse;
12830             count++;
12831
12832             /* Loop through the hex digits of the current code point */
12833             do {
12834                 /* Adding this digit will shift the result 4 bits.  If that
12835                  * result would be above the legal max, it's overflow */
12836                 if (cp > MAX_LEGAL_CP >> 4) {
12837
12838                     /* Find the end of the code point */
12839                     do {
12840                         RExC_parse ++;
12841                     } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
12842
12843                     /* Be sure to synchronize this message with the similar one
12844                      * in utf8.c */
12845                     vFAIL4("Use of code point 0x%.*s is not allowed; the"
12846                         " permissible max is 0x%" UVxf,
12847                         (int) (RExC_parse - start_digit), start_digit,
12848                         MAX_LEGAL_CP);
12849                 }
12850
12851                 /* Accumulate this (valid) digit into the running total */
12852                 cp  = (cp << 4) + READ_XDIGIT(RExC_parse);
12853
12854                 /* READ_XDIGIT advanced the input pointer.  Ignore a single
12855                  * underscore separator */
12856                 if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
12857                     RExC_parse++;
12858                 }
12859             } while (isXDIGIT(*RExC_parse));
12860
12861             /* Here, have accumulated the next code point */
12862             if (RExC_parse >= endbrace) {   /* If done ... */
12863                 if (count != 1) {
12864                     goto do_concat;
12865                 }
12866
12867                 /* Here, is a single code point; fail if doesn't want that */
12868                 if (! code_point_p) {
12869                     RExC_parse = p;
12870                     return FALSE;
12871                 }
12872
12873                 /* A single code point is easy to handle; just return it */
12874                 *code_point_p = UNI_TO_NATIVE(cp);
12875                 RExC_parse = endbrace;
12876                 nextchar(pRExC_state);
12877                 return TRUE;
12878             }
12879
12880             /* Here, the only legal thing would be a multiple character
12881              * sequence (of the form "\N{U+c1.c2. ... }".   So the next
12882              * character must be a dot (and the one after that can't be the
12883              * endbrace, or we'd have something like \N{U+100.} ) */
12884             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
12885                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
12886                                 ? UTF8SKIP(RExC_parse)
12887                                 : 1;
12888                 if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
12889                     RExC_parse = endbrace;
12890                 }
12891                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12892             }
12893
12894             /* Here, looks like its really a multiple character sequence.  Fail
12895              * if that's not what the caller wants.  But continue with counting
12896              * and error checking if they still want a count */
12897             if (! node_p && ! cp_count) {
12898                 return FALSE;
12899             }
12900
12901             /* What is done here is to convert this to a sub-pattern of the
12902              * form \x{char1}\x{char2}...  and then call reg recursively to
12903              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
12904              * atomicness, while not having to worry about special handling
12905              * that some code points may have.  We don't create a subpattern,
12906              * but go through the motions of code point counting and error
12907              * checking, if the caller doesn't want a node returned. */
12908
12909             if (node_p && count == 1) {
12910                 substitute_parse = newSVpvs("?:");
12911             }
12912
12913           do_concat:
12914
12915             if (node_p) {
12916                 /* Convert to notation the rest of the code understands */
12917                 sv_catpvs(substitute_parse, "\\x{");
12918                 sv_catpvn(substitute_parse, start_digit,
12919                                             RExC_parse - start_digit);
12920                 sv_catpvs(substitute_parse, "}");
12921             }
12922
12923             /* Move to after the dot (or ending brace the final time through.)
12924              * */
12925             RExC_parse++;
12926             count++;
12927
12928         } while (RExC_parse < endbrace);
12929
12930         if (! node_p) { /* Doesn't want the node */
12931             assert (cp_count);
12932
12933             *cp_count = count;
12934             return FALSE;
12935         }
12936
12937         sv_catpvs(substitute_parse, ")");
12938
12939 #ifdef EBCDIC
12940         /* The values are Unicode, and therefore have to be converted to native
12941          * on a non-Unicode (meaning non-ASCII) platform. */
12942         RExC_recode_x_to_native = 1;
12943 #endif
12944
12945     /* Here, we have the string the name evaluates to, ready to be parsed,
12946      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
12947      * constructs.  This can be called from within a substitute parse already.
12948      * The error reporting mechanism doesn't work for 2 levels of this, but the
12949      * code above has validated this new construct, so there should be no
12950      * errors generated by the below.  And this isn' an exact copy, so the
12951      * mechanism to seamlessly deal with this won't work, so turn off warnings
12952      * during it */
12953     save_start = RExC_start;
12954     orig_end = RExC_end;
12955
12956     RExC_parse = RExC_start = SvPVX(substitute_parse);
12957     RExC_end = RExC_parse + SvCUR(substitute_parse);
12958     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
12959
12960     *node_p = reg(pRExC_state, 1, &flags, depth+1);
12961
12962     /* Restore the saved values */
12963     RESTORE_WARNINGS;
12964     RExC_start = save_start;
12965     RExC_parse = endbrace;
12966     RExC_end = orig_end;
12967 #ifdef EBCDIC
12968     RExC_recode_x_to_native = 0;
12969 #endif
12970
12971     SvREFCNT_dec_NN(substitute_parse);
12972
12973     if (! *node_p) {
12974         RETURN_FAIL_ON_RESTART(flags, flagp);
12975         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
12976             (UV) flags);
12977     }
12978     *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12979
12980     nextchar(pRExC_state);
12981
12982     return TRUE;
12983 }
12984
12985
12986 PERL_STATIC_INLINE U8
12987 S_compute_EXACTish(RExC_state_t *pRExC_state)
12988 {
12989     U8 op;
12990
12991     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
12992
12993     if (! FOLD) {
12994         return (LOC)
12995                 ? EXACTL
12996                 : EXACT;
12997     }
12998
12999     op = get_regex_charset(RExC_flags);
13000     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13001         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13002                  been, so there is no hole */
13003     }
13004
13005     return op + EXACTF;
13006 }
13007
13008 STATIC bool
13009 S_new_regcurly(const char *s, const char *e)
13010 {
13011     /* This is a temporary function designed to match the most lenient form of
13012      * a {m,n} quantifier we ever envision, with either number omitted, and
13013      * spaces anywhere between/before/after them.
13014      *
13015      * If this function fails, then the string it matches is very unlikely to
13016      * ever be considered a valid quantifier, so we can allow the '{' that
13017      * begins it to be considered as a literal */
13018
13019     bool has_min = FALSE;
13020     bool has_max = FALSE;
13021
13022     PERL_ARGS_ASSERT_NEW_REGCURLY;
13023
13024     if (s >= e || *s++ != '{')
13025         return FALSE;
13026
13027     while (s < e && isSPACE(*s)) {
13028         s++;
13029     }
13030     while (s < e && isDIGIT(*s)) {
13031         has_min = TRUE;
13032         s++;
13033     }
13034     while (s < e && isSPACE(*s)) {
13035         s++;
13036     }
13037
13038     if (*s == ',') {
13039         s++;
13040         while (s < e && isSPACE(*s)) {
13041             s++;
13042         }
13043         while (s < e && isDIGIT(*s)) {
13044             has_max = TRUE;
13045             s++;
13046         }
13047         while (s < e && isSPACE(*s)) {
13048             s++;
13049         }
13050     }
13051
13052     return s < e && *s == '}' && (has_min || has_max);
13053 }
13054
13055 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13056  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13057
13058 static I32
13059 S_backref_value(char *p, char *e)
13060 {
13061     const char* endptr = e;
13062     UV val;
13063     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13064         return (I32)val;
13065     return I32_MAX;
13066 }
13067
13068
13069 /*
13070  - regatom - the lowest level
13071
13072    Try to identify anything special at the start of the current parse position.
13073    If there is, then handle it as required. This may involve generating a
13074    single regop, such as for an assertion; or it may involve recursing, such as
13075    to handle a () structure.
13076
13077    If the string doesn't start with something special then we gobble up
13078    as much literal text as we can.  If we encounter a quantifier, we have to
13079    back off the final literal character, as that quantifier applies to just it
13080    and not to the whole string of literals.
13081
13082    Once we have been able to handle whatever type of thing started the
13083    sequence, we return the offset into the regex engine program being compiled
13084    at which any  next regnode should be placed.
13085
13086    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13087    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13088    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13089    Otherwise does not return 0.
13090
13091    Note: we have to be careful with escapes, as they can be both literal
13092    and special, and in the case of \10 and friends, context determines which.
13093
13094    A summary of the code structure is:
13095
13096    switch (first_byte) {
13097         cases for each special:
13098             handle this special;
13099             break;
13100         case '\\':
13101             switch (2nd byte) {
13102                 cases for each unambiguous special:
13103                     handle this special;
13104                     break;
13105                 cases for each ambigous special/literal:
13106                     disambiguate;
13107                     if (special)  handle here
13108                     else goto defchar;
13109                 default: // unambiguously literal:
13110                     goto defchar;
13111             }
13112         default:  // is a literal char
13113             // FALL THROUGH
13114         defchar:
13115             create EXACTish node for literal;
13116             while (more input and node isn't full) {
13117                 switch (input_byte) {
13118                    cases for each special;
13119                        make sure parse pointer is set so that the next call to
13120                            regatom will see this special first
13121                        goto loopdone; // EXACTish node terminated by prev. char
13122                    default:
13123                        append char to EXACTISH node;
13124                 }
13125                 get next input byte;
13126             }
13127         loopdone:
13128    }
13129    return the generated node;
13130
13131    Specifically there are two separate switches for handling
13132    escape sequences, with the one for handling literal escapes requiring
13133    a dummy entry for all of the special escapes that are actually handled
13134    by the other.
13135
13136 */
13137
13138 STATIC regnode_offset
13139 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13140 {
13141     regnode_offset ret = 0;
13142     I32 flags = 0;
13143     char *parse_start;
13144     U8 op;
13145     int invert = 0;
13146     U8 arg;
13147
13148     GET_RE_DEBUG_FLAGS_DECL;
13149
13150     *flagp = WORST;             /* Tentatively. */
13151
13152     DEBUG_PARSE("atom");
13153
13154     PERL_ARGS_ASSERT_REGATOM;
13155
13156   tryagain:
13157     parse_start = RExC_parse;
13158     assert(RExC_parse < RExC_end);
13159     switch ((U8)*RExC_parse) {
13160     case '^':
13161         RExC_seen_zerolen++;
13162         nextchar(pRExC_state);
13163         if (RExC_flags & RXf_PMf_MULTILINE)
13164             ret = reg_node(pRExC_state, MBOL);
13165         else
13166             ret = reg_node(pRExC_state, SBOL);
13167         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13168         break;
13169     case '$':
13170         nextchar(pRExC_state);
13171         if (*RExC_parse)
13172             RExC_seen_zerolen++;
13173         if (RExC_flags & RXf_PMf_MULTILINE)
13174             ret = reg_node(pRExC_state, MEOL);
13175         else
13176             ret = reg_node(pRExC_state, SEOL);
13177         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13178         break;
13179     case '.':
13180         nextchar(pRExC_state);
13181         if (RExC_flags & RXf_PMf_SINGLELINE)
13182             ret = reg_node(pRExC_state, SANY);
13183         else
13184             ret = reg_node(pRExC_state, REG_ANY);
13185         *flagp |= HASWIDTH|SIMPLE;
13186         MARK_NAUGHTY(1);
13187         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13188         break;
13189     case '[':
13190     {
13191         char * const oregcomp_parse = ++RExC_parse;
13192         ret = regclass(pRExC_state, flagp, depth+1,
13193                        FALSE, /* means parse the whole char class */
13194                        TRUE, /* allow multi-char folds */
13195                        FALSE, /* don't silence non-portable warnings. */
13196                        (bool) RExC_strict,
13197                        TRUE, /* Allow an optimized regnode result */
13198                        NULL);
13199         if (ret == 0) {
13200             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13201             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13202                   (UV) *flagp);
13203         }
13204         if (*RExC_parse != ']') {
13205             RExC_parse = oregcomp_parse;
13206             vFAIL("Unmatched [");
13207         }
13208         nextchar(pRExC_state);
13209         Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13210         break;
13211     }
13212     case '(':
13213         nextchar(pRExC_state);
13214         ret = reg(pRExC_state, 2, &flags, depth+1);
13215         if (ret == 0) {
13216                 if (flags & TRYAGAIN) {
13217                     if (RExC_parse >= RExC_end) {
13218                          /* Make parent create an empty node if needed. */
13219                         *flagp |= TRYAGAIN;
13220                         return(0);
13221                     }
13222                     goto tryagain;
13223                 }
13224                 RETURN_FAIL_ON_RESTART(flags, flagp);
13225                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13226                                                                  (UV) flags);
13227         }
13228         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13229         break;
13230     case '|':
13231     case ')':
13232         if (flags & TRYAGAIN) {
13233             *flagp |= TRYAGAIN;
13234             return 0;
13235         }
13236         vFAIL("Internal urp");
13237                                 /* Supposed to be caught earlier. */
13238         break;
13239     case '?':
13240     case '+':
13241     case '*':
13242         RExC_parse++;
13243         vFAIL("Quantifier follows nothing");
13244         break;
13245     case '\\':
13246         /* Special Escapes
13247
13248            This switch handles escape sequences that resolve to some kind
13249            of special regop and not to literal text. Escape sequences that
13250            resolve to literal text are handled below in the switch marked
13251            "Literal Escapes".
13252
13253            Every entry in this switch *must* have a corresponding entry
13254            in the literal escape switch. However, the opposite is not
13255            required, as the default for this switch is to jump to the
13256            literal text handling code.
13257         */
13258         RExC_parse++;
13259         switch ((U8)*RExC_parse) {
13260         /* Special Escapes */
13261         case 'A':
13262             RExC_seen_zerolen++;
13263             ret = reg_node(pRExC_state, SBOL);
13264             /* SBOL is shared with /^/ so we set the flags so we can tell
13265              * /\A/ from /^/ in split. */
13266             FLAGS(REGNODE_p(ret)) = 1;
13267             *flagp |= SIMPLE;
13268             goto finish_meta_pat;
13269         case 'G':
13270             ret = reg_node(pRExC_state, GPOS);
13271             RExC_seen |= REG_GPOS_SEEN;
13272             *flagp |= SIMPLE;
13273             goto finish_meta_pat;
13274         case 'K':
13275             RExC_seen_zerolen++;
13276             ret = reg_node(pRExC_state, KEEPS);
13277             *flagp |= SIMPLE;
13278             /* XXX:dmq : disabling in-place substitution seems to
13279              * be necessary here to avoid cases of memory corruption, as
13280              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13281              */
13282             RExC_seen |= REG_LOOKBEHIND_SEEN;
13283             goto finish_meta_pat;
13284         case 'Z':
13285             ret = reg_node(pRExC_state, SEOL);
13286             *flagp |= SIMPLE;
13287             RExC_seen_zerolen++;                /* Do not optimize RE away */
13288             goto finish_meta_pat;
13289         case 'z':
13290             ret = reg_node(pRExC_state, EOS);
13291             *flagp |= SIMPLE;
13292             RExC_seen_zerolen++;                /* Do not optimize RE away */
13293             goto finish_meta_pat;
13294         case 'C':
13295             vFAIL("\\C no longer supported");
13296         case 'X':
13297             ret = reg_node(pRExC_state, CLUMP);
13298             *flagp |= HASWIDTH;
13299             goto finish_meta_pat;
13300
13301         case 'W':
13302             invert = 1;
13303             /* FALLTHROUGH */
13304         case 'w':
13305             arg = ANYOF_WORDCHAR;
13306             goto join_posix;
13307
13308         case 'B':
13309             invert = 1;
13310             /* FALLTHROUGH */
13311         case 'b':
13312           {
13313             U8 flags = 0;
13314             regex_charset charset = get_regex_charset(RExC_flags);
13315
13316             RExC_seen_zerolen++;
13317             RExC_seen |= REG_LOOKBEHIND_SEEN;
13318             op = BOUND + charset;
13319
13320             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13321                 flags = TRADITIONAL_BOUND;
13322                 if (op > BOUNDA) {  /* /aa is same as /a */
13323                     op = BOUNDA;
13324                 }
13325             }
13326             else {
13327                 STRLEN length;
13328                 char name = *RExC_parse;
13329                 char * endbrace = NULL;
13330                 RExC_parse += 2;
13331                 if (RExC_parse < RExC_end) {
13332                     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13333                 }
13334
13335                 if (! endbrace) {
13336                     vFAIL2("Missing right brace on \\%c{}", name);
13337                 }
13338                 /* XXX Need to decide whether to take spaces or not.  Should be
13339                  * consistent with \p{}, but that currently is SPACE, which
13340                  * means vertical too, which seems wrong
13341                  * while (isBLANK(*RExC_parse)) {
13342                     RExC_parse++;
13343                 }*/
13344                 if (endbrace == RExC_parse) {
13345                     RExC_parse++;  /* After the '}' */
13346                     vFAIL2("Empty \\%c{}", name);
13347                 }
13348                 length = endbrace - RExC_parse;
13349                 /*while (isBLANK(*(RExC_parse + length - 1))) {
13350                     length--;
13351                 }*/
13352                 switch (*RExC_parse) {
13353                     case 'g':
13354                         if (    length != 1
13355                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
13356                         {
13357                             goto bad_bound_type;
13358                         }
13359                         flags = GCB_BOUND;
13360                         break;
13361                     case 'l':
13362                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13363                             goto bad_bound_type;
13364                         }
13365                         flags = LB_BOUND;
13366                         break;
13367                     case 's':
13368                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13369                             goto bad_bound_type;
13370                         }
13371                         flags = SB_BOUND;
13372                         break;
13373                     case 'w':
13374                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13375                             goto bad_bound_type;
13376                         }
13377                         flags = WB_BOUND;
13378                         break;
13379                     default:
13380                       bad_bound_type:
13381                         RExC_parse = endbrace;
13382                         vFAIL2utf8f(
13383                             "'%" UTF8f "' is an unknown bound type",
13384                             UTF8fARG(UTF, length, endbrace - length));
13385                         NOT_REACHED; /*NOTREACHED*/
13386                 }
13387                 RExC_parse = endbrace;
13388                 REQUIRE_UNI_RULES(flagp, 0);
13389
13390                 if (op == BOUND) {
13391                     op = BOUNDU;
13392                 }
13393                 else if (op >= BOUNDA) {  /* /aa is same as /a */
13394                     op = BOUNDU;
13395                     length += 4;
13396
13397                     /* Don't have to worry about UTF-8, in this message because
13398                      * to get here the contents of the \b must be ASCII */
13399                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
13400                               "Using /u for '%.*s' instead of /%s",
13401                               (unsigned) length,
13402                               endbrace - length + 1,
13403                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13404                               ? ASCII_RESTRICT_PAT_MODS
13405                               : ASCII_MORE_RESTRICT_PAT_MODS);
13406                 }
13407             }
13408
13409             if (op == BOUND) {
13410                 RExC_seen_d_op = TRUE;
13411             }
13412             else if (op == BOUNDL) {
13413                 RExC_contains_locale = 1;
13414             }
13415
13416             if (invert) {
13417                 op += NBOUND - BOUND;
13418             }
13419
13420             ret = reg_node(pRExC_state, op);
13421             FLAGS(REGNODE_p(ret)) = flags;
13422
13423             *flagp |= SIMPLE;
13424
13425             goto finish_meta_pat;
13426           }
13427
13428         case 'D':
13429             invert = 1;
13430             /* FALLTHROUGH */
13431         case 'd':
13432             arg = ANYOF_DIGIT;
13433             if (! DEPENDS_SEMANTICS) {
13434                 goto join_posix;
13435             }
13436
13437             /* \d doesn't have any matches in the upper Latin1 range, hence /d
13438              * is equivalent to /u.  Changing to /u saves some branches at
13439              * runtime */
13440             op = POSIXU;
13441             goto join_posix_op_known;
13442
13443         case 'R':
13444             ret = reg_node(pRExC_state, LNBREAK);
13445             *flagp |= HASWIDTH|SIMPLE;
13446             goto finish_meta_pat;
13447
13448         case 'H':
13449             invert = 1;
13450             /* FALLTHROUGH */
13451         case 'h':
13452             arg = ANYOF_BLANK;
13453             op = POSIXU;
13454             goto join_posix_op_known;
13455
13456         case 'V':
13457             invert = 1;
13458             /* FALLTHROUGH */
13459         case 'v':
13460             arg = ANYOF_VERTWS;
13461             op = POSIXU;
13462             goto join_posix_op_known;
13463
13464         case 'S':
13465             invert = 1;
13466             /* FALLTHROUGH */
13467         case 's':
13468             arg = ANYOF_SPACE;
13469
13470           join_posix:
13471
13472             op = POSIXD + get_regex_charset(RExC_flags);
13473             if (op > POSIXA) {  /* /aa is same as /a */
13474                 op = POSIXA;
13475             }
13476             else if (op == POSIXL) {
13477                 RExC_contains_locale = 1;
13478             }
13479             else if (op == POSIXD) {
13480                 RExC_seen_d_op = TRUE;
13481             }
13482
13483           join_posix_op_known:
13484
13485             if (invert) {
13486                 op += NPOSIXD - POSIXD;
13487             }
13488
13489             ret = reg_node(pRExC_state, op);
13490             FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
13491
13492             *flagp |= HASWIDTH|SIMPLE;
13493             /* FALLTHROUGH */
13494
13495           finish_meta_pat:
13496             if (   UCHARAT(RExC_parse + 1) == '{'
13497                 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13498             {
13499                 RExC_parse += 2;
13500                 vFAIL("Unescaped left brace in regex is illegal here");
13501             }
13502             nextchar(pRExC_state);
13503             Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
13504             break;
13505         case 'p':
13506         case 'P':
13507             RExC_parse--;
13508
13509             ret = regclass(pRExC_state, flagp, depth+1,
13510                            TRUE, /* means just parse this element */
13511                            FALSE, /* don't allow multi-char folds */
13512                            FALSE, /* don't silence non-portable warnings.  It
13513                                      would be a bug if these returned
13514                                      non-portables */
13515                            (bool) RExC_strict,
13516                            TRUE, /* Allow an optimized regnode result */
13517                            NULL);
13518             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13519             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13520              * multi-char folds are allowed.  */
13521             if (!ret)
13522                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13523                       (UV) *flagp);
13524
13525             RExC_parse--;
13526
13527             Set_Node_Offset(REGNODE_p(ret), parse_start);
13528             Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
13529             nextchar(pRExC_state);
13530             break;
13531         case 'N':
13532             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13533              * \N{...} evaluates to a sequence of more than one code points).
13534              * The function call below returns a regnode, which is our result.
13535              * The parameters cause it to fail if the \N{} evaluates to a
13536              * single code point; we handle those like any other literal.  The
13537              * reason that the multicharacter case is handled here and not as
13538              * part of the EXACtish code is because of quantifiers.  In
13539              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13540              * this way makes that Just Happen. dmq.
13541              * join_exact() will join this up with adjacent EXACTish nodes
13542              * later on, if appropriate. */
13543             ++RExC_parse;
13544             if (grok_bslash_N(pRExC_state,
13545                               &ret,     /* Want a regnode returned */
13546                               NULL,     /* Fail if evaluates to a single code
13547                                            point */
13548                               NULL,     /* Don't need a count of how many code
13549                                            points */
13550                               flagp,
13551                               RExC_strict,
13552                               depth)
13553             ) {
13554                 break;
13555             }
13556
13557             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13558
13559             /* Here, evaluates to a single code point.  Go get that */
13560             RExC_parse = parse_start;
13561             goto defchar;
13562
13563         case 'k':    /* Handle \k<NAME> and \k'NAME' */
13564       parse_named_seq:
13565         {
13566             char ch;
13567             if (   RExC_parse >= RExC_end - 1
13568                 || ((   ch = RExC_parse[1]) != '<'
13569                                       && ch != '\''
13570                                       && ch != '{'))
13571             {
13572                 RExC_parse++;
13573                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13574                 vFAIL2("Sequence %.2s... not terminated", parse_start);
13575             } else {
13576                 RExC_parse += 2;
13577                 ret = handle_named_backref(pRExC_state,
13578                                            flagp,
13579                                            parse_start,
13580                                            (ch == '<')
13581                                            ? '>'
13582                                            : (ch == '{')
13583                                              ? '}'
13584                                              : '\'');
13585             }
13586             break;
13587         }
13588         case 'g':
13589         case '1': case '2': case '3': case '4':
13590         case '5': case '6': case '7': case '8': case '9':
13591             {
13592                 I32 num;
13593                 bool hasbrace = 0;
13594
13595                 if (*RExC_parse == 'g') {
13596                     bool isrel = 0;
13597
13598                     RExC_parse++;
13599                     if (*RExC_parse == '{') {
13600                         RExC_parse++;
13601                         hasbrace = 1;
13602                     }
13603                     if (*RExC_parse == '-') {
13604                         RExC_parse++;
13605                         isrel = 1;
13606                     }
13607                     if (hasbrace && !isDIGIT(*RExC_parse)) {
13608                         if (isrel) RExC_parse--;
13609                         RExC_parse -= 2;
13610                         goto parse_named_seq;
13611                     }
13612
13613                     if (RExC_parse >= RExC_end) {
13614                         goto unterminated_g;
13615                     }
13616                     num = S_backref_value(RExC_parse, RExC_end);
13617                     if (num == 0)
13618                         vFAIL("Reference to invalid group 0");
13619                     else if (num == I32_MAX) {
13620                          if (isDIGIT(*RExC_parse))
13621                             vFAIL("Reference to nonexistent group");
13622                         else
13623                           unterminated_g:
13624                             vFAIL("Unterminated \\g... pattern");
13625                     }
13626
13627                     if (isrel) {
13628                         num = RExC_npar - num;
13629                         if (num < 1)
13630                             vFAIL("Reference to nonexistent or unclosed group");
13631                     }
13632                 }
13633                 else {
13634                     num = S_backref_value(RExC_parse, RExC_end);
13635                     /* bare \NNN might be backref or octal - if it is larger
13636                      * than or equal RExC_npar then it is assumed to be an
13637                      * octal escape. Note RExC_npar is +1 from the actual
13638                      * number of parens. */
13639                     /* Note we do NOT check if num == I32_MAX here, as that is
13640                      * handled by the RExC_npar check */
13641
13642                     if (
13643                         /* any numeric escape < 10 is always a backref */
13644                         num > 9
13645                         /* any numeric escape < RExC_npar is a backref */
13646                         && num >= RExC_npar
13647                         /* cannot be an octal escape if it starts with 8 */
13648                         && *RExC_parse != '8'
13649                         /* cannot be an octal escape it it starts with 9 */
13650                         && *RExC_parse != '9'
13651                     ) {
13652                         /* Probably not meant to be a backref, instead likely
13653                          * to be an octal character escape, e.g. \35 or \777.
13654                          * The above logic should make it obvious why using
13655                          * octal escapes in patterns is problematic. - Yves */
13656                         RExC_parse = parse_start;
13657                         goto defchar;
13658                     }
13659                 }
13660
13661                 /* At this point RExC_parse points at a numeric escape like
13662                  * \12 or \88 or something similar, which we should NOT treat
13663                  * as an octal escape. It may or may not be a valid backref
13664                  * escape. For instance \88888888 is unlikely to be a valid
13665                  * backref. */
13666                 while (isDIGIT(*RExC_parse))
13667                     RExC_parse++;
13668                 if (hasbrace) {
13669                     if (*RExC_parse != '}')
13670                         vFAIL("Unterminated \\g{...} pattern");
13671                     RExC_parse++;
13672                 }
13673                 if (num >= (I32)RExC_npar) {
13674
13675                     /* It might be a forward reference; we can't fail until we
13676                      * know, by completing the parse to get all the groups, and
13677                      * then reparsing */
13678                     if (RExC_total_parens > 0)  {
13679                         if (num >= RExC_total_parens)  {
13680                             vFAIL("Reference to nonexistent group");
13681                         }
13682                     }
13683                     else {
13684                         REQUIRE_PARENS_PASS;
13685                     }
13686                 }
13687                 RExC_sawback = 1;
13688                 ret = reganode(pRExC_state,
13689                                ((! FOLD)
13690                                  ? REF
13691                                  : (ASCII_FOLD_RESTRICTED)
13692                                    ? REFFA
13693                                    : (AT_LEAST_UNI_SEMANTICS)
13694                                      ? REFFU
13695                                      : (LOC)
13696                                        ? REFFL
13697                                        : REFF),
13698                                 num);
13699                 if (OP(REGNODE_p(ret)) == REFF) {
13700                     RExC_seen_d_op = TRUE;
13701                 }
13702                 *flagp |= HASWIDTH;
13703
13704                 /* override incorrect value set in reganode MJD */
13705                 Set_Node_Offset(REGNODE_p(ret), parse_start);
13706                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13707                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13708                                         FALSE /* Don't force to /x */ );
13709             }
13710             break;
13711         case '\0':
13712             if (RExC_parse >= RExC_end)
13713                 FAIL("Trailing \\");
13714             /* FALLTHROUGH */
13715         default:
13716             /* Do not generate "unrecognized" warnings here, we fall
13717                back into the quick-grab loop below */
13718             RExC_parse = parse_start;
13719             goto defchar;
13720         } /* end of switch on a \foo sequence */
13721         break;
13722
13723     case '#':
13724
13725         /* '#' comments should have been spaced over before this function was
13726          * called */
13727         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13728         /*
13729         if (RExC_flags & RXf_PMf_EXTENDED) {
13730             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13731             if (RExC_parse < RExC_end)
13732                 goto tryagain;
13733         }
13734         */
13735
13736         /* FALLTHROUGH */
13737
13738     default:
13739           defchar: {
13740
13741             /* Here, we have determined that the next thing is probably a
13742              * literal character.  RExC_parse points to the first byte of its
13743              * definition.  (It still may be an escape sequence that evaluates
13744              * to a single character) */
13745
13746             STRLEN len = 0;
13747             UV ender = 0;
13748             char *p;
13749             char *s;
13750
13751 /* This allows us to fill a node with just enough spare so that if the final
13752  * character folds, its expansion is guaranteed to fit */
13753 #define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
13754
13755             char *s0;
13756             U8 upper_parse = MAX_NODE_STRING_SIZE;
13757
13758             /* We start out as an EXACT node, even if under /i, until we find a
13759              * character which is in a fold.  The algorithm now segregates into
13760              * separate nodes, characters that fold from those that don't under
13761              * /i.  (This hopefully will create nodes that are fixed strings
13762              * even under /i, giving the optimizer something to grab on to.)
13763              * So, if a node has something in it and the next character is in
13764              * the opposite category, that node is closed up, and the function
13765              * returns.  Then regatom is called again, and a new node is
13766              * created for the new category. */
13767             U8 node_type = EXACT;
13768
13769             /* Assume the node will be fully used; the excess is given back at
13770              * the end.  We can't make any other length assumptions, as a byte
13771              * input sequence could shrink down. */
13772             Ptrdiff_t initial_size = STR_SZ(256);
13773
13774             bool next_is_quantifier;
13775             char * oldp = NULL;
13776
13777             /* We can convert EXACTF nodes to EXACTFU if they contain only
13778              * characters that match identically regardless of the target
13779              * string's UTF8ness.  The reason to do this is that EXACTF is not
13780              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
13781              * runtime.
13782              *
13783              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13784              * contain only above-Latin1 characters (hence must be in UTF8),
13785              * which don't participate in folds with Latin1-range characters,
13786              * as the latter's folds aren't known until runtime. */
13787             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
13788
13789             /* Single-character EXACTish nodes are almost always SIMPLE.  This
13790              * allows us to override this as encountered */
13791             U8 maybe_SIMPLE = SIMPLE;
13792
13793             /* Does this node contain something that can't match unless the
13794              * target string is (also) in UTF-8 */
13795             bool requires_utf8_target = FALSE;
13796
13797             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
13798             bool has_ss = FALSE;
13799
13800             /* So is the MICRO SIGN */
13801             bool has_micro_sign = FALSE;
13802
13803             /* Allocate an EXACT node.  The node_type may change below to
13804              * another EXACTish node, but since the size of the node doesn't
13805              * change, it works */
13806             ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
13807             FILL_NODE(ret, node_type);
13808             RExC_emit++;
13809
13810             s = STRING(REGNODE_p(ret));
13811
13812             s0 = s;
13813
13814           reparse:
13815
13816             /* This breaks under rare circumstances.  If folding, we do not
13817              * want to split a node at a character that is a non-final in a
13818              * multi-char fold, as an input string could just happen to want to
13819              * match across the node boundary.  The code at the end of the loop
13820              * looks for this, and backs off until it finds not such a
13821              * character, but it is possible (though extremely, extremely
13822              * unlikely) for all characters in the node to be non-final fold
13823              * ones, in which case we just leave the node fully filled, and
13824              * hope that it doesn't match the string in just the wrong place */
13825
13826             assert( ! UTF     /* Is at the beginning of a character */
13827                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13828                    || UTF8_IS_START(UCHARAT(RExC_parse)));
13829
13830
13831             /* Here, we have a literal character.  Find the maximal string of
13832              * them in the input that we can fit into a single EXACTish node.
13833              * We quit at the first non-literal or when the node gets full, or
13834              * under /i the categorization of folding/non-folding character
13835              * changes */
13836             for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
13837
13838                 /* In most cases each iteration adds one byte to the output.
13839                  * The exceptions override this */
13840                 Size_t added_len = 1;
13841
13842                 oldp = p;
13843
13844                 /* White space has already been ignored */
13845                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
13846                        || ! is_PATWS_safe((p), RExC_end, UTF));
13847
13848                 switch ((U8)*p) {
13849                 case '^':
13850                 case '$':
13851                 case '.':
13852                 case '[':
13853                 case '(':
13854                 case ')':
13855                 case '|':
13856                     goto loopdone;
13857                 case '\\':
13858                     /* Literal Escapes Switch
13859
13860                        This switch is meant to handle escape sequences that
13861                        resolve to a literal character.
13862
13863                        Every escape sequence that represents something
13864                        else, like an assertion or a char class, is handled
13865                        in the switch marked 'Special Escapes' above in this
13866                        routine, but also has an entry here as anything that
13867                        isn't explicitly mentioned here will be treated as
13868                        an unescaped equivalent literal.
13869                     */
13870
13871                     switch ((U8)*++p) {
13872
13873                     /* These are all the special escapes. */
13874                     case 'A':             /* Start assertion */
13875                     case 'b': case 'B':   /* Word-boundary assertion*/
13876                     case 'C':             /* Single char !DANGEROUS! */
13877                     case 'd': case 'D':   /* digit class */
13878                     case 'g': case 'G':   /* generic-backref, pos assertion */
13879                     case 'h': case 'H':   /* HORIZWS */
13880                     case 'k': case 'K':   /* named backref, keep marker */
13881                     case 'p': case 'P':   /* Unicode property */
13882                               case 'R':   /* LNBREAK */
13883                     case 's': case 'S':   /* space class */
13884                     case 'v': case 'V':   /* VERTWS */
13885                     case 'w': case 'W':   /* word class */
13886                     case 'X':             /* eXtended Unicode "combining
13887                                              character sequence" */
13888                     case 'z': case 'Z':   /* End of line/string assertion */
13889                         --p;
13890                         goto loopdone;
13891
13892                     /* Anything after here is an escape that resolves to a
13893                        literal. (Except digits, which may or may not)
13894                      */
13895                     case 'n':
13896                         ender = '\n';
13897                         p++;
13898                         break;
13899                     case 'N': /* Handle a single-code point named character. */
13900                         RExC_parse = p + 1;
13901                         if (! grok_bslash_N(pRExC_state,
13902                                             NULL,   /* Fail if evaluates to
13903                                                        anything other than a
13904                                                        single code point */
13905                                             &ender, /* The returned single code
13906                                                        point */
13907                                             NULL,   /* Don't need a count of
13908                                                        how many code points */
13909                                             flagp,
13910                                             RExC_strict,
13911                                             depth)
13912                         ) {
13913                             if (*flagp & NEED_UTF8)
13914                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
13915                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13916
13917                             /* Here, it wasn't a single code point.  Go close
13918                              * up this EXACTish node.  The switch() prior to
13919                              * this switch handles the other cases */
13920                             RExC_parse = p = oldp;
13921                             goto loopdone;
13922                         }
13923                         p = RExC_parse;
13924                         RExC_parse = parse_start;
13925
13926                         /* The \N{} means the pattern, if previously /d,
13927                          * becomes /u.  That means it can't be an EXACTF node,
13928                          * but an EXACTFU */
13929                         if (node_type == EXACTF) {
13930                             node_type = EXACTFU;
13931
13932                             /* If the node already contains something that
13933                              * differs between EXACTF and EXACTFU, reparse it
13934                              * as EXACTFU */
13935                             if (! maybe_exactfu) {
13936                                 len = 0;
13937                                 s = s0;
13938                                 goto reparse;
13939                             }
13940                         }
13941
13942                         break;
13943                     case 'r':
13944                         ender = '\r';
13945                         p++;
13946                         break;
13947                     case 't':
13948                         ender = '\t';
13949                         p++;
13950                         break;
13951                     case 'f':
13952                         ender = '\f';
13953                         p++;
13954                         break;
13955                     case 'e':
13956                         ender = ESC_NATIVE;
13957                         p++;
13958                         break;
13959                     case 'a':
13960                         ender = '\a';
13961                         p++;
13962                         break;
13963                     case 'o':
13964                         {
13965                             UV result;
13966                             const char* error_msg;
13967
13968                             bool valid = grok_bslash_o(&p,
13969                                                        RExC_end,
13970                                                        &result,
13971                                                        &error_msg,
13972                                                        TO_OUTPUT_WARNINGS(p),
13973                                                        (bool) RExC_strict,
13974                                                        TRUE, /* Output warnings
13975                                                                 for non-
13976                                                                 portables */
13977                                                        UTF);
13978                             if (! valid) {
13979                                 RExC_parse = p; /* going to die anyway; point
13980                                                    to exact spot of failure */
13981                                 vFAIL(error_msg);
13982                             }
13983                             UPDATE_WARNINGS_LOC(p - 1);
13984                             ender = result;
13985                             break;
13986                         }
13987                     case 'x':
13988                         {
13989                             UV result = UV_MAX; /* initialize to erroneous
13990                                                    value */
13991                             const char* error_msg;
13992
13993                             bool valid = grok_bslash_x(&p,
13994                                                        RExC_end,
13995                                                        &result,
13996                                                        &error_msg,
13997                                                        TO_OUTPUT_WARNINGS(p),
13998                                                        (bool) RExC_strict,
13999                                                        TRUE, /* Silence warnings
14000                                                                 for non-
14001                                                                 portables */
14002                                                        UTF);
14003                             if (! valid) {
14004                                 RExC_parse = p; /* going to die anyway; point
14005                                                    to exact spot of failure */
14006                                 vFAIL(error_msg);
14007                             }
14008                             UPDATE_WARNINGS_LOC(p - 1);
14009                             ender = result;
14010
14011                             if (ender < 0x100) {
14012 #ifdef EBCDIC
14013                                 if (RExC_recode_x_to_native) {
14014                                     ender = LATIN1_TO_NATIVE(ender);
14015                                 }
14016 #endif
14017                             }
14018                             break;
14019                         }
14020                     case 'c':
14021                         p++;
14022                         ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
14023                         UPDATE_WARNINGS_LOC(p);
14024                         p++;
14025                         break;
14026                     case '8': case '9': /* must be a backreference */
14027                         --p;
14028                         /* we have an escape like \8 which cannot be an octal escape
14029                          * so we exit the loop, and let the outer loop handle this
14030                          * escape which may or may not be a legitimate backref. */
14031                         goto loopdone;
14032                     case '1': case '2': case '3':case '4':
14033                     case '5': case '6': case '7':
14034                         /* When we parse backslash escapes there is ambiguity
14035                          * between backreferences and octal escapes. Any escape
14036                          * from \1 - \9 is a backreference, any multi-digit
14037                          * escape which does not start with 0 and which when
14038                          * evaluated as decimal could refer to an already
14039                          * parsed capture buffer is a back reference. Anything
14040                          * else is octal.
14041                          *
14042                          * Note this implies that \118 could be interpreted as
14043                          * 118 OR as "\11" . "8" depending on whether there
14044                          * were 118 capture buffers defined already in the
14045                          * pattern.  */
14046
14047                         /* NOTE, RExC_npar is 1 more than the actual number of
14048                          * parens we have seen so far, hence the "<" as opposed
14049                          * to "<=" */
14050                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14051                         {  /* Not to be treated as an octal constant, go
14052                                    find backref */
14053                             --p;
14054                             goto loopdone;
14055                         }
14056                         /* FALLTHROUGH */
14057                     case '0':
14058                         {
14059                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14060                             STRLEN numlen = 3;
14061                             ender = grok_oct(p, &numlen, &flags, NULL);
14062                             p += numlen;
14063                             if (   isDIGIT(*p)  /* like \08, \178 */
14064                                 && ckWARN(WARN_REGEXP)
14065                                 && numlen < 3)
14066                             {
14067                                 reg_warn_non_literal_string(
14068                                          p + 1,
14069                                          form_short_octal_warning(p, numlen));
14070                             }
14071                         }
14072                         break;
14073                     case '\0':
14074                         if (p >= RExC_end)
14075                             FAIL("Trailing \\");
14076                         /* FALLTHROUGH */
14077                     default:
14078                         if (isALPHANUMERIC(*p)) {
14079                             /* An alpha followed by '{' is going to fail next
14080                              * iteration, so don't output this warning in that
14081                              * case */
14082                             if (! isALPHA(*p) || *(p + 1) != '{') {
14083                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14084                                                   " passed through", p);
14085                             }
14086                         }
14087                         goto normal_default;
14088                     } /* End of switch on '\' */
14089                     break;
14090                 case '{':
14091                     /* Trying to gain new uses for '{' without breaking too
14092                      * much existing code is hard.  The solution currently
14093                      * adopted is:
14094                      *  1)  If there is no ambiguity that a '{' should always
14095                      *      be taken literally, at the start of a construct, we
14096                      *      just do so.
14097                      *  2)  If the literal '{' conflicts with our desired use
14098                      *      of it as a metacharacter, we die.  The deprecation
14099                      *      cycles for this have come and gone.
14100                      *  3)  If there is ambiguity, we raise a simple warning.
14101                      *      This could happen, for example, if the user
14102                      *      intended it to introduce a quantifier, but slightly
14103                      *      misspelled the quantifier.  Without this warning,
14104                      *      the quantifier would silently be taken as a literal
14105                      *      string of characters instead of a meta construct */
14106                     if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14107                         if (      RExC_strict
14108                             || (  p > parse_start + 1
14109                                 && isALPHA_A(*(p - 1))
14110                                 && *(p - 2) == '\\')
14111                             || new_regcurly(p, RExC_end))
14112                         {
14113                             RExC_parse = p + 1;
14114                             vFAIL("Unescaped left brace in regex is "
14115                                   "illegal here");
14116                         }
14117                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
14118                                          " passed through");
14119                     }
14120                     goto normal_default;
14121                 case '}':
14122                 case ']':
14123                     if (p > RExC_parse && RExC_strict) {
14124                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14125                     }
14126                     /*FALLTHROUGH*/
14127                 default:    /* A literal character */
14128                   normal_default:
14129                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
14130                         STRLEN numlen;
14131                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14132                                                &numlen, UTF8_ALLOW_DEFAULT);
14133                         p += numlen;
14134                     }
14135                     else
14136                         ender = (U8) *p++;
14137                     break;
14138                 } /* End of switch on the literal */
14139
14140                 /* Here, have looked at the literal character, and <ender>
14141                  * contains its ordinal; <p> points to the character after it.
14142                  * */
14143
14144                 if (ender > 255) {
14145                     REQUIRE_UTF8(flagp);
14146                 }
14147
14148                 /* We need to check if the next non-ignored thing is a
14149                  * quantifier.  Move <p> to after anything that should be
14150                  * ignored, which, as a side effect, positions <p> for the next
14151                  * loop iteration */
14152                 skip_to_be_ignored_text(pRExC_state, &p,
14153                                         FALSE /* Don't force to /x */ );
14154
14155                 /* If the next thing is a quantifier, it applies to this
14156                  * character only, which means that this character has to be in
14157                  * its own node and can't just be appended to the string in an
14158                  * existing node, so if there are already other characters in
14159                  * the node, close the node with just them, and set up to do
14160                  * this character again next time through, when it will be the
14161                  * only thing in its new node */
14162
14163                 next_is_quantifier =    LIKELY(p < RExC_end)
14164                                      && UNLIKELY(ISMULT2(p));
14165
14166                 if (next_is_quantifier && LIKELY(len)) {
14167                     p = oldp;
14168                     goto loopdone;
14169                 }
14170
14171                 /* Ready to add 'ender' to the node */
14172
14173                 if (! FOLD) {  /* The simple case, just append the literal */
14174
14175                       not_fold_common:
14176                         if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14177                             *(s++) = (char) ender;
14178                         }
14179                         else {
14180                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14181                             added_len = (char *) new_s - s;
14182                             s = (char *) new_s;
14183
14184                             if (ender > 255)  {
14185                                 requires_utf8_target = TRUE;
14186                             }
14187                         }
14188                 }
14189                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14190
14191                     /* Here are folding under /l, and the code point is
14192                      * problematic.  If this is the first character in the
14193                      * node, change the node type to folding.   Otherwise, if
14194                      * this is the first problematic character, close up the
14195                      * existing node, so can start a new node with this one */
14196                     if (! len) {
14197                         node_type = EXACTFL;
14198                         RExC_contains_locale = 1;
14199                     }
14200                     else if (node_type == EXACT) {
14201                         p = oldp;
14202                         goto loopdone;
14203                     }
14204
14205                     /* This problematic code point means we can't simplify
14206                      * things */
14207                     maybe_exactfu = FALSE;
14208
14209                     /* Here, we are adding a problematic fold character.
14210                      * "Problematic" in this context means that its fold isn't
14211                      * known until runtime.  (The non-problematic code points
14212                      * are the above-Latin1 ones that fold to also all
14213                      * above-Latin1.  Their folds don't vary no matter what the
14214                      * locale is.) But here we have characters whose fold
14215                      * depends on the locale.  We just add in the unfolded
14216                      * character, and wait until runtime to fold it */
14217                     goto not_fold_common;
14218                 }
14219                 else /* regular fold; see if actually is in a fold */
14220                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14221                          || (ender > 255
14222                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14223                 {
14224                     /* Here, folding, but the character isn't in a fold.
14225                      *
14226                      * Start a new node if previous characters in the node were
14227                      * folded */
14228                     if (len && node_type != EXACT) {
14229                         p = oldp;
14230                         goto loopdone;
14231                     }
14232
14233                     /* Here, continuing a node with non-folded characters.  Add
14234                      * this one */
14235                     goto not_fold_common;
14236                 }
14237                 else {  /* Here, does participate in some fold */
14238
14239                     /* If this is the first character in the node, change its
14240                      * type to folding.  Otherwise, if this is the first
14241                      * folding character in the node, close up the existing
14242                      * node, so can start a new node with this one.  */
14243                     if (! len) {
14244                         node_type = compute_EXACTish(pRExC_state);
14245                     }
14246                     else if (node_type == EXACT) {
14247                         p = oldp;
14248                         goto loopdone;
14249                     }
14250
14251                     if (UTF) {  /* Use the folded value */
14252                         if (UVCHR_IS_INVARIANT(ender)) {
14253                             *(s)++ = (U8) toFOLD(ender);
14254                         }
14255                         else {
14256                             ender = _to_uni_fold_flags(
14257                                     ender,
14258                                     (U8 *) s,
14259                                     &added_len,
14260                                     FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14261                                                     ? FOLD_FLAGS_NOMIX_ASCII
14262                                                     : 0));
14263                             s += added_len;
14264
14265                             if (   ender > 255
14266                                 && LIKELY(ender != GREEK_SMALL_LETTER_MU))
14267                             {
14268                                 /* U+B5 folds to the MU, so its possible for a
14269                                  * non-UTF-8 target to match it */
14270                                 requires_utf8_target = TRUE;
14271                             }
14272                         }
14273                     }
14274                     else {
14275
14276                         /* Here is non-UTF8.  First, see if the character's
14277                          * fold differs between /d and /u. */
14278                         if (PL_fold[ender] != PL_fold_latin1[ender]) {
14279                             maybe_exactfu = FALSE;
14280                         }
14281
14282 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
14283    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
14284                                       || UNICODE_DOT_DOT_VERSION > 0)
14285
14286                         /* On non-ancient Unicode versions, this includes the
14287                          * multi-char fold SHARP S to 'ss' */
14288
14289                         if (   UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
14290                                  || (   isALPHA_FOLD_EQ(ender, 's')
14291                                      && len > 0
14292                                      && isALPHA_FOLD_EQ(*(s-1), 's')))
14293                         {
14294                             /* Here, we have one of the following:
14295                              *  a)  a SHARP S.  This folds to 'ss' only under
14296                              *      /u rules.  If we are in that situation,
14297                              *      fold the SHARP S to 'ss'.  See the comments
14298                              *      for join_exact() as to why we fold this
14299                              *      non-UTF at compile time, and no others.
14300                              *  b)  'ss'.  When under /u, there's nothing
14301                              *      special needed to be done here.  The
14302                              *      previous iteration handled the first 's',
14303                              *      and this iteration will handle the second.
14304                              *      If, on the otherhand it's not /u, we have
14305                              *      to exclude the possibility of moving to /u,
14306                              *      so that we won't generate an unwanted
14307                              *      match, unless, at runtime, the target
14308                              *      string is in UTF-8.
14309                              * */
14310
14311                             has_ss = TRUE;
14312                             maybe_exactfu = FALSE;  /* Can't generate an
14313                                                        EXACTFU node (unless we
14314                                                        already are in one) */
14315                             if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14316                                 maybe_SIMPLE = 0;
14317                                 if (node_type == EXACTFU) {
14318                                     *(s++) = 's';
14319
14320                                     /* Let the code below add in the extra 's' */
14321                                     ender = 's';
14322                                     added_len = 2;
14323                                 }
14324                             }
14325                         }
14326 #endif
14327
14328                         else if (UNLIKELY(ender == MICRO_SIGN)) {
14329                             has_micro_sign = TRUE;
14330                         }
14331
14332                         *(s++) = (char) (DEPENDS_SEMANTICS)
14333                                         ? toFOLD(ender)
14334
14335                                           /* Under /u, the fold of any
14336                                            * character in the 0-255 range
14337                                            * happens to be its lowercase
14338                                            * equivalent, except for LATIN SMALL
14339                                            * LETTER SHARP S, which was handled
14340                                            * above, and the MICRO SIGN, whose
14341                                            * fold requires UTF-8 to represent.
14342                                            * */
14343                                         : toLOWER_L1(ender);
14344                     }
14345                 } /* End of adding current character to the node */
14346
14347                 len += added_len;
14348
14349                 if (next_is_quantifier) {
14350
14351                     /* Here, the next input is a quantifier, and to get here,
14352                      * the current character is the only one in the node. */
14353                     goto loopdone;
14354                 }
14355
14356             } /* End of loop through literal characters */
14357
14358             /* Here we have either exhausted the input or ran out of room in
14359              * the node.  (If we encountered a character that can't be in the
14360              * node, transfer is made directly to <loopdone>, and so we
14361              * wouldn't have fallen off the end of the loop.)  In the latter
14362              * case, we artificially have to split the node into two, because
14363              * we just don't have enough space to hold everything.  This
14364              * creates a problem if the final character participates in a
14365              * multi-character fold in the non-final position, as a match that
14366              * should have occurred won't, due to the way nodes are matched,
14367              * and our artificial boundary.  So back off until we find a non-
14368              * problematic character -- one that isn't at the beginning or
14369              * middle of such a fold.  (Either it doesn't participate in any
14370              * folds, or appears only in the final position of all the folds it
14371              * does participate in.)  A better solution with far fewer false
14372              * positives, and that would fill the nodes more completely, would
14373              * be to actually have available all the multi-character folds to
14374              * test against, and to back-off only far enough to be sure that
14375              * this node isn't ending with a partial one.  <upper_parse> is set
14376              * further below (if we need to reparse the node) to include just
14377              * up through that final non-problematic character that this code
14378              * identifies, so when it is set to less than the full node, we can
14379              * skip the rest of this */
14380             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
14381                 PERL_UINT_FAST8_T backup_count = 0;
14382
14383                 const STRLEN full_len = len;
14384
14385                 assert(len >= MAX_NODE_STRING_SIZE);
14386
14387                 /* Here, <s> points to just beyond where we have output the
14388                  * final character of the node.  Look backwards through the
14389                  * string until find a non- problematic character */
14390
14391                 if (! UTF) {
14392
14393                     /* This has no multi-char folds to non-UTF characters */
14394                     if (ASCII_FOLD_RESTRICTED) {
14395                         goto loopdone;
14396                     }
14397
14398                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
14399                         backup_count++;
14400                     }
14401                     len = s - s0 + 1;
14402                 }
14403                 else {
14404
14405                     /* Point to the first byte of the final character */
14406                     s = (char *) utf8_hop((U8 *) s, -1);
14407
14408                     while (s >= s0) {   /* Search backwards until find
14409                                            a non-problematic char */
14410                         if (UTF8_IS_INVARIANT(*s)) {
14411
14412                             /* There are no ascii characters that participate
14413                              * in multi-char folds under /aa.  In EBCDIC, the
14414                              * non-ascii invariants are all control characters,
14415                              * so don't ever participate in any folds. */
14416                             if (ASCII_FOLD_RESTRICTED
14417                                 || ! IS_NON_FINAL_FOLD(*s))
14418                             {
14419                                 break;
14420                             }
14421                         }
14422                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
14423                             if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
14424                                                                   *s, *(s+1))))
14425                             {
14426                                 break;
14427                             }
14428                         }
14429                         else if (! _invlist_contains_cp(
14430                                         PL_NonFinalFold,
14431                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
14432                         {
14433                             break;
14434                         }
14435
14436                         /* Here, the current character is problematic in that
14437                          * it does occur in the non-final position of some
14438                          * fold, so try the character before it, but have to
14439                          * special case the very first byte in the string, so
14440                          * we don't read outside the string */
14441                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
14442                         backup_count++;
14443                     } /* End of loop backwards through the string */
14444
14445                     /* If there were only problematic characters in the string,
14446                      * <s> will point to before s0, in which case the length
14447                      * should be 0, otherwise include the length of the
14448                      * non-problematic character just found */
14449                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
14450                 }
14451
14452                 /* Here, have found the final character, if any, that is
14453                  * non-problematic as far as ending the node without splitting
14454                  * it across a potential multi-char fold.  <len> contains the
14455                  * number of bytes in the node up-to and including that
14456                  * character, or is 0 if there is no such character, meaning
14457                  * the whole node contains only problematic characters.  In
14458                  * this case, give up and just take the node as-is.  We can't
14459                  * do any better */
14460                 if (len == 0) {
14461                     len = full_len;
14462
14463                 } else {
14464
14465                     /* Here, the node does contain some characters that aren't
14466                      * problematic.  If we didn't have to backup any, then the
14467                      * final character in the node is non-problematic, and we
14468                      * can take the node as-is */
14469                     if (backup_count == 0) {
14470                         goto loopdone;
14471                     }
14472                     else if (backup_count == 1) {
14473
14474                         /* If the final character is problematic, but the
14475                          * penultimate is not, back-off that last character to
14476                          * later start a new node with it */
14477                         p = oldp;
14478                         goto loopdone;
14479                     }
14480
14481                     /* Here, the final non-problematic character is earlier
14482                      * in the input than the penultimate character.  What we do
14483                      * is reparse from the beginning, going up only as far as
14484                      * this final ok one, thus guaranteeing that the node ends
14485                      * in an acceptable character.  The reason we reparse is
14486                      * that we know how far in the character is, but we don't
14487                      * know how to correlate its position with the input parse.
14488                      * An alternate implementation would be to build that
14489                      * correlation as we go along during the original parse,
14490                      * but that would entail extra work for every node, whereas
14491                      * this code gets executed only when the string is too
14492                      * large for the node, and the final two characters are
14493                      * problematic, an infrequent occurrence.  Yet another
14494                      * possible strategy would be to save the tail of the
14495                      * string, and the next time regatom is called, initialize
14496                      * with that.  The problem with this is that unless you
14497                      * back off one more character, you won't be guaranteed
14498                      * regatom will get called again, unless regbranch,
14499                      * regpiece ... are also changed.  If you do back off that
14500                      * extra character, so that there is input guaranteed to
14501                      * force calling regatom, you can't handle the case where
14502                      * just the first character in the node is acceptable.  I
14503                      * (khw) decided to try this method which doesn't have that
14504                      * pitfall; if performance issues are found, we can do a
14505                      * combination of the current approach plus that one */
14506                     upper_parse = len;
14507                     len = 0;
14508                     s = s0;
14509                     goto reparse;
14510                 }
14511             }   /* End of verifying node ends with an appropriate char */
14512
14513           loopdone:   /* Jumped to when encounters something that shouldn't be
14514                          in the node */
14515
14516             /* Free up any over-allocated space */
14517             change_engine_size(pRExC_state, - (initial_size - STR_SZ(len)));
14518
14519             /* I (khw) don't know if you can get here with zero length, but the
14520              * old code handled this situation by creating a zero-length EXACT
14521              * node.  Might as well be NOTHING instead */
14522             if (len == 0) {
14523                 OP(REGNODE_p(ret)) = NOTHING;
14524             }
14525             else {
14526
14527                 /* If the node type is EXACT here, check to see if it
14528                  * should be EXACTL, or EXACT_ONLY8. */
14529                 if (node_type == EXACT) {
14530                     if (LOC) {
14531                         node_type = EXACTL;
14532                     }
14533                     else if (requires_utf8_target) {
14534                         node_type = EXACT_ONLY8;
14535                     }
14536                 } else if (FOLD) {
14537                     if (    UNLIKELY(has_micro_sign || has_ss)
14538                         && (node_type == EXACTFU || (   node_type == EXACTF
14539                                                      && maybe_exactfu)))
14540                     {   /* These two conditions are problematic in non-UTF-8
14541                            EXACTFU nodes. */
14542                         assert(! UTF);
14543                         node_type = EXACTFUP;
14544                     }
14545                     else if (node_type == EXACTFL) {
14546
14547                         /* 'maybe_exactfu' is deliberately set above to
14548                          * indicate this node type, where all code points in it
14549                          * are above 255 */
14550                         if (maybe_exactfu) {
14551                             node_type = EXACTFLU8;
14552                         }
14553                     }
14554                     else if (node_type == EXACTF) {  /* Means is /di */
14555
14556                         /* If 'maybe_exactfu' is clear, then we need to stay
14557                          * /di.  If it is set, it means there are no code
14558                          * points that match differently depending on UTF8ness
14559                          * of the target string, so it can become an EXACTFU
14560                          * node */
14561                         if (! maybe_exactfu) {
14562                             RExC_seen_d_op = TRUE;
14563                         }
14564                         else if (   isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
14565                                  || isALPHA_FOLD_EQ(ender, 's'))
14566                         {
14567                             /* But, if the node begins or ends in an 's' we
14568                              * have to defer changing it into an EXACTFU, as
14569                              * the node could later get joined with another one
14570                              * that ends or begins with 's' creating an 'ss'
14571                              * sequence which would then wrongly match the
14572                              * sharp s without the target being UTF-8.  We
14573                              * create a special node that we resolve later when
14574                              * we join nodes together */
14575
14576                             node_type = EXACTFU_S_EDGE;
14577                         }
14578                         else {
14579                             node_type = EXACTFU;
14580                         }
14581                     }
14582
14583                     if (requires_utf8_target && node_type == EXACTFU) {
14584                         node_type = EXACTFU_ONLY8;
14585                     }
14586                 }
14587
14588                 OP(REGNODE_p(ret)) = node_type;
14589                 STR_LEN(REGNODE_p(ret)) = len;
14590                 RExC_emit += STR_SZ(len);
14591
14592                 /* If the node isn't a single character, it can't be SIMPLE */
14593                 if (len > ((UTF) ? UVCHR_SKIP(ender) : 1)) {
14594                     maybe_SIMPLE = 0;
14595                 }
14596
14597                 *flagp |= HASWIDTH | maybe_SIMPLE;
14598             }
14599
14600             Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
14601             RExC_parse = p;
14602
14603             {
14604                 /* len is STRLEN which is unsigned, need to copy to signed */
14605                 IV iv = len;
14606                 if (iv < 0)
14607                     vFAIL("Internal disaster");
14608             }
14609
14610         } /* End of label 'defchar:' */
14611         break;
14612     } /* End of giant switch on input character */
14613
14614     /* Position parse to next real character */
14615     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14616                                             FALSE /* Don't force to /x */ );
14617     if (   *RExC_parse == '{'
14618         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
14619     {
14620         if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
14621             RExC_parse++;
14622             vFAIL("Unescaped left brace in regex is illegal here");
14623         }
14624         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
14625                                   " passed through");
14626     }
14627
14628     return(ret);
14629 }
14630
14631
14632 STATIC void
14633 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
14634 {
14635     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
14636      * sets up the bitmap and any flags, removing those code points from the
14637      * inversion list, setting it to NULL should it become completely empty */
14638
14639     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
14640     assert(PL_regkind[OP(node)] == ANYOF);
14641
14642     /* There is no bitmap for this node type */
14643     if (OP(node) == ANYOFH) {
14644         return;
14645     }
14646
14647     ANYOF_BITMAP_ZERO(node);
14648     if (*invlist_ptr) {
14649
14650         /* This gets set if we actually need to modify things */
14651         bool change_invlist = FALSE;
14652
14653         UV start, end;
14654
14655         /* Start looking through *invlist_ptr */
14656         invlist_iterinit(*invlist_ptr);
14657         while (invlist_iternext(*invlist_ptr, &start, &end)) {
14658             UV high;
14659             int i;
14660
14661             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
14662                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
14663             }
14664
14665             /* Quit if are above what we should change */
14666             if (start >= NUM_ANYOF_CODE_POINTS) {
14667                 break;
14668             }
14669
14670             change_invlist = TRUE;
14671
14672             /* Set all the bits in the range, up to the max that we are doing */
14673             high = (end < NUM_ANYOF_CODE_POINTS - 1)
14674                    ? end
14675                    : NUM_ANYOF_CODE_POINTS - 1;
14676             for (i = start; i <= (int) high; i++) {
14677                 if (! ANYOF_BITMAP_TEST(node, i)) {
14678                     ANYOF_BITMAP_SET(node, i);
14679                 }
14680             }
14681         }
14682         invlist_iterfinish(*invlist_ptr);
14683
14684         /* Done with loop; remove any code points that are in the bitmap from
14685          * *invlist_ptr; similarly for code points above the bitmap if we have
14686          * a flag to match all of them anyways */
14687         if (change_invlist) {
14688             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
14689         }
14690         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
14691             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
14692         }
14693
14694         /* If have completely emptied it, remove it completely */
14695         if (_invlist_len(*invlist_ptr) == 0) {
14696             SvREFCNT_dec_NN(*invlist_ptr);
14697             *invlist_ptr = NULL;
14698         }
14699     }
14700 }
14701
14702 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
14703    Character classes ([:foo:]) can also be negated ([:^foo:]).
14704    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
14705    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
14706    but trigger failures because they are currently unimplemented. */
14707
14708 #define POSIXCC_DONE(c)   ((c) == ':')
14709 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
14710 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
14711 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
14712
14713 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
14714 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
14715 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
14716
14717 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
14718
14719 /* 'posix_warnings' and 'warn_text' are names of variables in the following
14720  * routine. q.v. */
14721 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
14722         if (posix_warnings) {                                               \
14723             if (! RExC_warn_text ) RExC_warn_text =                         \
14724                                          (AV *) sv_2mortal((SV *) newAV()); \
14725             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
14726                                              WARNING_PREFIX                 \
14727                                              text                           \
14728                                              REPORT_LOCATION,               \
14729                                              REPORT_LOCATION_ARGS(p)));     \
14730         }                                                                   \
14731     } STMT_END
14732 #define CLEAR_POSIX_WARNINGS()                                              \
14733     STMT_START {                                                            \
14734         if (posix_warnings && RExC_warn_text)                               \
14735             av_clear(RExC_warn_text);                                       \
14736     } STMT_END
14737
14738 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
14739     STMT_START {                                                            \
14740         CLEAR_POSIX_WARNINGS();                                             \
14741         return ret;                                                         \
14742     } STMT_END
14743
14744 STATIC int
14745 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
14746
14747     const char * const s,      /* Where the putative posix class begins.
14748                                   Normally, this is one past the '['.  This
14749                                   parameter exists so it can be somewhere
14750                                   besides RExC_parse. */
14751     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
14752                                   NULL */
14753     AV ** posix_warnings,      /* Where to place any generated warnings, or
14754                                   NULL */
14755     const bool check_only      /* Don't die if error */
14756 )
14757 {
14758     /* This parses what the caller thinks may be one of the three POSIX
14759      * constructs:
14760      *  1) a character class, like [:blank:]
14761      *  2) a collating symbol, like [. .]
14762      *  3) an equivalence class, like [= =]
14763      * In the latter two cases, it croaks if it finds a syntactically legal
14764      * one, as these are not handled by Perl.
14765      *
14766      * The main purpose is to look for a POSIX character class.  It returns:
14767      *  a) the class number
14768      *      if it is a completely syntactically and semantically legal class.
14769      *      'updated_parse_ptr', if not NULL, is set to point to just after the
14770      *      closing ']' of the class
14771      *  b) OOB_NAMEDCLASS
14772      *      if it appears that one of the three POSIX constructs was meant, but
14773      *      its specification was somehow defective.  'updated_parse_ptr', if
14774      *      not NULL, is set to point to the character just after the end
14775      *      character of the class.  See below for handling of warnings.
14776      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
14777      *      if it  doesn't appear that a POSIX construct was intended.
14778      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
14779      *      raised.
14780      *
14781      * In b) there may be errors or warnings generated.  If 'check_only' is
14782      * TRUE, then any errors are discarded.  Warnings are returned to the
14783      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
14784      * instead it is NULL, warnings are suppressed.
14785      *
14786      * The reason for this function, and its complexity is that a bracketed
14787      * character class can contain just about anything.  But it's easy to
14788      * mistype the very specific posix class syntax but yielding a valid
14789      * regular bracketed class, so it silently gets compiled into something
14790      * quite unintended.
14791      *
14792      * The solution adopted here maintains backward compatibility except that
14793      * it adds a warning if it looks like a posix class was intended but
14794      * improperly specified.  The warning is not raised unless what is input
14795      * very closely resembles one of the 14 legal posix classes.  To do this,
14796      * it uses fuzzy parsing.  It calculates how many single-character edits it
14797      * would take to transform what was input into a legal posix class.  Only
14798      * if that number is quite small does it think that the intention was a
14799      * posix class.  Obviously these are heuristics, and there will be cases
14800      * where it errs on one side or another, and they can be tweaked as
14801      * experience informs.
14802      *
14803      * The syntax for a legal posix class is:
14804      *
14805      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
14806      *
14807      * What this routine considers syntactically to be an intended posix class
14808      * is this (the comments indicate some restrictions that the pattern
14809      * doesn't show):
14810      *
14811      *  qr/(?x: \[?                         # The left bracket, possibly
14812      *                                      # omitted
14813      *          \h*                         # possibly followed by blanks
14814      *          (?: \^ \h* )?               # possibly a misplaced caret
14815      *          [:;]?                       # The opening class character,
14816      *                                      # possibly omitted.  A typo
14817      *                                      # semi-colon can also be used.
14818      *          \h*
14819      *          \^?                         # possibly a correctly placed
14820      *                                      # caret, but not if there was also
14821      *                                      # a misplaced one
14822      *          \h*
14823      *          .{3,15}                     # The class name.  If there are
14824      *                                      # deviations from the legal syntax,
14825      *                                      # its edit distance must be close
14826      *                                      # to a real class name in order
14827      *                                      # for it to be considered to be
14828      *                                      # an intended posix class.
14829      *          \h*
14830      *          [[:punct:]]?                # The closing class character,
14831      *                                      # possibly omitted.  If not a colon
14832      *                                      # nor semi colon, the class name
14833      *                                      # must be even closer to a valid
14834      *                                      # one
14835      *          \h*
14836      *          \]?                         # The right bracket, possibly
14837      *                                      # omitted.
14838      *     )/
14839      *
14840      * In the above, \h must be ASCII-only.
14841      *
14842      * These are heuristics, and can be tweaked as field experience dictates.
14843      * There will be cases when someone didn't intend to specify a posix class
14844      * that this warns as being so.  The goal is to minimize these, while
14845      * maximizing the catching of things intended to be a posix class that
14846      * aren't parsed as such.
14847      */
14848
14849     const char* p             = s;
14850     const char * const e      = RExC_end;
14851     unsigned complement       = 0;      /* If to complement the class */
14852     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
14853     bool has_opening_bracket  = FALSE;
14854     bool has_opening_colon    = FALSE;
14855     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
14856                                                    valid class */
14857     const char * possible_end = NULL;   /* used for a 2nd parse pass */
14858     const char* name_start;             /* ptr to class name first char */
14859
14860     /* If the number of single-character typos the input name is away from a
14861      * legal name is no more than this number, it is considered to have meant
14862      * the legal name */
14863     int max_distance          = 2;
14864
14865     /* to store the name.  The size determines the maximum length before we
14866      * decide that no posix class was intended.  Should be at least
14867      * sizeof("alphanumeric") */
14868     UV input_text[15];
14869     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
14870
14871     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
14872
14873     CLEAR_POSIX_WARNINGS();
14874
14875     if (p >= e) {
14876         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14877     }
14878
14879     if (*(p - 1) != '[') {
14880         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
14881         found_problem = TRUE;
14882     }
14883     else {
14884         has_opening_bracket = TRUE;
14885     }
14886
14887     /* They could be confused and think you can put spaces between the
14888      * components */
14889     if (isBLANK(*p)) {
14890         found_problem = TRUE;
14891
14892         do {
14893             p++;
14894         } while (p < e && isBLANK(*p));
14895
14896         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14897     }
14898
14899     /* For [. .] and [= =].  These are quite different internally from [: :],
14900      * so they are handled separately.  */
14901     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
14902                                             and 1 for at least one char in it
14903                                           */
14904     {
14905         const char open_char  = *p;
14906         const char * temp_ptr = p + 1;
14907
14908         /* These two constructs are not handled by perl, and if we find a
14909          * syntactically valid one, we croak.  khw, who wrote this code, finds
14910          * this explanation of them very unclear:
14911          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
14912          * And searching the rest of the internet wasn't very helpful either.
14913          * It looks like just about any byte can be in these constructs,
14914          * depending on the locale.  But unless the pattern is being compiled
14915          * under /l, which is very rare, Perl runs under the C or POSIX locale.
14916          * In that case, it looks like [= =] isn't allowed at all, and that
14917          * [. .] could be any single code point, but for longer strings the
14918          * constituent characters would have to be the ASCII alphabetics plus
14919          * the minus-hyphen.  Any sensible locale definition would limit itself
14920          * to these.  And any portable one definitely should.  Trying to parse
14921          * the general case is a nightmare (see [perl #127604]).  So, this code
14922          * looks only for interiors of these constructs that match:
14923          *      qr/.|[-\w]{2,}/
14924          * Using \w relaxes the apparent rules a little, without adding much
14925          * danger of mistaking something else for one of these constructs.
14926          *
14927          * [. .] in some implementations described on the internet is usable to
14928          * escape a character that otherwise is special in bracketed character
14929          * classes.  For example [.].] means a literal right bracket instead of
14930          * the ending of the class
14931          *
14932          * [= =] can legitimately contain a [. .] construct, but we don't
14933          * handle this case, as that [. .] construct will later get parsed
14934          * itself and croak then.  And [= =] is checked for even when not under
14935          * /l, as Perl has long done so.
14936          *
14937          * The code below relies on there being a trailing NUL, so it doesn't
14938          * have to keep checking if the parse ptr < e.
14939          */
14940         if (temp_ptr[1] == open_char) {
14941             temp_ptr++;
14942         }
14943         else while (    temp_ptr < e
14944                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
14945         {
14946             temp_ptr++;
14947         }
14948
14949         if (*temp_ptr == open_char) {
14950             temp_ptr++;
14951             if (*temp_ptr == ']') {
14952                 temp_ptr++;
14953                 if (! found_problem && ! check_only) {
14954                     RExC_parse = (char *) temp_ptr;
14955                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
14956                             "extensions", open_char, open_char);
14957                 }
14958
14959                 /* Here, the syntax wasn't completely valid, or else the call
14960                  * is to check-only */
14961                 if (updated_parse_ptr) {
14962                     *updated_parse_ptr = (char *) temp_ptr;
14963                 }
14964
14965                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
14966             }
14967         }
14968
14969         /* If we find something that started out to look like one of these
14970          * constructs, but isn't, we continue below so that it can be checked
14971          * for being a class name with a typo of '.' or '=' instead of a colon.
14972          * */
14973     }
14974
14975     /* Here, we think there is a possibility that a [: :] class was meant, and
14976      * we have the first real character.  It could be they think the '^' comes
14977      * first */
14978     if (*p == '^') {
14979         found_problem = TRUE;
14980         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
14981         complement = 1;
14982         p++;
14983
14984         if (isBLANK(*p)) {
14985             found_problem = TRUE;
14986
14987             do {
14988                 p++;
14989             } while (p < e && isBLANK(*p));
14990
14991             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14992         }
14993     }
14994
14995     /* But the first character should be a colon, which they could have easily
14996      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
14997      * distinguish from a colon, so treat that as a colon).  */
14998     if (*p == ':') {
14999         p++;
15000         has_opening_colon = TRUE;
15001     }
15002     else if (*p == ';') {
15003         found_problem = TRUE;
15004         p++;
15005         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15006         has_opening_colon = TRUE;
15007     }
15008     else {
15009         found_problem = TRUE;
15010         ADD_POSIX_WARNING(p, "there must be a starting ':'");
15011
15012         /* Consider an initial punctuation (not one of the recognized ones) to
15013          * be a left terminator */
15014         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15015             p++;
15016         }
15017     }
15018
15019     /* They may think that you can put spaces between the components */
15020     if (isBLANK(*p)) {
15021         found_problem = TRUE;
15022
15023         do {
15024             p++;
15025         } while (p < e && isBLANK(*p));
15026
15027         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15028     }
15029
15030     if (*p == '^') {
15031
15032         /* We consider something like [^:^alnum:]] to not have been intended to
15033          * be a posix class, but XXX maybe we should */
15034         if (complement) {
15035             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15036         }
15037
15038         complement = 1;
15039         p++;
15040     }
15041
15042     /* Again, they may think that you can put spaces between the components */
15043     if (isBLANK(*p)) {
15044         found_problem = TRUE;
15045
15046         do {
15047             p++;
15048         } while (p < e && isBLANK(*p));
15049
15050         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15051     }
15052
15053     if (*p == ']') {
15054
15055         /* XXX This ']' may be a typo, and something else was meant.  But
15056          * treating it as such creates enough complications, that that
15057          * possibility isn't currently considered here.  So we assume that the
15058          * ']' is what is intended, and if we've already found an initial '[',
15059          * this leaves this construct looking like [:] or [:^], which almost
15060          * certainly weren't intended to be posix classes */
15061         if (has_opening_bracket) {
15062             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15063         }
15064
15065         /* But this function can be called when we parse the colon for
15066          * something like qr/[alpha:]]/, so we back up to look for the
15067          * beginning */
15068         p--;
15069
15070         if (*p == ';') {
15071             found_problem = TRUE;
15072             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15073         }
15074         else if (*p != ':') {
15075
15076             /* XXX We are currently very restrictive here, so this code doesn't
15077              * consider the possibility that, say, /[alpha.]]/ was intended to
15078              * be a posix class. */
15079             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15080         }
15081
15082         /* Here we have something like 'foo:]'.  There was no initial colon,
15083          * and we back up over 'foo.  XXX Unlike the going forward case, we
15084          * don't handle typos of non-word chars in the middle */
15085         has_opening_colon = FALSE;
15086         p--;
15087
15088         while (p > RExC_start && isWORDCHAR(*p)) {
15089             p--;
15090         }
15091         p++;
15092
15093         /* Here, we have positioned ourselves to where we think the first
15094          * character in the potential class is */
15095     }
15096
15097     /* Now the interior really starts.  There are certain key characters that
15098      * can end the interior, or these could just be typos.  To catch both
15099      * cases, we may have to do two passes.  In the first pass, we keep on
15100      * going unless we come to a sequence that matches
15101      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
15102      * This means it takes a sequence to end the pass, so two typos in a row if
15103      * that wasn't what was intended.  If the class is perfectly formed, just
15104      * this one pass is needed.  We also stop if there are too many characters
15105      * being accumulated, but this number is deliberately set higher than any
15106      * real class.  It is set high enough so that someone who thinks that
15107      * 'alphanumeric' is a correct name would get warned that it wasn't.
15108      * While doing the pass, we keep track of where the key characters were in
15109      * it.  If we don't find an end to the class, and one of the key characters
15110      * was found, we redo the pass, but stop when we get to that character.
15111      * Thus the key character was considered a typo in the first pass, but a
15112      * terminator in the second.  If two key characters are found, we stop at
15113      * the second one in the first pass.  Again this can miss two typos, but
15114      * catches a single one
15115      *
15116      * In the first pass, 'possible_end' starts as NULL, and then gets set to
15117      * point to the first key character.  For the second pass, it starts as -1.
15118      * */
15119
15120     name_start = p;
15121   parse_name:
15122     {
15123         bool has_blank               = FALSE;
15124         bool has_upper               = FALSE;
15125         bool has_terminating_colon   = FALSE;
15126         bool has_terminating_bracket = FALSE;
15127         bool has_semi_colon          = FALSE;
15128         unsigned int name_len        = 0;
15129         int punct_count              = 0;
15130
15131         while (p < e) {
15132
15133             /* Squeeze out blanks when looking up the class name below */
15134             if (isBLANK(*p) ) {
15135                 has_blank = TRUE;
15136                 found_problem = TRUE;
15137                 p++;
15138                 continue;
15139             }
15140
15141             /* The name will end with a punctuation */
15142             if (isPUNCT(*p)) {
15143                 const char * peek = p + 1;
15144
15145                 /* Treat any non-']' punctuation followed by a ']' (possibly
15146                  * with intervening blanks) as trying to terminate the class.
15147                  * ']]' is very likely to mean a class was intended (but
15148                  * missing the colon), but the warning message that gets
15149                  * generated shows the error position better if we exit the
15150                  * loop at the bottom (eventually), so skip it here. */
15151                 if (*p != ']') {
15152                     if (peek < e && isBLANK(*peek)) {
15153                         has_blank = TRUE;
15154                         found_problem = TRUE;
15155                         do {
15156                             peek++;
15157                         } while (peek < e && isBLANK(*peek));
15158                     }
15159
15160                     if (peek < e && *peek == ']') {
15161                         has_terminating_bracket = TRUE;
15162                         if (*p == ':') {
15163                             has_terminating_colon = TRUE;
15164                         }
15165                         else if (*p == ';') {
15166                             has_semi_colon = TRUE;
15167                             has_terminating_colon = TRUE;
15168                         }
15169                         else {
15170                             found_problem = TRUE;
15171                         }
15172                         p = peek + 1;
15173                         goto try_posix;
15174                     }
15175                 }
15176
15177                 /* Here we have punctuation we thought didn't end the class.
15178                  * Keep track of the position of the key characters that are
15179                  * more likely to have been class-enders */
15180                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15181
15182                     /* Allow just one such possible class-ender not actually
15183                      * ending the class. */
15184                     if (possible_end) {
15185                         break;
15186                     }
15187                     possible_end = p;
15188                 }
15189
15190                 /* If we have too many punctuation characters, no use in
15191                  * keeping going */
15192                 if (++punct_count > max_distance) {
15193                     break;
15194                 }
15195
15196                 /* Treat the punctuation as a typo. */
15197                 input_text[name_len++] = *p;
15198                 p++;
15199             }
15200             else if (isUPPER(*p)) { /* Use lowercase for lookup */
15201                 input_text[name_len++] = toLOWER(*p);
15202                 has_upper = TRUE;
15203                 found_problem = TRUE;
15204                 p++;
15205             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15206                 input_text[name_len++] = *p;
15207                 p++;
15208             }
15209             else {
15210                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15211                 p+= UTF8SKIP(p);
15212             }
15213
15214             /* The declaration of 'input_text' is how long we allow a potential
15215              * class name to be, before saying they didn't mean a class name at
15216              * all */
15217             if (name_len >= C_ARRAY_LENGTH(input_text)) {
15218                 break;
15219             }
15220         }
15221
15222         /* We get to here when the possible class name hasn't been properly
15223          * terminated before:
15224          *   1) we ran off the end of the pattern; or
15225          *   2) found two characters, each of which might have been intended to
15226          *      be the name's terminator
15227          *   3) found so many punctuation characters in the purported name,
15228          *      that the edit distance to a valid one is exceeded
15229          *   4) we decided it was more characters than anyone could have
15230          *      intended to be one. */
15231
15232         found_problem = TRUE;
15233
15234         /* In the final two cases, we know that looking up what we've
15235          * accumulated won't lead to a match, even a fuzzy one. */
15236         if (   name_len >= C_ARRAY_LENGTH(input_text)
15237             || punct_count > max_distance)
15238         {
15239             /* If there was an intermediate key character that could have been
15240              * an intended end, redo the parse, but stop there */
15241             if (possible_end && possible_end != (char *) -1) {
15242                 possible_end = (char *) -1; /* Special signal value to say
15243                                                we've done a first pass */
15244                 p = name_start;
15245                 goto parse_name;
15246             }
15247
15248             /* Otherwise, it can't have meant to have been a class */
15249             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15250         }
15251
15252         /* If we ran off the end, and the final character was a punctuation
15253          * one, back up one, to look at that final one just below.  Later, we
15254          * will restore the parse pointer if appropriate */
15255         if (name_len && p == e && isPUNCT(*(p-1))) {
15256             p--;
15257             name_len--;
15258         }
15259
15260         if (p < e && isPUNCT(*p)) {
15261             if (*p == ']') {
15262                 has_terminating_bracket = TRUE;
15263
15264                 /* If this is a 2nd ']', and the first one is just below this
15265                  * one, consider that to be the real terminator.  This gives a
15266                  * uniform and better positioning for the warning message  */
15267                 if (   possible_end
15268                     && possible_end != (char *) -1
15269                     && *possible_end == ']'
15270                     && name_len && input_text[name_len - 1] == ']')
15271                 {
15272                     name_len--;
15273                     p = possible_end;
15274
15275                     /* And this is actually equivalent to having done the 2nd
15276                      * pass now, so set it to not try again */
15277                     possible_end = (char *) -1;
15278                 }
15279             }
15280             else {
15281                 if (*p == ':') {
15282                     has_terminating_colon = TRUE;
15283                 }
15284                 else if (*p == ';') {
15285                     has_semi_colon = TRUE;
15286                     has_terminating_colon = TRUE;
15287                 }
15288                 p++;
15289             }
15290         }
15291
15292     try_posix:
15293
15294         /* Here, we have a class name to look up.  We can short circuit the
15295          * stuff below for short names that can't possibly be meant to be a
15296          * class name.  (We can do this on the first pass, as any second pass
15297          * will yield an even shorter name) */
15298         if (name_len < 3) {
15299             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15300         }
15301
15302         /* Find which class it is.  Initially switch on the length of the name.
15303          * */
15304         switch (name_len) {
15305             case 4:
15306                 if (memEQs(name_start, 4, "word")) {
15307                     /* this is not POSIX, this is the Perl \w */
15308                     class_number = ANYOF_WORDCHAR;
15309                 }
15310                 break;
15311             case 5:
15312                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
15313                  *                        graph lower print punct space upper
15314                  * Offset 4 gives the best switch position.  */
15315                 switch (name_start[4]) {
15316                     case 'a':
15317                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
15318                             class_number = ANYOF_ALPHA;
15319                         break;
15320                     case 'e':
15321                         if (memBEGINs(name_start, 5, "spac")) /* space */
15322                             class_number = ANYOF_SPACE;
15323                         break;
15324                     case 'h':
15325                         if (memBEGINs(name_start, 5, "grap")) /* graph */
15326                             class_number = ANYOF_GRAPH;
15327                         break;
15328                     case 'i':
15329                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
15330                             class_number = ANYOF_ASCII;
15331                         break;
15332                     case 'k':
15333                         if (memBEGINs(name_start, 5, "blan")) /* blank */
15334                             class_number = ANYOF_BLANK;
15335                         break;
15336                     case 'l':
15337                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
15338                             class_number = ANYOF_CNTRL;
15339                         break;
15340                     case 'm':
15341                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
15342                             class_number = ANYOF_ALPHANUMERIC;
15343                         break;
15344                     case 'r':
15345                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
15346                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
15347                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
15348                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
15349                         break;
15350                     case 't':
15351                         if (memBEGINs(name_start, 5, "digi")) /* digit */
15352                             class_number = ANYOF_DIGIT;
15353                         else if (memBEGINs(name_start, 5, "prin")) /* print */
15354                             class_number = ANYOF_PRINT;
15355                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
15356                             class_number = ANYOF_PUNCT;
15357                         break;
15358                 }
15359                 break;
15360             case 6:
15361                 if (memEQs(name_start, 6, "xdigit"))
15362                     class_number = ANYOF_XDIGIT;
15363                 break;
15364         }
15365
15366         /* If the name exactly matches a posix class name the class number will
15367          * here be set to it, and the input almost certainly was meant to be a
15368          * posix class, so we can skip further checking.  If instead the syntax
15369          * is exactly correct, but the name isn't one of the legal ones, we
15370          * will return that as an error below.  But if neither of these apply,
15371          * it could be that no posix class was intended at all, or that one
15372          * was, but there was a typo.  We tease these apart by doing fuzzy
15373          * matching on the name */
15374         if (class_number == OOB_NAMEDCLASS && found_problem) {
15375             const UV posix_names[][6] = {
15376                                                 { 'a', 'l', 'n', 'u', 'm' },
15377                                                 { 'a', 'l', 'p', 'h', 'a' },
15378                                                 { 'a', 's', 'c', 'i', 'i' },
15379                                                 { 'b', 'l', 'a', 'n', 'k' },
15380                                                 { 'c', 'n', 't', 'r', 'l' },
15381                                                 { 'd', 'i', 'g', 'i', 't' },
15382                                                 { 'g', 'r', 'a', 'p', 'h' },
15383                                                 { 'l', 'o', 'w', 'e', 'r' },
15384                                                 { 'p', 'r', 'i', 'n', 't' },
15385                                                 { 'p', 'u', 'n', 'c', 't' },
15386                                                 { 's', 'p', 'a', 'c', 'e' },
15387                                                 { 'u', 'p', 'p', 'e', 'r' },
15388                                                 { 'w', 'o', 'r', 'd' },
15389                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
15390                                             };
15391             /* The names of the above all have added NULs to make them the same
15392              * size, so we need to also have the real lengths */
15393             const UV posix_name_lengths[] = {
15394                                                 sizeof("alnum") - 1,
15395                                                 sizeof("alpha") - 1,
15396                                                 sizeof("ascii") - 1,
15397                                                 sizeof("blank") - 1,
15398                                                 sizeof("cntrl") - 1,
15399                                                 sizeof("digit") - 1,
15400                                                 sizeof("graph") - 1,
15401                                                 sizeof("lower") - 1,
15402                                                 sizeof("print") - 1,
15403                                                 sizeof("punct") - 1,
15404                                                 sizeof("space") - 1,
15405                                                 sizeof("upper") - 1,
15406                                                 sizeof("word")  - 1,
15407                                                 sizeof("xdigit")- 1
15408                                             };
15409             unsigned int i;
15410             int temp_max = max_distance;    /* Use a temporary, so if we
15411                                                reparse, we haven't changed the
15412                                                outer one */
15413
15414             /* Use a smaller max edit distance if we are missing one of the
15415              * delimiters */
15416             if (   has_opening_bracket + has_opening_colon < 2
15417                 || has_terminating_bracket + has_terminating_colon < 2)
15418             {
15419                 temp_max--;
15420             }
15421
15422             /* See if the input name is close to a legal one */
15423             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
15424
15425                 /* Short circuit call if the lengths are too far apart to be
15426                  * able to match */
15427                 if (abs( (int) (name_len - posix_name_lengths[i]))
15428                     > temp_max)
15429                 {
15430                     continue;
15431                 }
15432
15433                 if (edit_distance(input_text,
15434                                   posix_names[i],
15435                                   name_len,
15436                                   posix_name_lengths[i],
15437                                   temp_max
15438                                  )
15439                     > -1)
15440                 { /* If it is close, it probably was intended to be a class */
15441                     goto probably_meant_to_be;
15442                 }
15443             }
15444
15445             /* Here the input name is not close enough to a valid class name
15446              * for us to consider it to be intended to be a posix class.  If
15447              * we haven't already done so, and the parse found a character that
15448              * could have been terminators for the name, but which we absorbed
15449              * as typos during the first pass, repeat the parse, signalling it
15450              * to stop at that character */
15451             if (possible_end && possible_end != (char *) -1) {
15452                 possible_end = (char *) -1;
15453                 p = name_start;
15454                 goto parse_name;
15455             }
15456
15457             /* Here neither pass found a close-enough class name */
15458             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15459         }
15460
15461     probably_meant_to_be:
15462
15463         /* Here we think that a posix specification was intended.  Update any
15464          * parse pointer */
15465         if (updated_parse_ptr) {
15466             *updated_parse_ptr = (char *) p;
15467         }
15468
15469         /* If a posix class name was intended but incorrectly specified, we
15470          * output or return the warnings */
15471         if (found_problem) {
15472
15473             /* We set flags for these issues in the parse loop above instead of
15474              * adding them to the list of warnings, because we can parse it
15475              * twice, and we only want one warning instance */
15476             if (has_upper) {
15477                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
15478             }
15479             if (has_blank) {
15480                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15481             }
15482             if (has_semi_colon) {
15483                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15484             }
15485             else if (! has_terminating_colon) {
15486                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
15487             }
15488             if (! has_terminating_bracket) {
15489                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
15490             }
15491
15492             if (   posix_warnings
15493                 && RExC_warn_text
15494                 && av_top_index(RExC_warn_text) > -1)
15495             {
15496                 *posix_warnings = RExC_warn_text;
15497             }
15498         }
15499         else if (class_number != OOB_NAMEDCLASS) {
15500             /* If it is a known class, return the class.  The class number
15501              * #defines are structured so each complement is +1 to the normal
15502              * one */
15503             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
15504         }
15505         else if (! check_only) {
15506
15507             /* Here, it is an unrecognized class.  This is an error (unless the
15508             * call is to check only, which we've already handled above) */
15509             const char * const complement_string = (complement)
15510                                                    ? "^"
15511                                                    : "";
15512             RExC_parse = (char *) p;
15513             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
15514                         complement_string,
15515                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
15516         }
15517     }
15518
15519     return OOB_NAMEDCLASS;
15520 }
15521 #undef ADD_POSIX_WARNING
15522
15523 STATIC unsigned  int
15524 S_regex_set_precedence(const U8 my_operator) {
15525
15526     /* Returns the precedence in the (?[...]) construct of the input operator,
15527      * specified by its character representation.  The precedence follows
15528      * general Perl rules, but it extends this so that ')' and ']' have (low)
15529      * precedence even though they aren't really operators */
15530
15531     switch (my_operator) {
15532         case '!':
15533             return 5;
15534         case '&':
15535             return 4;
15536         case '^':
15537         case '|':
15538         case '+':
15539         case '-':
15540             return 3;
15541         case ')':
15542             return 2;
15543         case ']':
15544             return 1;
15545     }
15546
15547     NOT_REACHED; /* NOTREACHED */
15548     return 0;   /* Silence compiler warning */
15549 }
15550
15551 STATIC regnode_offset
15552 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
15553                     I32 *flagp, U32 depth,
15554                     char * const oregcomp_parse)
15555 {
15556     /* Handle the (?[...]) construct to do set operations */
15557
15558     U8 curchar;                     /* Current character being parsed */
15559     UV start, end;                  /* End points of code point ranges */
15560     SV* final = NULL;               /* The end result inversion list */
15561     SV* result_string;              /* 'final' stringified */
15562     AV* stack;                      /* stack of operators and operands not yet
15563                                        resolved */
15564     AV* fence_stack = NULL;         /* A stack containing the positions in
15565                                        'stack' of where the undealt-with left
15566                                        parens would be if they were actually
15567                                        put there */
15568     /* The 'volatile' is a workaround for an optimiser bug
15569      * in Solaris Studio 12.3. See RT #127455 */
15570     volatile IV fence = 0;          /* Position of where most recent undealt-
15571                                        with left paren in stack is; -1 if none.
15572                                      */
15573     STRLEN len;                     /* Temporary */
15574     regnode_offset node;                  /* Temporary, and final regnode returned by
15575                                        this function */
15576     const bool save_fold = FOLD;    /* Temporary */
15577     char *save_end, *save_parse;    /* Temporaries */
15578     const bool in_locale = LOC;     /* we turn off /l during processing */
15579
15580     GET_RE_DEBUG_FLAGS_DECL;
15581
15582     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
15583
15584     DEBUG_PARSE("xcls");
15585
15586     if (in_locale) {
15587         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
15588     }
15589
15590     /* The use of this operator implies /u.  This is required so that the
15591      * compile time values are valid in all runtime cases */
15592     REQUIRE_UNI_RULES(flagp, 0);
15593
15594     ckWARNexperimental(RExC_parse,
15595                        WARN_EXPERIMENTAL__REGEX_SETS,
15596                        "The regex_sets feature is experimental");
15597
15598     /* Everything in this construct is a metacharacter.  Operands begin with
15599      * either a '\' (for an escape sequence), or a '[' for a bracketed
15600      * character class.  Any other character should be an operator, or
15601      * parenthesis for grouping.  Both types of operands are handled by calling
15602      * regclass() to parse them.  It is called with a parameter to indicate to
15603      * return the computed inversion list.  The parsing here is implemented via
15604      * a stack.  Each entry on the stack is a single character representing one
15605      * of the operators; or else a pointer to an operand inversion list. */
15606
15607 #define IS_OPERATOR(a) SvIOK(a)
15608 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
15609
15610     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
15611      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
15612      * with pronouncing it called it Reverse Polish instead, but now that YOU
15613      * know how to pronounce it you can use the correct term, thus giving due
15614      * credit to the person who invented it, and impressing your geek friends.
15615      * Wikipedia says that the pronounciation of "Ł" has been changing so that
15616      * it is now more like an English initial W (as in wonk) than an L.)
15617      *
15618      * This means that, for example, 'a | b & c' is stored on the stack as
15619      *
15620      * c  [4]
15621      * b  [3]
15622      * &  [2]
15623      * a  [1]
15624      * |  [0]
15625      *
15626      * where the numbers in brackets give the stack [array] element number.
15627      * In this implementation, parentheses are not stored on the stack.
15628      * Instead a '(' creates a "fence" so that the part of the stack below the
15629      * fence is invisible except to the corresponding ')' (this allows us to
15630      * replace testing for parens, by using instead subtraction of the fence
15631      * position).  As new operands are processed they are pushed onto the stack
15632      * (except as noted in the next paragraph).  New operators of higher
15633      * precedence than the current final one are inserted on the stack before
15634      * the lhs operand (so that when the rhs is pushed next, everything will be
15635      * in the correct positions shown above.  When an operator of equal or
15636      * lower precedence is encountered in parsing, all the stacked operations
15637      * of equal or higher precedence are evaluated, leaving the result as the
15638      * top entry on the stack.  This makes higher precedence operations
15639      * evaluate before lower precedence ones, and causes operations of equal
15640      * precedence to left associate.
15641      *
15642      * The only unary operator '!' is immediately pushed onto the stack when
15643      * encountered.  When an operand is encountered, if the top of the stack is
15644      * a '!", the complement is immediately performed, and the '!' popped.  The
15645      * resulting value is treated as a new operand, and the logic in the
15646      * previous paragraph is executed.  Thus in the expression
15647      *      [a] + ! [b]
15648      * the stack looks like
15649      *
15650      * !
15651      * a
15652      * +
15653      *
15654      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
15655      * becomes
15656      *
15657      * !b
15658      * a
15659      * +
15660      *
15661      * A ')' is treated as an operator with lower precedence than all the
15662      * aforementioned ones, which causes all operations on the stack above the
15663      * corresponding '(' to be evaluated down to a single resultant operand.
15664      * Then the fence for the '(' is removed, and the operand goes through the
15665      * algorithm above, without the fence.
15666      *
15667      * A separate stack is kept of the fence positions, so that the position of
15668      * the latest so-far unbalanced '(' is at the top of it.
15669      *
15670      * The ']' ending the construct is treated as the lowest operator of all,
15671      * so that everything gets evaluated down to a single operand, which is the
15672      * result */
15673
15674     sv_2mortal((SV *)(stack = newAV()));
15675     sv_2mortal((SV *)(fence_stack = newAV()));
15676
15677     while (RExC_parse < RExC_end) {
15678         I32 top_index;              /* Index of top-most element in 'stack' */
15679         SV** top_ptr;               /* Pointer to top 'stack' element */
15680         SV* current = NULL;         /* To contain the current inversion list
15681                                        operand */
15682         SV* only_to_avoid_leaks;
15683
15684         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15685                                 TRUE /* Force /x */ );
15686         if (RExC_parse >= RExC_end) {   /* Fail */
15687             break;
15688         }
15689
15690         curchar = UCHARAT(RExC_parse);
15691
15692 redo_curchar:
15693
15694 #ifdef ENABLE_REGEX_SETS_DEBUGGING
15695                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
15696         DEBUG_U(dump_regex_sets_structures(pRExC_state,
15697                                            stack, fence, fence_stack));
15698 #endif
15699
15700         top_index = av_tindex_skip_len_mg(stack);
15701
15702         switch (curchar) {
15703             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
15704             char stacked_operator;  /* The topmost operator on the 'stack'. */
15705             SV* lhs;                /* Operand to the left of the operator */
15706             SV* rhs;                /* Operand to the right of the operator */
15707             SV* fence_ptr;          /* Pointer to top element of the fence
15708                                        stack */
15709
15710             case '(':
15711
15712                 if (   RExC_parse < RExC_end - 2
15713                     && UCHARAT(RExC_parse + 1) == '?'
15714                     && UCHARAT(RExC_parse + 2) == '^')
15715                 {
15716                     /* If is a '(?', could be an embedded '(?^flags:(?[...])'.
15717                      * This happens when we have some thing like
15718                      *
15719                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
15720                      *   ...
15721                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
15722                      *
15723                      * Here we would be handling the interpolated
15724                      * '$thai_or_lao'.  We handle this by a recursive call to
15725                      * ourselves which returns the inversion list the
15726                      * interpolated expression evaluates to.  We use the flags
15727                      * from the interpolated pattern. */
15728                     U32 save_flags = RExC_flags;
15729                     const char * save_parse;
15730
15731                     RExC_parse += 2;        /* Skip past the '(?' */
15732                     save_parse = RExC_parse;
15733
15734                     /* Parse the flags for the '(?'.  We already know the first
15735                      * flag to parse is a '^' */
15736                     parse_lparen_question_flags(pRExC_state);
15737
15738                     if (   RExC_parse >= RExC_end - 4
15739                         || UCHARAT(RExC_parse) != ':'
15740                         || UCHARAT(++RExC_parse) != '('
15741                         || UCHARAT(++RExC_parse) != '?'
15742                         || UCHARAT(++RExC_parse) != '[')
15743                     {
15744
15745                         /* In combination with the above, this moves the
15746                          * pointer to the point just after the first erroneous
15747                          * character. */
15748                         if (RExC_parse >= RExC_end - 4) {
15749                             RExC_parse = RExC_end;
15750                         }
15751                         else if (RExC_parse != save_parse) {
15752                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15753                         }
15754                         vFAIL("Expecting '(?flags:(?[...'");
15755                     }
15756
15757                     /* Recurse, with the meat of the embedded expression */
15758                     RExC_parse++;
15759                     (void) handle_regex_sets(pRExC_state, &current, flagp,
15760                                                     depth+1, oregcomp_parse);
15761
15762                     /* Here, 'current' contains the embedded expression's
15763                      * inversion list, and RExC_parse points to the trailing
15764                      * ']'; the next character should be the ')' */
15765                     RExC_parse++;
15766                     if (UCHARAT(RExC_parse) != ')')
15767                         vFAIL("Expecting close paren for nested extended charclass");
15768
15769                     /* Then the ')' matching the original '(' handled by this
15770                      * case: statement */
15771                     RExC_parse++;
15772                     if (UCHARAT(RExC_parse) != ')')
15773                         vFAIL("Expecting close paren for wrapper for nested extended charclass");
15774
15775                     RExC_flags = save_flags;
15776                     goto handle_operand;
15777                 }
15778
15779                 /* A regular '('.  Look behind for illegal syntax */
15780                 if (top_index - fence >= 0) {
15781                     /* If the top entry on the stack is an operator, it had
15782                      * better be a '!', otherwise the entry below the top
15783                      * operand should be an operator */
15784                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
15785                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
15786                         || (   IS_OPERAND(*top_ptr)
15787                             && (   top_index - fence < 1
15788                                 || ! (stacked_ptr = av_fetch(stack,
15789                                                              top_index - 1,
15790                                                              FALSE))
15791                                 || ! IS_OPERATOR(*stacked_ptr))))
15792                     {
15793                         RExC_parse++;
15794                         vFAIL("Unexpected '(' with no preceding operator");
15795                     }
15796                 }
15797
15798                 /* Stack the position of this undealt-with left paren */
15799                 av_push(fence_stack, newSViv(fence));
15800                 fence = top_index + 1;
15801                 break;
15802
15803             case '\\':
15804                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
15805                  * multi-char folds are allowed.  */
15806                 if (!regclass(pRExC_state, flagp, depth+1,
15807                               TRUE, /* means parse just the next thing */
15808                               FALSE, /* don't allow multi-char folds */
15809                               FALSE, /* don't silence non-portable warnings.  */
15810                               TRUE,  /* strict */
15811                               FALSE, /* Require return to be an ANYOF */
15812                               &current))
15813                 {
15814                     FAIL2("panic: regclass returned failure to handle_sets, "
15815                           "flags=%#" UVxf, (UV) *flagp);
15816                 }
15817
15818                 /* regclass() will return with parsing just the \ sequence,
15819                  * leaving the parse pointer at the next thing to parse */
15820                 RExC_parse--;
15821                 goto handle_operand;
15822
15823             case '[':   /* Is a bracketed character class */
15824             {
15825                 /* See if this is a [:posix:] class. */
15826                 bool is_posix_class = (OOB_NAMEDCLASS
15827                             < handle_possible_posix(pRExC_state,
15828                                                 RExC_parse + 1,
15829                                                 NULL,
15830                                                 NULL,
15831                                                 TRUE /* checking only */));
15832                 /* If it is a posix class, leave the parse pointer at the '['
15833                  * to fool regclass() into thinking it is part of a
15834                  * '[[:posix:]]'. */
15835                 if (! is_posix_class) {
15836                     RExC_parse++;
15837                 }
15838
15839                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
15840                  * multi-char folds are allowed.  */
15841                 if (!regclass(pRExC_state, flagp, depth+1,
15842                                 is_posix_class, /* parse the whole char
15843                                                     class only if not a
15844                                                     posix class */
15845                                 FALSE, /* don't allow multi-char folds */
15846                                 TRUE, /* silence non-portable warnings. */
15847                                 TRUE, /* strict */
15848                                 FALSE, /* Require return to be an ANYOF */
15849                                 &current))
15850                 {
15851                     FAIL2("panic: regclass returned failure to handle_sets, "
15852                           "flags=%#" UVxf, (UV) *flagp);
15853                 }
15854
15855                 if (! current) {
15856                     break;
15857                 }
15858
15859                 /* function call leaves parse pointing to the ']', except if we
15860                  * faked it */
15861                 if (is_posix_class) {
15862                     RExC_parse--;
15863                 }
15864
15865                 goto handle_operand;
15866             }
15867
15868             case ']':
15869                 if (top_index >= 1) {
15870                     goto join_operators;
15871                 }
15872
15873                 /* Only a single operand on the stack: are done */
15874                 goto done;
15875
15876             case ')':
15877                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
15878                     if (UCHARAT(RExC_parse - 1) == ']')  {
15879                         break;
15880                     }
15881                     RExC_parse++;
15882                     vFAIL("Unexpected ')'");
15883                 }
15884
15885                 /* If nothing after the fence, is missing an operand */
15886                 if (top_index - fence < 0) {
15887                     RExC_parse++;
15888                     goto bad_syntax;
15889                 }
15890                 /* If at least two things on the stack, treat this as an
15891                   * operator */
15892                 if (top_index - fence >= 1) {
15893                     goto join_operators;
15894                 }
15895
15896                 /* Here only a single thing on the fenced stack, and there is a
15897                  * fence.  Get rid of it */
15898                 fence_ptr = av_pop(fence_stack);
15899                 assert(fence_ptr);
15900                 fence = SvIV(fence_ptr);
15901                 SvREFCNT_dec_NN(fence_ptr);
15902                 fence_ptr = NULL;
15903
15904                 if (fence < 0) {
15905                     fence = 0;
15906                 }
15907
15908                 /* Having gotten rid of the fence, we pop the operand at the
15909                  * stack top and process it as a newly encountered operand */
15910                 current = av_pop(stack);
15911                 if (IS_OPERAND(current)) {
15912                     goto handle_operand;
15913                 }
15914
15915                 RExC_parse++;
15916                 goto bad_syntax;
15917
15918             case '&':
15919             case '|':
15920             case '+':
15921             case '-':
15922             case '^':
15923
15924                 /* These binary operators should have a left operand already
15925                  * parsed */
15926                 if (   top_index - fence < 0
15927                     || top_index - fence == 1
15928                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
15929                     || ! IS_OPERAND(*top_ptr))
15930                 {
15931                     goto unexpected_binary;
15932                 }
15933
15934                 /* If only the one operand is on the part of the stack visible
15935                  * to us, we just place this operator in the proper position */
15936                 if (top_index - fence < 2) {
15937
15938                     /* Place the operator before the operand */
15939
15940                     SV* lhs = av_pop(stack);
15941                     av_push(stack, newSVuv(curchar));
15942                     av_push(stack, lhs);
15943                     break;
15944                 }
15945
15946                 /* But if there is something else on the stack, we need to
15947                  * process it before this new operator if and only if the
15948                  * stacked operation has equal or higher precedence than the
15949                  * new one */
15950
15951              join_operators:
15952
15953                 /* The operator on the stack is supposed to be below both its
15954                  * operands */
15955                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
15956                     || IS_OPERAND(*stacked_ptr))
15957                 {
15958                     /* But if not, it's legal and indicates we are completely
15959                      * done if and only if we're currently processing a ']',
15960                      * which should be the final thing in the expression */
15961                     if (curchar == ']') {
15962                         goto done;
15963                     }
15964
15965                   unexpected_binary:
15966                     RExC_parse++;
15967                     vFAIL2("Unexpected binary operator '%c' with no "
15968                            "preceding operand", curchar);
15969                 }
15970                 stacked_operator = (char) SvUV(*stacked_ptr);
15971
15972                 if (regex_set_precedence(curchar)
15973                     > regex_set_precedence(stacked_operator))
15974                 {
15975                     /* Here, the new operator has higher precedence than the
15976                      * stacked one.  This means we need to add the new one to
15977                      * the stack to await its rhs operand (and maybe more
15978                      * stuff).  We put it before the lhs operand, leaving
15979                      * untouched the stacked operator and everything below it
15980                      * */
15981                     lhs = av_pop(stack);
15982                     assert(IS_OPERAND(lhs));
15983
15984                     av_push(stack, newSVuv(curchar));
15985                     av_push(stack, lhs);
15986                     break;
15987                 }
15988
15989                 /* Here, the new operator has equal or lower precedence than
15990                  * what's already there.  This means the operation already
15991                  * there should be performed now, before the new one. */
15992
15993                 rhs = av_pop(stack);
15994                 if (! IS_OPERAND(rhs)) {
15995
15996                     /* This can happen when a ! is not followed by an operand,
15997                      * like in /(?[\t &!])/ */
15998                     goto bad_syntax;
15999                 }
16000
16001                 lhs = av_pop(stack);
16002
16003                 if (! IS_OPERAND(lhs)) {
16004
16005                     /* This can happen when there is an empty (), like in
16006                      * /(?[[0]+()+])/ */
16007                     goto bad_syntax;
16008                 }
16009
16010                 switch (stacked_operator) {
16011                     case '&':
16012                         _invlist_intersection(lhs, rhs, &rhs);
16013                         break;
16014
16015                     case '|':
16016                     case '+':
16017                         _invlist_union(lhs, rhs, &rhs);
16018                         break;
16019
16020                     case '-':
16021                         _invlist_subtract(lhs, rhs, &rhs);
16022                         break;
16023
16024                     case '^':   /* The union minus the intersection */
16025                     {
16026                         SV* i = NULL;
16027                         SV* u = NULL;
16028
16029                         _invlist_union(lhs, rhs, &u);
16030                         _invlist_intersection(lhs, rhs, &i);
16031                         _invlist_subtract(u, i, &rhs);
16032                         SvREFCNT_dec_NN(i);
16033                         SvREFCNT_dec_NN(u);
16034                         break;
16035                     }
16036                 }
16037                 SvREFCNT_dec(lhs);
16038
16039                 /* Here, the higher precedence operation has been done, and the
16040                  * result is in 'rhs'.  We overwrite the stacked operator with
16041                  * the result.  Then we redo this code to either push the new
16042                  * operator onto the stack or perform any higher precedence
16043                  * stacked operation */
16044                 only_to_avoid_leaks = av_pop(stack);
16045                 SvREFCNT_dec(only_to_avoid_leaks);
16046                 av_push(stack, rhs);
16047                 goto redo_curchar;
16048
16049             case '!':   /* Highest priority, right associative */
16050
16051                 /* If what's already at the top of the stack is another '!",
16052                  * they just cancel each other out */
16053                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
16054                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16055                 {
16056                     only_to_avoid_leaks = av_pop(stack);
16057                     SvREFCNT_dec(only_to_avoid_leaks);
16058                 }
16059                 else { /* Otherwise, since it's right associative, just push
16060                           onto the stack */
16061                     av_push(stack, newSVuv(curchar));
16062                 }
16063                 break;
16064
16065             default:
16066                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16067                 if (RExC_parse >= RExC_end) {
16068                     break;
16069                 }
16070                 vFAIL("Unexpected character");
16071
16072           handle_operand:
16073
16074             /* Here 'current' is the operand.  If something is already on the
16075              * stack, we have to check if it is a !.  But first, the code above
16076              * may have altered the stack in the time since we earlier set
16077              * 'top_index'.  */
16078
16079             top_index = av_tindex_skip_len_mg(stack);
16080             if (top_index - fence >= 0) {
16081                 /* If the top entry on the stack is an operator, it had better
16082                  * be a '!', otherwise the entry below the top operand should
16083                  * be an operator */
16084                 top_ptr = av_fetch(stack, top_index, FALSE);
16085                 assert(top_ptr);
16086                 if (IS_OPERATOR(*top_ptr)) {
16087
16088                     /* The only permissible operator at the top of the stack is
16089                      * '!', which is applied immediately to this operand. */
16090                     curchar = (char) SvUV(*top_ptr);
16091                     if (curchar != '!') {
16092                         SvREFCNT_dec(current);
16093                         vFAIL2("Unexpected binary operator '%c' with no "
16094                                 "preceding operand", curchar);
16095                     }
16096
16097                     _invlist_invert(current);
16098
16099                     only_to_avoid_leaks = av_pop(stack);
16100                     SvREFCNT_dec(only_to_avoid_leaks);
16101
16102                     /* And we redo with the inverted operand.  This allows
16103                      * handling multiple ! in a row */
16104                     goto handle_operand;
16105                 }
16106                           /* Single operand is ok only for the non-binary ')'
16107                            * operator */
16108                 else if ((top_index - fence == 0 && curchar != ')')
16109                          || (top_index - fence > 0
16110                              && (! (stacked_ptr = av_fetch(stack,
16111                                                            top_index - 1,
16112                                                            FALSE))
16113                                  || IS_OPERAND(*stacked_ptr))))
16114                 {
16115                     SvREFCNT_dec(current);
16116                     vFAIL("Operand with no preceding operator");
16117                 }
16118             }
16119
16120             /* Here there was nothing on the stack or the top element was
16121              * another operand.  Just add this new one */
16122             av_push(stack, current);
16123
16124         } /* End of switch on next parse token */
16125
16126         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16127     } /* End of loop parsing through the construct */
16128
16129     vFAIL("Syntax error in (?[...])");
16130
16131   done:
16132
16133     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16134         if (RExC_parse < RExC_end) {
16135             RExC_parse++;
16136         }
16137
16138         vFAIL("Unexpected ']' with no following ')' in (?[...");
16139     }
16140
16141     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16142         vFAIL("Unmatched (");
16143     }
16144
16145     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
16146         || ((final = av_pop(stack)) == NULL)
16147         || ! IS_OPERAND(final)
16148         || ! is_invlist(final)
16149         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
16150     {
16151       bad_syntax:
16152         SvREFCNT_dec(final);
16153         vFAIL("Incomplete expression within '(?[ ])'");
16154     }
16155
16156     /* Here, 'final' is the resultant inversion list from evaluating the
16157      * expression.  Return it if so requested */
16158     if (return_invlist) {
16159         *return_invlist = final;
16160         return END;
16161     }
16162
16163     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
16164      * expecting a string of ranges and individual code points */
16165     invlist_iterinit(final);
16166     result_string = newSVpvs("");
16167     while (invlist_iternext(final, &start, &end)) {
16168         if (start == end) {
16169             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16170         }
16171         else {
16172             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
16173                                                      start,          end);
16174         }
16175     }
16176
16177     /* About to generate an ANYOF (or similar) node from the inversion list we
16178      * have calculated */
16179     save_parse = RExC_parse;
16180     RExC_parse = SvPV(result_string, len);
16181     save_end = RExC_end;
16182     RExC_end = RExC_parse + len;
16183     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16184
16185     /* We turn off folding around the call, as the class we have constructed
16186      * already has all folding taken into consideration, and we don't want
16187      * regclass() to add to that */
16188     RExC_flags &= ~RXf_PMf_FOLD;
16189     /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16190      * folds are allowed.  */
16191     node = regclass(pRExC_state, flagp, depth+1,
16192                     FALSE, /* means parse the whole char class */
16193                     FALSE, /* don't allow multi-char folds */
16194                     TRUE, /* silence non-portable warnings.  The above may very
16195                              well have generated non-portable code points, but
16196                              they're valid on this machine */
16197                     FALSE, /* similarly, no need for strict */
16198                     FALSE, /* Require return to be an ANYOF */
16199                     NULL
16200                 );
16201
16202     RESTORE_WARNINGS;
16203     RExC_parse = save_parse + 1;
16204     RExC_end = save_end;
16205     SvREFCNT_dec_NN(final);
16206     SvREFCNT_dec_NN(result_string);
16207
16208     if (save_fold) {
16209         RExC_flags |= RXf_PMf_FOLD;
16210     }
16211
16212     if (!node)
16213         FAIL2("panic: regclass returned failure to handle_sets, flags=%#" UVxf,
16214                     PTR2UV(flagp));
16215
16216     /* Fix up the node type if we are in locale.  (We have pretended we are
16217      * under /u for the purposes of regclass(), as this construct will only
16218      * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
16219      * as to cause any warnings about bad locales to be output in regexec.c),
16220      * and add the flag that indicates to check if not in a UTF-8 locale.  The
16221      * reason we above forbid optimization into something other than an ANYOF
16222      * node is simply to minimize the number of code changes in regexec.c.
16223      * Otherwise we would have to create new EXACTish node types and deal with
16224      * them.  This decision could be revisited should this construct become
16225      * popular.
16226      *
16227      * (One might think we could look at the resulting ANYOF node and suppress
16228      * the flag if everything is above 255, as those would be UTF-8 only,
16229      * but this isn't true, as the components that led to that result could
16230      * have been locale-affected, and just happen to cancel each other out
16231      * under UTF-8 locales.) */
16232     if (in_locale) {
16233         set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16234
16235         assert(OP(REGNODE_p(node)) == ANYOF);
16236
16237         OP(REGNODE_p(node)) = ANYOFL;
16238         ANYOF_FLAGS(REGNODE_p(node))
16239                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16240     }
16241
16242     nextchar(pRExC_state);
16243     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16244     return node;
16245 }
16246
16247 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16248
16249 STATIC void
16250 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16251                              AV * stack, const IV fence, AV * fence_stack)
16252 {   /* Dumps the stacks in handle_regex_sets() */
16253
16254     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16255     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16256     SSize_t i;
16257
16258     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16259
16260     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16261
16262     if (stack_top < 0) {
16263         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16264     }
16265     else {
16266         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16267         for (i = stack_top; i >= 0; i--) {
16268             SV ** element_ptr = av_fetch(stack, i, FALSE);
16269             if (! element_ptr) {
16270             }
16271
16272             if (IS_OPERATOR(*element_ptr)) {
16273                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16274                                             (int) i, (int) SvIV(*element_ptr));
16275             }
16276             else {
16277                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16278                 sv_dump(*element_ptr);
16279             }
16280         }
16281     }
16282
16283     if (fence_stack_top < 0) {
16284         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
16285     }
16286     else {
16287         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
16288         for (i = fence_stack_top; i >= 0; i--) {
16289             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
16290             if (! element_ptr) {
16291             }
16292
16293             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
16294                                             (int) i, (int) SvIV(*element_ptr));
16295         }
16296     }
16297 }
16298
16299 #endif
16300
16301 #undef IS_OPERATOR
16302 #undef IS_OPERAND
16303
16304 STATIC void
16305 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
16306 {
16307     /* This adds the Latin1/above-Latin1 folding rules.
16308      *
16309      * This should be called only for a Latin1-range code points, cp, which is
16310      * known to be involved in a simple fold with other code points above
16311      * Latin1.  It would give false results if /aa has been specified.
16312      * Multi-char folds are outside the scope of this, and must be handled
16313      * specially. */
16314
16315     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
16316
16317     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
16318
16319     /* The rules that are valid for all Unicode versions are hard-coded in */
16320     switch (cp) {
16321         case 'k':
16322         case 'K':
16323           *invlist =
16324              add_cp_to_invlist(*invlist, KELVIN_SIGN);
16325             break;
16326         case 's':
16327         case 'S':
16328           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
16329             break;
16330         case MICRO_SIGN:
16331           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
16332           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
16333             break;
16334         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
16335         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
16336           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
16337             break;
16338         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
16339           *invlist = add_cp_to_invlist(*invlist,
16340                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
16341             break;
16342
16343         default:    /* Other code points are checked against the data for the
16344                        current Unicode version */
16345           {
16346             Size_t folds_count;
16347             unsigned int first_fold;
16348             const unsigned int * remaining_folds;
16349             UV folded_cp;
16350
16351             if (isASCII(cp)) {
16352                 folded_cp = toFOLD(cp);
16353             }
16354             else {
16355                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
16356                 Size_t dummy_len;
16357                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
16358             }
16359
16360             if (folded_cp > 255) {
16361                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
16362             }
16363
16364             folds_count = _inverse_folds(folded_cp, &first_fold,
16365                                                     &remaining_folds);
16366             if (folds_count == 0) {
16367
16368                 /* Use deprecated warning to increase the chances of this being
16369                  * output */
16370                 ckWARN2reg_d(RExC_parse,
16371                         "Perl folding rules are not up-to-date for 0x%02X;"
16372                         " please use the perlbug utility to report;", cp);
16373             }
16374             else {
16375                 unsigned int i;
16376
16377                 if (first_fold > 255) {
16378                     *invlist = add_cp_to_invlist(*invlist, first_fold);
16379                 }
16380                 for (i = 0; i < folds_count - 1; i++) {
16381                     if (remaining_folds[i] > 255) {
16382                         *invlist = add_cp_to_invlist(*invlist,
16383                                                     remaining_folds[i]);
16384                     }
16385                 }
16386             }
16387             break;
16388          }
16389     }
16390 }
16391
16392 STATIC void
16393 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
16394 {
16395     /* Output the elements of the array given by '*posix_warnings' as REGEXP
16396      * warnings. */
16397
16398     SV * msg;
16399     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
16400
16401     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
16402
16403     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
16404         return;
16405     }
16406
16407     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
16408         if (first_is_fatal) {           /* Avoid leaking this */
16409             av_undef(posix_warnings);   /* This isn't necessary if the
16410                                             array is mortal, but is a
16411                                             fail-safe */
16412             (void) sv_2mortal(msg);
16413             PREPARE_TO_DIE;
16414         }
16415         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
16416         SvREFCNT_dec_NN(msg);
16417     }
16418
16419     UPDATE_WARNINGS_LOC(RExC_parse);
16420 }
16421
16422 STATIC AV *
16423 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
16424 {
16425     /* This adds the string scalar <multi_string> to the array
16426      * <multi_char_matches>.  <multi_string> is known to have exactly
16427      * <cp_count> code points in it.  This is used when constructing a
16428      * bracketed character class and we find something that needs to match more
16429      * than a single character.
16430      *
16431      * <multi_char_matches> is actually an array of arrays.  Each top-level
16432      * element is an array that contains all the strings known so far that are
16433      * the same length.  And that length (in number of code points) is the same
16434      * as the index of the top-level array.  Hence, the [2] element is an
16435      * array, each element thereof is a string containing TWO code points;
16436      * while element [3] is for strings of THREE characters, and so on.  Since
16437      * this is for multi-char strings there can never be a [0] nor [1] element.
16438      *
16439      * When we rewrite the character class below, we will do so such that the
16440      * longest strings are written first, so that it prefers the longest
16441      * matching strings first.  This is done even if it turns out that any
16442      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
16443      * Christiansen has agreed that this is ok.  This makes the test for the
16444      * ligature 'ffi' come before the test for 'ff', for example */
16445
16446     AV* this_array;
16447     AV** this_array_ptr;
16448
16449     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
16450
16451     if (! multi_char_matches) {
16452         multi_char_matches = newAV();
16453     }
16454
16455     if (av_exists(multi_char_matches, cp_count)) {
16456         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
16457         this_array = *this_array_ptr;
16458     }
16459     else {
16460         this_array = newAV();
16461         av_store(multi_char_matches, cp_count,
16462                  (SV*) this_array);
16463     }
16464     av_push(this_array, multi_string);
16465
16466     return multi_char_matches;
16467 }
16468
16469 /* The names of properties whose definitions are not known at compile time are
16470  * stored in this SV, after a constant heading.  So if the length has been
16471  * changed since initialization, then there is a run-time definition. */
16472 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
16473                                         (SvCUR(listsv) != initial_listsv_len)
16474
16475 /* There is a restricted set of white space characters that are legal when
16476  * ignoring white space in a bracketed character class.  This generates the
16477  * code to skip them.
16478  *
16479  * There is a line below that uses the same white space criteria but is outside
16480  * this macro.  Both here and there must use the same definition */
16481 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
16482     STMT_START {                                                        \
16483         if (do_skip) {                                                  \
16484             while (isBLANK_A(UCHARAT(p)))                               \
16485             {                                                           \
16486                 p++;                                                    \
16487             }                                                           \
16488         }                                                               \
16489     } STMT_END
16490
16491 STATIC regnode_offset
16492 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
16493                  const bool stop_at_1,  /* Just parse the next thing, don't
16494                                            look for a full character class */
16495                  bool allow_multi_folds,
16496                  const bool silence_non_portable,   /* Don't output warnings
16497                                                        about too large
16498                                                        characters */
16499                  const bool strict,
16500                  bool optimizable,                  /* ? Allow a non-ANYOF return
16501                                                        node */
16502                  SV** ret_invlist  /* Return an inversion list, not a node */
16503           )
16504 {
16505     /* parse a bracketed class specification.  Most of these will produce an
16506      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
16507      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
16508      * under /i with multi-character folds: it will be rewritten following the
16509      * paradigm of this example, where the <multi-fold>s are characters which
16510      * fold to multiple character sequences:
16511      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
16512      * gets effectively rewritten as:
16513      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
16514      * reg() gets called (recursively) on the rewritten version, and this
16515      * function will return what it constructs.  (Actually the <multi-fold>s
16516      * aren't physically removed from the [abcdefghi], it's just that they are
16517      * ignored in the recursion by means of a flag:
16518      * <RExC_in_multi_char_class>.)
16519      *
16520      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
16521      * characters, with the corresponding bit set if that character is in the
16522      * list.  For characters above this, a range list or swash is used.  There
16523      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
16524      * determinable at compile time
16525      *
16526      * On success, returns the offset at which any next node should be placed
16527      * into the regex engine program being compiled.
16528      *
16529      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
16530      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
16531      * UTF-8
16532      */
16533
16534     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
16535     IV range = 0;
16536     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
16537     regnode_offset ret;
16538     STRLEN numlen;
16539     int namedclass = OOB_NAMEDCLASS;
16540     char *rangebegin = NULL;
16541     SV *listsv = NULL;
16542     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
16543                                       than just initialized.  */
16544     SV* properties = NULL;    /* Code points that match \p{} \P{} */
16545     SV* posixes = NULL;     /* Code points that match classes like [:word:],
16546                                extended beyond the Latin1 range.  These have to
16547                                be kept separate from other code points for much
16548                                of this function because their handling  is
16549                                different under /i, and for most classes under
16550                                /d as well */
16551     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
16552                                separate for a while from the non-complemented
16553                                versions because of complications with /d
16554                                matching */
16555     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
16556                                   treated more simply than the general case,
16557                                   leading to less compilation and execution
16558                                   work */
16559     UV element_count = 0;   /* Number of distinct elements in the class.
16560                                Optimizations may be possible if this is tiny */
16561     AV * multi_char_matches = NULL; /* Code points that fold to more than one
16562                                        character; used under /i */
16563     UV n;
16564     char * stop_ptr = RExC_end;    /* where to stop parsing */
16565
16566     /* ignore unescaped whitespace? */
16567     const bool skip_white = cBOOL(   ret_invlist
16568                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
16569
16570     /* Unicode properties are stored in a swash; this holds the current one
16571      * being parsed.  If this swash is the only above-latin1 component of the
16572      * character class, an optimization is to pass it directly on to the
16573      * execution engine.  Otherwise, it is set to NULL to indicate that there
16574      * are other things in the class that have to be dealt with at execution
16575      * time */
16576     SV* swash = NULL;           /* Code points that match \p{} \P{} */
16577
16578     /* inversion list of code points this node matches only when the target
16579      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
16580      * /d) */
16581     SV* upper_latin1_only_utf8_matches = NULL;
16582
16583     /* Inversion list of code points this node matches regardless of things
16584      * like locale, folding, utf8ness of the target string */
16585     SV* cp_list = NULL;
16586
16587     /* Like cp_list, but code points on this list need to be checked for things
16588      * that fold to/from them under /i */
16589     SV* cp_foldable_list = NULL;
16590
16591     /* Like cp_list, but code points on this list are valid only when the
16592      * runtime locale is UTF-8 */
16593     SV* only_utf8_locale_list = NULL;
16594
16595     /* In a range, if one of the endpoints is non-character-set portable,
16596      * meaning that it hard-codes a code point that may mean a different
16597      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
16598      * mnemonic '\t' which each mean the same character no matter which
16599      * character set the platform is on. */
16600     unsigned int non_portable_endpoint = 0;
16601
16602     /* Is the range unicode? which means on a platform that isn't 1-1 native
16603      * to Unicode (i.e. non-ASCII), each code point in it should be considered
16604      * to be a Unicode value.  */
16605     bool unicode_range = FALSE;
16606     bool invert = FALSE;    /* Is this class to be complemented */
16607
16608     bool warn_super = ALWAYS_WARN_SUPER;
16609
16610     const char * orig_parse = RExC_parse;
16611
16612     /* This variable is used to mark where the end in the input is of something
16613      * that looks like a POSIX construct but isn't.  During the parse, when
16614      * something looks like it could be such a construct is encountered, it is
16615      * checked for being one, but not if we've already checked this area of the
16616      * input.  Only after this position is reached do we check again */
16617     char *not_posix_region_end = RExC_parse - 1;
16618
16619     AV* posix_warnings = NULL;
16620     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
16621     U8 op = END;    /* The returned node-type, initialized to an impossible
16622                        one.  */
16623     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
16624     U32 posixl = 0;       /* bit field of posix classes matched under /l */
16625
16626
16627 /* Flags as to what things aren't knowable until runtime.  (Note that these are
16628  * mutually exclusive.) */
16629 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
16630                                             haven't been defined as of yet */
16631 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
16632                                             UTF-8 or not */
16633 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
16634                                             what gets folded */
16635     U32 has_runtime_dependency = 0;     /* OR of the above flags */
16636
16637     GET_RE_DEBUG_FLAGS_DECL;
16638
16639     PERL_ARGS_ASSERT_REGCLASS;
16640 #ifndef DEBUGGING
16641     PERL_UNUSED_ARG(depth);
16642 #endif
16643
16644
16645     /* If wants an inversion list returned, we can't optimize to something
16646      * else. */
16647     if (ret_invlist) {
16648         optimizable = FALSE;
16649     }
16650
16651     DEBUG_PARSE("clas");
16652
16653 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
16654     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
16655                                    && UNICODE_DOT_DOT_VERSION == 0)
16656     allow_multi_folds = FALSE;
16657 #endif
16658
16659     listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
16660     initial_listsv_len = SvCUR(listsv);
16661     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
16662
16663     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16664
16665     assert(RExC_parse <= RExC_end);
16666
16667     if (UCHARAT(RExC_parse) == '^') {   /* Complement the class */
16668         RExC_parse++;
16669         invert = TRUE;
16670         allow_multi_folds = FALSE;
16671         MARK_NAUGHTY(1);
16672         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16673     }
16674
16675     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
16676     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
16677         int maybe_class = handle_possible_posix(pRExC_state,
16678                                                 RExC_parse,
16679                                                 &not_posix_region_end,
16680                                                 NULL,
16681                                                 TRUE /* checking only */);
16682         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
16683             ckWARN4reg(not_posix_region_end,
16684                     "POSIX syntax [%c %c] belongs inside character classes%s",
16685                     *RExC_parse, *RExC_parse,
16686                     (maybe_class == OOB_NAMEDCLASS)
16687                     ? ((POSIXCC_NOTYET(*RExC_parse))
16688                         ? " (but this one isn't implemented)"
16689                         : " (but this one isn't fully valid)")
16690                     : ""
16691                     );
16692         }
16693     }
16694
16695     /* If the caller wants us to just parse a single element, accomplish this
16696      * by faking the loop ending condition */
16697     if (stop_at_1 && RExC_end > RExC_parse) {
16698         stop_ptr = RExC_parse + 1;
16699     }
16700
16701     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
16702     if (UCHARAT(RExC_parse) == ']')
16703         goto charclassloop;
16704
16705     while (1) {
16706
16707         if (   posix_warnings
16708             && av_tindex_skip_len_mg(posix_warnings) >= 0
16709             && RExC_parse > not_posix_region_end)
16710         {
16711             /* Warnings about posix class issues are considered tentative until
16712              * we are far enough along in the parse that we can no longer
16713              * change our mind, at which point we output them.  This is done
16714              * each time through the loop so that a later class won't zap them
16715              * before they have been dealt with. */
16716             output_posix_warnings(pRExC_state, posix_warnings);
16717         }
16718
16719         if  (RExC_parse >= stop_ptr) {
16720             break;
16721         }
16722
16723         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16724
16725         if  (UCHARAT(RExC_parse) == ']') {
16726             break;
16727         }
16728
16729       charclassloop:
16730
16731         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
16732         save_value = value;
16733         save_prevvalue = prevvalue;
16734
16735         if (!range) {
16736             rangebegin = RExC_parse;
16737             element_count++;
16738             non_portable_endpoint = 0;
16739         }
16740         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
16741             value = utf8n_to_uvchr((U8*)RExC_parse,
16742                                    RExC_end - RExC_parse,
16743                                    &numlen, UTF8_ALLOW_DEFAULT);
16744             RExC_parse += numlen;
16745         }
16746         else
16747             value = UCHARAT(RExC_parse++);
16748
16749         if (value == '[') {
16750             char * posix_class_end;
16751             namedclass = handle_possible_posix(pRExC_state,
16752                                                RExC_parse,
16753                                                &posix_class_end,
16754                                                do_posix_warnings ? &posix_warnings : NULL,
16755                                                FALSE    /* die if error */);
16756             if (namedclass > OOB_NAMEDCLASS) {
16757
16758                 /* If there was an earlier attempt to parse this particular
16759                  * posix class, and it failed, it was a false alarm, as this
16760                  * successful one proves */
16761                 if (   posix_warnings
16762                     && av_tindex_skip_len_mg(posix_warnings) >= 0
16763                     && not_posix_region_end >= RExC_parse
16764                     && not_posix_region_end <= posix_class_end)
16765                 {
16766                     av_undef(posix_warnings);
16767                 }
16768
16769                 RExC_parse = posix_class_end;
16770             }
16771             else if (namedclass == OOB_NAMEDCLASS) {
16772                 not_posix_region_end = posix_class_end;
16773             }
16774             else {
16775                 namedclass = OOB_NAMEDCLASS;
16776             }
16777         }
16778         else if (   RExC_parse - 1 > not_posix_region_end
16779                  && MAYBE_POSIXCC(value))
16780         {
16781             (void) handle_possible_posix(
16782                         pRExC_state,
16783                         RExC_parse - 1,  /* -1 because parse has already been
16784                                             advanced */
16785                         &not_posix_region_end,
16786                         do_posix_warnings ? &posix_warnings : NULL,
16787                         TRUE /* checking only */);
16788         }
16789         else if (  strict && ! skip_white
16790                  && (   _generic_isCC(value, _CC_VERTSPACE)
16791                      || is_VERTWS_cp_high(value)))
16792         {
16793             vFAIL("Literal vertical space in [] is illegal except under /x");
16794         }
16795         else if (value == '\\') {
16796             /* Is a backslash; get the code point of the char after it */
16797
16798             if (RExC_parse >= RExC_end) {
16799                 vFAIL("Unmatched [");
16800             }
16801
16802             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
16803                 value = utf8n_to_uvchr((U8*)RExC_parse,
16804                                    RExC_end - RExC_parse,
16805                                    &numlen, UTF8_ALLOW_DEFAULT);
16806                 RExC_parse += numlen;
16807             }
16808             else
16809                 value = UCHARAT(RExC_parse++);
16810
16811             /* Some compilers cannot handle switching on 64-bit integer
16812              * values, therefore value cannot be an UV.  Yes, this will
16813              * be a problem later if we want switch on Unicode.
16814              * A similar issue a little bit later when switching on
16815              * namedclass. --jhi */
16816
16817             /* If the \ is escaping white space when white space is being
16818              * skipped, it means that that white space is wanted literally, and
16819              * is already in 'value'.  Otherwise, need to translate the escape
16820              * into what it signifies. */
16821             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
16822
16823             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
16824             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
16825             case 's':   namedclass = ANYOF_SPACE;       break;
16826             case 'S':   namedclass = ANYOF_NSPACE;      break;
16827             case 'd':   namedclass = ANYOF_DIGIT;       break;
16828             case 'D':   namedclass = ANYOF_NDIGIT;      break;
16829             case 'v':   namedclass = ANYOF_VERTWS;      break;
16830             case 'V':   namedclass = ANYOF_NVERTWS;     break;
16831             case 'h':   namedclass = ANYOF_HORIZWS;     break;
16832             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
16833             case 'N':  /* Handle \N{NAME} in class */
16834                 {
16835                     const char * const backslash_N_beg = RExC_parse - 2;
16836                     int cp_count;
16837
16838                     if (! grok_bslash_N(pRExC_state,
16839                                         NULL,      /* No regnode */
16840                                         &value,    /* Yes single value */
16841                                         &cp_count, /* Multiple code pt count */
16842                                         flagp,
16843                                         strict,
16844                                         depth)
16845                     ) {
16846
16847                         if (*flagp & NEED_UTF8)
16848                             FAIL("panic: grok_bslash_N set NEED_UTF8");
16849
16850                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
16851
16852                         if (cp_count < 0) {
16853                             vFAIL("\\N in a character class must be a named character: \\N{...}");
16854                         }
16855                         else if (cp_count == 0) {
16856                             ckWARNreg(RExC_parse,
16857                               "Ignoring zero length \\N{} in character class");
16858                         }
16859                         else { /* cp_count > 1 */
16860                             if (! RExC_in_multi_char_class) {
16861                                 if (invert || range || *RExC_parse == '-') {
16862                                     if (strict) {
16863                                         RExC_parse--;
16864                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
16865                                     }
16866                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
16867                                     break; /* <value> contains the first code
16868                                               point. Drop out of the switch to
16869                                               process it */
16870                                 }
16871                                 else {
16872                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
16873                                                  RExC_parse - backslash_N_beg);
16874                                     multi_char_matches
16875                                         = add_multi_match(multi_char_matches,
16876                                                           multi_char_N,
16877                                                           cp_count);
16878                                 }
16879                             }
16880                         } /* End of cp_count != 1 */
16881
16882                         /* This element should not be processed further in this
16883                          * class */
16884                         element_count--;
16885                         value = save_value;
16886                         prevvalue = save_prevvalue;
16887                         continue;   /* Back to top of loop to get next char */
16888                     }
16889
16890                     /* Here, is a single code point, and <value> contains it */
16891                     unicode_range = TRUE;   /* \N{} are Unicode */
16892                 }
16893                 break;
16894             case 'p':
16895             case 'P':
16896                 {
16897                 char *e;
16898                 char *i;
16899
16900                 /* We will handle any undefined properties ourselves */
16901                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
16902                                        /* And we actually would prefer to get
16903                                         * the straight inversion list of the
16904                                         * swash, since we will be accessing it
16905                                         * anyway, to save a little time */
16906                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
16907
16908                 SvREFCNT_dec(swash); /* Free any left-overs */
16909
16910                 /* \p means they want Unicode semantics */
16911                 REQUIRE_UNI_RULES(flagp, 0);
16912
16913                 if (RExC_parse >= RExC_end)
16914                     vFAIL2("Empty \\%c", (U8)value);
16915                 if (*RExC_parse == '{') {
16916                     const U8 c = (U8)value;
16917                     e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
16918                     if (!e) {
16919                         RExC_parse++;
16920                         vFAIL2("Missing right brace on \\%c{}", c);
16921                     }
16922
16923                     RExC_parse++;
16924
16925                     /* White space is allowed adjacent to the braces and after
16926                      * any '^', even when not under /x */
16927                     while (isSPACE(*RExC_parse)) {
16928                          RExC_parse++;
16929                     }
16930
16931                     if (UCHARAT(RExC_parse) == '^') {
16932
16933                         /* toggle.  (The rhs xor gets the single bit that
16934                          * differs between P and p; the other xor inverts just
16935                          * that bit) */
16936                         value ^= 'P' ^ 'p';
16937
16938                         RExC_parse++;
16939                         while (isSPACE(*RExC_parse)) {
16940                             RExC_parse++;
16941                         }
16942                     }
16943
16944                     if (e == RExC_parse)
16945                         vFAIL2("Empty \\%c{}", c);
16946
16947                     n = e - RExC_parse;
16948                     while (isSPACE(*(RExC_parse + n - 1)))
16949                         n--;
16950
16951                 }   /* The \p isn't immediately followed by a '{' */
16952                 else if (! isALPHA(*RExC_parse)) {
16953                     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16954                     vFAIL2("Character following \\%c must be '{' or a "
16955                            "single-character Unicode property name",
16956                            (U8) value);
16957                 }
16958                 else {
16959                     e = RExC_parse;
16960                     n = 1;
16961                 }
16962                 {
16963                     char* name = RExC_parse;
16964                     char* base_name;    /* name after any packages are stripped */
16965                     char* lookup_name = NULL;
16966                     const char * const colon_colon = "::";
16967                     bool invert;
16968
16969                     SV* invlist;
16970
16971                     /* Temporary workaround for [perl #133136].  For this
16972                     * precise input that is in the .t that is failing, load
16973                     * utf8.pm, which is what the test wants, so that that
16974                     * .t passes */
16975                     if (     memEQs(RExC_start, e + 1 - RExC_start,
16976                                     "foo\\p{Alnum}")
16977                         && ! hv_common(GvHVn(PL_incgv),
16978                                        NULL,
16979                                        "utf8.pm", sizeof("utf8.pm") - 1,
16980                                        0, HV_FETCH_ISEXISTS, NULL, 0))
16981                     {
16982                         require_pv("utf8.pm");
16983                     }
16984                     invlist = parse_uniprop_string(name, n, FOLD, &invert);
16985                     if (invlist) {
16986                         if (invert) {
16987                             value ^= 'P' ^ 'p';
16988                         }
16989                     }
16990                     else {
16991
16992                     /* Try to get the definition of the property into
16993                      * <invlist>.  If /i is in effect, the effective property
16994                      * will have its name be <__NAME_i>.  The design is
16995                      * discussed in commit
16996                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
16997                     name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
16998                     SAVEFREEPV(name);
16999
17000                     for (i = RExC_parse; i < RExC_parse + n; i++) {
17001                         if (isCNTRL(*i) && *i != '\t') {
17002                             RExC_parse = e + 1;
17003                             vFAIL2("Can't find Unicode property definition \"%s\"", name);
17004                         }
17005                     }
17006
17007                     if (FOLD) {
17008                         lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
17009
17010                         /* The function call just below that uses this can fail
17011                          * to return, leaking memory if we don't do this */
17012                         SAVEFREEPV(lookup_name);
17013                     }
17014
17015                     /* Look up the property name, and get its swash and
17016                      * inversion list, if the property is found  */
17017                     swash = _core_swash_init("utf8",
17018                                              (lookup_name)
17019                                               ? lookup_name
17020                                               : name,
17021                                              &PL_sv_undef,
17022                                              1, /* binary */
17023                                              0, /* not tr/// */
17024                                              NULL, /* No inversion list */
17025                                              &swash_init_flags
17026                                             );
17027                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
17028                         HV* curpkg = (IN_PERL_COMPILETIME)
17029                                       ? PL_curstash
17030                                       : CopSTASH(PL_curcop);
17031                         UV final_n = n;
17032                         bool has_pkg;
17033
17034                         if (swash) {    /* Got a swash but no inversion list.
17035                                            Something is likely wrong that will
17036                                            be sorted-out later */
17037                             SvREFCNT_dec_NN(swash);
17038                             swash = NULL;
17039                         }
17040
17041                         /* Here didn't find it.  It could be a an error (like a
17042                          * typo) in specifying a Unicode property, or it could
17043                          * be a user-defined property that will be available at
17044                          * run-time.  The names of these must begin with 'In'
17045                          * or 'Is' (after any packages are stripped off).  So
17046                          * if not one of those, or if we accept only
17047                          * compile-time properties, is an error; otherwise add
17048                          * it to the list for run-time look up. */
17049                         if ((base_name = rninstr(name, name + n,
17050                                                  colon_colon, colon_colon + 2)))
17051                         { /* Has ::.  We know this must be a user-defined
17052                              property */
17053                             base_name += 2;
17054                             final_n -= base_name - name;
17055                             has_pkg = TRUE;
17056                         }
17057                         else {
17058                             base_name = name;
17059                             has_pkg = FALSE;
17060                         }
17061
17062                         if (   final_n < 3
17063                             || base_name[0] != 'I'
17064                             || (base_name[1] != 's' && base_name[1] != 'n')
17065                             || ret_invlist)
17066                         {
17067                             const char * const msg
17068                                 = (has_pkg)
17069                                   ? "Illegal user-defined property name"
17070                                   : "Can't find Unicode property definition";
17071                             RExC_parse = e + 1;
17072
17073                             /* diag_listed_as: Can't find Unicode property definition "%s" */
17074                             vFAIL3utf8f("%s \"%" UTF8f "\"",
17075                                 msg, UTF8fARG(UTF, n, name));
17076                         }
17077
17078                         /* If the property name doesn't already have a package
17079                          * name, add the current one to it so that it can be
17080                          * referred to outside it. [perl #121777] */
17081                         if (! has_pkg && curpkg) {
17082                             char* pkgname = HvNAME(curpkg);
17083                             if (memNEs(pkgname, HvNAMELEN(curpkg), "main")) {
17084                                 char* full_name = Perl_form(aTHX_
17085                                                             "%s::%s",
17086                                                             pkgname,
17087                                                             name);
17088                                 n = strlen(full_name);
17089                                 name = savepvn(full_name, n);
17090                                 SAVEFREEPV(name);
17091                             }
17092                         }
17093                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n",
17094                                         (value == 'p' ? '+' : '!'),
17095                                         (FOLD) ? "__" : "",
17096                                         UTF8fARG(UTF, n, name),
17097                                         (FOLD) ? "_i" : "");
17098                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17099
17100                         /* We don't know yet what this matches, so have to flag
17101                          * it */
17102                         anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17103                     }
17104                     else {
17105
17106                         /* Here, did get the swash and its inversion list.  If
17107                          * the swash is from a user-defined property, then this
17108                          * whole character class should be regarded as such */
17109                         if (swash_init_flags
17110                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
17111                         {
17112                             has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17113                         }
17114                     }
17115                     }
17116                     if (invlist) {
17117                         if (! (has_runtime_dependency
17118                                                 & HAS_USER_DEFINED_PROPERTY) &&
17119                             /* We warn on matching an above-Unicode code point
17120                              * if the match would return true, except don't
17121                              * warn for \p{All}, which has exactly one element
17122                              * = 0 */
17123                             (_invlist_contains_cp(invlist, 0x110000)
17124                                 && (! (_invlist_len(invlist) == 1
17125                                        && *invlist_array(invlist) == 0))))
17126                         {
17127                             warn_super = TRUE;
17128                         }
17129
17130                         /* Invert if asking for the complement */
17131                         if (value == 'P') {
17132                             _invlist_union_complement_2nd(properties,
17133                                                           invlist,
17134                                                           &properties);
17135
17136                             /* The swash can't be used as-is, because we've
17137                              * inverted things; delay removing it to here after
17138                              * have copied its invlist above */
17139                             if (! swash) {
17140                                 SvREFCNT_dec_NN(invlist);
17141                             }
17142                             SvREFCNT_dec(swash);
17143                             swash = NULL;
17144                         }
17145                         else {
17146                             _invlist_union(properties, invlist, &properties);
17147                             if (! swash) {
17148                                 SvREFCNT_dec_NN(invlist);
17149                             }
17150                         }
17151                     }
17152                 }
17153
17154                 RExC_parse = e + 1;
17155                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
17156                                                 named */
17157                 }
17158                 break;
17159             case 'n':   value = '\n';                   break;
17160             case 'r':   value = '\r';                   break;
17161             case 't':   value = '\t';                   break;
17162             case 'f':   value = '\f';                   break;
17163             case 'b':   value = '\b';                   break;
17164             case 'e':   value = ESC_NATIVE;             break;
17165             case 'a':   value = '\a';                   break;
17166             case 'o':
17167                 RExC_parse--;   /* function expects to be pointed at the 'o' */
17168                 {
17169                     const char* error_msg;
17170                     bool valid = grok_bslash_o(&RExC_parse,
17171                                                RExC_end,
17172                                                &value,
17173                                                &error_msg,
17174                                                TO_OUTPUT_WARNINGS(RExC_parse),
17175                                                strict,
17176                                                silence_non_portable,
17177                                                UTF);
17178                     if (! valid) {
17179                         vFAIL(error_msg);
17180                     }
17181                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17182                 }
17183                 non_portable_endpoint++;
17184                 break;
17185             case 'x':
17186                 RExC_parse--;   /* function expects to be pointed at the 'x' */
17187                 {
17188                     const char* error_msg;
17189                     bool valid = grok_bslash_x(&RExC_parse,
17190                                                RExC_end,
17191                                                &value,
17192                                                &error_msg,
17193                                                TO_OUTPUT_WARNINGS(RExC_parse),
17194                                                strict,
17195                                                silence_non_portable,
17196                                                UTF);
17197                     if (! valid) {
17198                         vFAIL(error_msg);
17199                     }
17200                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17201                 }
17202                 non_portable_endpoint++;
17203                 break;
17204             case 'c':
17205                 value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
17206                 UPDATE_WARNINGS_LOC(RExC_parse);
17207                 RExC_parse++;
17208                 non_portable_endpoint++;
17209                 break;
17210             case '0': case '1': case '2': case '3': case '4':
17211             case '5': case '6': case '7':
17212                 {
17213                     /* Take 1-3 octal digits */
17214                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
17215                     numlen = (strict) ? 4 : 3;
17216                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17217                     RExC_parse += numlen;
17218                     if (numlen != 3) {
17219                         if (strict) {
17220                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
17221                             vFAIL("Need exactly 3 octal digits");
17222                         }
17223                         else if (   numlen < 3 /* like \08, \178 */
17224                                  && RExC_parse < RExC_end
17225                                  && isDIGIT(*RExC_parse)
17226                                  && ckWARN(WARN_REGEXP))
17227                         {
17228                             reg_warn_non_literal_string(
17229                                  RExC_parse + 1,
17230                                  form_short_octal_warning(RExC_parse, numlen));
17231                         }
17232                     }
17233                     non_portable_endpoint++;
17234                     break;
17235                 }
17236             default:
17237                 /* Allow \_ to not give an error */
17238                 if (isWORDCHAR(value) && value != '_') {
17239                     if (strict) {
17240                         vFAIL2("Unrecognized escape \\%c in character class",
17241                                (int)value);
17242                     }
17243                     else {
17244                         ckWARN2reg(RExC_parse,
17245                             "Unrecognized escape \\%c in character class passed through",
17246                             (int)value);
17247                     }
17248                 }
17249                 break;
17250             }   /* End of switch on char following backslash */
17251         } /* end of handling backslash escape sequences */
17252
17253         /* Here, we have the current token in 'value' */
17254
17255         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17256             U8 classnum;
17257
17258             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
17259              * literal, as is the character that began the false range, i.e.
17260              * the 'a' in the examples */
17261             if (range) {
17262                 const int w = (RExC_parse >= rangebegin)
17263                                 ? RExC_parse - rangebegin
17264                                 : 0;
17265                 if (strict) {
17266                     vFAIL2utf8f(
17267                         "False [] range \"%" UTF8f "\"",
17268                         UTF8fARG(UTF, w, rangebegin));
17269                 }
17270                 else {
17271                     ckWARN2reg(RExC_parse,
17272                         "False [] range \"%" UTF8f "\"",
17273                         UTF8fARG(UTF, w, rangebegin));
17274                     cp_list = add_cp_to_invlist(cp_list, '-');
17275                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
17276                                                             prevvalue);
17277                 }
17278
17279                 range = 0; /* this was not a true range */
17280                 element_count += 2; /* So counts for three values */
17281             }
17282
17283             classnum = namedclass_to_classnum(namedclass);
17284
17285             if (LOC && namedclass < ANYOF_POSIXL_MAX
17286 #ifndef HAS_ISASCII
17287                 && classnum != _CC_ASCII
17288 #endif
17289             ) {
17290                 SV* scratch_list = NULL;
17291
17292                 /* What the Posix classes (like \w, [:space:]) match in locale
17293                  * isn't knowable under locale until actual match time.  A
17294                  * special node is used for these which has extra space for a
17295                  * bitmap, with a bit reserved for each named class that is to
17296                  * be matched against.  This isn't needed for \p{} and
17297                  * pseudo-classes, as they are not affected by locale, and
17298                  * hence are dealt with separately */
17299                 POSIXL_SET(posixl, namedclass);
17300                 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
17301                 anyof_flags |= ANYOF_MATCHES_POSIXL;
17302
17303                 /* The above-Latin1 characters are not subject to locale rules.
17304                  * Just add them to the unconditionally-matched list */
17305
17306                 /* Get the list of the above-Latin1 code points this matches */
17307                 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
17308                                         PL_XPosix_ptrs[classnum],
17309
17310                                         /* Odd numbers are complements, like
17311                                         * NDIGIT, NASCII, ... */
17312                                         namedclass % 2 != 0,
17313                                         &scratch_list);
17314                 /* Checking if 'cp_list' is NULL first saves an extra clone.
17315                  * Its reference count will be decremented at the next union,
17316                  * etc, or if this is the only instance, at the end of the
17317                  * routine */
17318                 if (! cp_list) {
17319                     cp_list = scratch_list;
17320                 }
17321                 else {
17322                     _invlist_union(cp_list, scratch_list, &cp_list);
17323                     SvREFCNT_dec_NN(scratch_list);
17324                 }
17325                 continue;   /* Go get next character */
17326             }
17327             else {
17328
17329                 /* Here, is not /l, or is a POSIX class for which /l doesn't
17330                  * matter (or is a Unicode property, which is skipped here). */
17331                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
17332                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
17333
17334                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
17335                          * nor /l make a difference in what these match,
17336                          * therefore we just add what they match to cp_list. */
17337                         if (classnum != _CC_VERTSPACE) {
17338                             assert(   namedclass == ANYOF_HORIZWS
17339                                    || namedclass == ANYOF_NHORIZWS);
17340
17341                             /* It turns out that \h is just a synonym for
17342                              * XPosixBlank */
17343                             classnum = _CC_BLANK;
17344                         }
17345
17346                         _invlist_union_maybe_complement_2nd(
17347                                 cp_list,
17348                                 PL_XPosix_ptrs[classnum],
17349                                 namedclass % 2 != 0,    /* Complement if odd
17350                                                           (NHORIZWS, NVERTWS)
17351                                                         */
17352                                 &cp_list);
17353                     }
17354                 }
17355                 else if (   AT_LEAST_UNI_SEMANTICS
17356                          || classnum == _CC_ASCII
17357                          || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
17358                                                    || classnum == _CC_XDIGIT)))
17359                 {
17360                     /* We usually have to worry about /d affecting what POSIX
17361                      * classes match, with special code needed because we won't
17362                      * know until runtime what all matches.  But there is no
17363                      * extra work needed under /u and /a; and [:ascii:] is
17364                      * unaffected by /d; and :digit: and :xdigit: don't have
17365                      * runtime differences under /d.  So we can special case
17366                      * these, and avoid some extra work below, and at runtime.
17367                      * */
17368                     _invlist_union_maybe_complement_2nd(
17369                                                      simple_posixes,
17370                                                       ((AT_LEAST_ASCII_RESTRICTED)
17371                                                        ? PL_Posix_ptrs[classnum]
17372                                                        : PL_XPosix_ptrs[classnum]),
17373                                                      namedclass % 2 != 0,
17374                                                      &simple_posixes);
17375                 }
17376                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
17377                            complement and use nposixes */
17378                     SV** posixes_ptr = namedclass % 2 == 0
17379                                        ? &posixes
17380                                        : &nposixes;
17381                     _invlist_union_maybe_complement_2nd(
17382                                                      *posixes_ptr,
17383                                                      PL_XPosix_ptrs[classnum],
17384                                                      namedclass % 2 != 0,
17385                                                      posixes_ptr);
17386                 }
17387             }
17388         } /* end of namedclass \blah */
17389
17390         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17391
17392         /* If 'range' is set, 'value' is the ending of a range--check its
17393          * validity.  (If value isn't a single code point in the case of a
17394          * range, we should have figured that out above in the code that
17395          * catches false ranges).  Later, we will handle each individual code
17396          * point in the range.  If 'range' isn't set, this could be the
17397          * beginning of a range, so check for that by looking ahead to see if
17398          * the next real character to be processed is the range indicator--the
17399          * minus sign */
17400
17401         if (range) {
17402 #ifdef EBCDIC
17403             /* For unicode ranges, we have to test that the Unicode as opposed
17404              * to the native values are not decreasing.  (Above 255, there is
17405              * no difference between native and Unicode) */
17406             if (unicode_range && prevvalue < 255 && value < 255) {
17407                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
17408                     goto backwards_range;
17409                 }
17410             }
17411             else
17412 #endif
17413             if (prevvalue > value) /* b-a */ {
17414                 int w;
17415 #ifdef EBCDIC
17416               backwards_range:
17417 #endif
17418                 w = RExC_parse - rangebegin;
17419                 vFAIL2utf8f(
17420                     "Invalid [] range \"%" UTF8f "\"",
17421                     UTF8fARG(UTF, w, rangebegin));
17422                 NOT_REACHED; /* NOTREACHED */
17423             }
17424         }
17425         else {
17426             prevvalue = value; /* save the beginning of the potential range */
17427             if (! stop_at_1     /* Can't be a range if parsing just one thing */
17428                 && *RExC_parse == '-')
17429             {
17430                 char* next_char_ptr = RExC_parse + 1;
17431
17432                 /* Get the next real char after the '-' */
17433                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
17434
17435                 /* If the '-' is at the end of the class (just before the ']',
17436                  * it is a literal minus; otherwise it is a range */
17437                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
17438                     RExC_parse = next_char_ptr;
17439
17440                     /* a bad range like \w-, [:word:]- ? */
17441                     if (namedclass > OOB_NAMEDCLASS) {
17442                         if (strict || ckWARN(WARN_REGEXP)) {
17443                             const int w = RExC_parse >= rangebegin
17444                                           ?  RExC_parse - rangebegin
17445                                           : 0;
17446                             if (strict) {
17447                                 vFAIL4("False [] range \"%*.*s\"",
17448                                     w, w, rangebegin);
17449                             }
17450                             else {
17451                                 vWARN4(RExC_parse,
17452                                     "False [] range \"%*.*s\"",
17453                                     w, w, rangebegin);
17454                             }
17455                         }
17456                         cp_list = add_cp_to_invlist(cp_list, '-');
17457                         element_count++;
17458                     } else
17459                         range = 1;      /* yeah, it's a range! */
17460                     continue;   /* but do it the next time */
17461                 }
17462             }
17463         }
17464
17465         if (namedclass > OOB_NAMEDCLASS) {
17466             continue;
17467         }
17468
17469         /* Here, we have a single value this time through the loop, and
17470          * <prevvalue> is the beginning of the range, if any; or <value> if
17471          * not. */
17472
17473         /* non-Latin1 code point implies unicode semantics. */
17474         if (value > 255) {
17475             REQUIRE_UNI_RULES(flagp, 0);
17476         }
17477
17478         /* Ready to process either the single value, or the completed range.
17479          * For single-valued non-inverted ranges, we consider the possibility
17480          * of multi-char folds.  (We made a conscious decision to not do this
17481          * for the other cases because it can often lead to non-intuitive
17482          * results.  For example, you have the peculiar case that:
17483          *  "s s" =~ /^[^\xDF]+$/i => Y
17484          *  "ss"  =~ /^[^\xDF]+$/i => N
17485          *
17486          * See [perl #89750] */
17487         if (FOLD && allow_multi_folds && value == prevvalue) {
17488             if (    value == LATIN_SMALL_LETTER_SHARP_S
17489                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17490                                                         value)))
17491             {
17492                 /* Here <value> is indeed a multi-char fold.  Get what it is */
17493
17494                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17495                 STRLEN foldlen;
17496
17497                 UV folded = _to_uni_fold_flags(
17498                                 value,
17499                                 foldbuf,
17500                                 &foldlen,
17501                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17502                                                    ? FOLD_FLAGS_NOMIX_ASCII
17503                                                    : 0)
17504                                 );
17505
17506                 /* Here, <folded> should be the first character of the
17507                  * multi-char fold of <value>, with <foldbuf> containing the
17508                  * whole thing.  But, if this fold is not allowed (because of
17509                  * the flags), <fold> will be the same as <value>, and should
17510                  * be processed like any other character, so skip the special
17511                  * handling */
17512                 if (folded != value) {
17513
17514                     /* Skip if we are recursed, currently parsing the class
17515                      * again.  Otherwise add this character to the list of
17516                      * multi-char folds. */
17517                     if (! RExC_in_multi_char_class) {
17518                         STRLEN cp_count = utf8_length(foldbuf,
17519                                                       foldbuf + foldlen);
17520                         SV* multi_fold = sv_2mortal(newSVpvs(""));
17521
17522                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17523
17524                         multi_char_matches
17525                                         = add_multi_match(multi_char_matches,
17526                                                           multi_fold,
17527                                                           cp_count);
17528
17529                     }
17530
17531                     /* This element should not be processed further in this
17532                      * class */
17533                     element_count--;
17534                     value = save_value;
17535                     prevvalue = save_prevvalue;
17536                     continue;
17537                 }
17538             }
17539         }
17540
17541         if (strict && ckWARN(WARN_REGEXP)) {
17542             if (range) {
17543
17544                 /* If the range starts above 255, everything is portable and
17545                  * likely to be so for any forseeable character set, so don't
17546                  * warn. */
17547                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
17548                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
17549                 }
17550                 else if (prevvalue != value) {
17551
17552                     /* Under strict, ranges that stop and/or end in an ASCII
17553                      * printable should have each end point be a portable value
17554                      * for it (preferably like 'A', but we don't warn if it is
17555                      * a (portable) Unicode name or code point), and the range
17556                      * must be be all digits or all letters of the same case.
17557                      * Otherwise, the range is non-portable and unclear as to
17558                      * what it contains */
17559                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
17560                         && (          non_portable_endpoint
17561                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
17562                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
17563                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
17564                     ))) {
17565                         vWARN(RExC_parse, "Ranges of ASCII printables should"
17566                                           " be some subset of \"0-9\","
17567                                           " \"A-Z\", or \"a-z\"");
17568                     }
17569                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
17570                         SSize_t index_start;
17571                         SSize_t index_final;
17572
17573                         /* But the nature of Unicode and languages mean we
17574                          * can't do the same checks for above-ASCII ranges,
17575                          * except in the case of digit ones.  These should
17576                          * contain only digits from the same group of 10.  The
17577                          * ASCII case is handled just above.  Hence here, the
17578                          * range could be a range of digits.  First some
17579                          * unlikely special cases.  Grandfather in that a range
17580                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
17581                          * if its starting value is one of the 10 digits prior
17582                          * to it.  This is because it is an alternate way of
17583                          * writing 19D1, and some people may expect it to be in
17584                          * that group.  But it is bad, because it won't give
17585                          * the expected results.  In Unicode 5.2 it was
17586                          * considered to be in that group (of 11, hence), but
17587                          * this was fixed in the next version */
17588
17589                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
17590                             goto warn_bad_digit_range;
17591                         }
17592                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
17593                                           &&     value <= 0x1D7FF))
17594                         {
17595                             /* This is the only other case currently in Unicode
17596                              * where the algorithm below fails.  The code
17597                              * points just above are the end points of a single
17598                              * range containing only decimal digits.  It is 5
17599                              * different series of 0-9.  All other ranges of
17600                              * digits currently in Unicode are just a single
17601                              * series.  (And mktables will notify us if a later
17602                              * Unicode version breaks this.)
17603                              *
17604                              * If the range being checked is at most 9 long,
17605                              * and the digit values represented are in
17606                              * numerical order, they are from the same series.
17607                              * */
17608                             if (         value - prevvalue > 9
17609                                 ||    (((    value - 0x1D7CE) % 10)
17610                                      <= (prevvalue - 0x1D7CE) % 10))
17611                             {
17612                                 goto warn_bad_digit_range;
17613                             }
17614                         }
17615                         else {
17616
17617                             /* For all other ranges of digits in Unicode, the
17618                              * algorithm is just to check if both end points
17619                              * are in the same series, which is the same range.
17620                              * */
17621                             index_start = _invlist_search(
17622                                                     PL_XPosix_ptrs[_CC_DIGIT],
17623                                                     prevvalue);
17624
17625                             /* Warn if the range starts and ends with a digit,
17626                              * and they are not in the same group of 10. */
17627                             if (   index_start >= 0
17628                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
17629                                 && (index_final =
17630                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
17631                                                     value)) != index_start
17632                                 && index_final >= 0
17633                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
17634                             {
17635                               warn_bad_digit_range:
17636                                 vWARN(RExC_parse, "Ranges of digits should be"
17637                                                   " from the same group of"
17638                                                   " 10");
17639                             }
17640                         }
17641                     }
17642                 }
17643             }
17644             if ((! range || prevvalue == value) && non_portable_endpoint) {
17645                 if (isPRINT_A(value)) {
17646                     char literal[3];
17647                     unsigned d = 0;
17648                     if (isBACKSLASHED_PUNCT(value)) {
17649                         literal[d++] = '\\';
17650                     }
17651                     literal[d++] = (char) value;
17652                     literal[d++] = '\0';
17653
17654                     vWARN4(RExC_parse,
17655                            "\"%.*s\" is more clearly written simply as \"%s\"",
17656                            (int) (RExC_parse - rangebegin),
17657                            rangebegin,
17658                            literal
17659                         );
17660                 }
17661                 else if isMNEMONIC_CNTRL(value) {
17662                     vWARN4(RExC_parse,
17663                            "\"%.*s\" is more clearly written simply as \"%s\"",
17664                            (int) (RExC_parse - rangebegin),
17665                            rangebegin,
17666                            cntrl_to_mnemonic((U8) value)
17667                         );
17668                 }
17669             }
17670         }
17671
17672         /* Deal with this element of the class */
17673
17674 #ifndef EBCDIC
17675         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17676                                                     prevvalue, value);
17677 #else
17678         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
17679          * that don't require special handling, we can just add the range like
17680          * we do for ASCII platforms */
17681         if ((UNLIKELY(prevvalue == 0) && value >= 255)
17682             || ! (prevvalue < 256
17683                     && (unicode_range
17684                         || (! non_portable_endpoint
17685                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
17686                                 || (isUPPER_A(prevvalue)
17687                                     && isUPPER_A(value)))))))
17688         {
17689             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17690                                                         prevvalue, value);
17691         }
17692         else {
17693             /* Here, requires special handling.  This can be because it is a
17694              * range whose code points are considered to be Unicode, and so
17695              * must be individually translated into native, or because its a
17696              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
17697              * EBCDIC, but we have defined them to include only the "expected"
17698              * upper or lower case ASCII alphabetics.  Subranges above 255 are
17699              * the same in native and Unicode, so can be added as a range */
17700             U8 start = NATIVE_TO_LATIN1(prevvalue);
17701             unsigned j;
17702             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
17703             for (j = start; j <= end; j++) {
17704                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
17705             }
17706             if (value > 255) {
17707                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17708                                                             256, value);
17709             }
17710         }
17711 #endif
17712
17713         range = 0; /* this range (if it was one) is done now */
17714     } /* End of loop through all the text within the brackets */
17715
17716     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
17717         output_posix_warnings(pRExC_state, posix_warnings);
17718     }
17719
17720     /* If anything in the class expands to more than one character, we have to
17721      * deal with them by building up a substitute parse string, and recursively
17722      * calling reg() on it, instead of proceeding */
17723     if (multi_char_matches) {
17724         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
17725         I32 cp_count;
17726         STRLEN len;
17727         char *save_end = RExC_end;
17728         char *save_parse = RExC_parse;
17729         char *save_start = RExC_start;
17730         Size_t constructed_prefix_len = 0; /* This gives the length of the
17731                                               constructed portion of the
17732                                               substitute parse. */
17733         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
17734                                        a "|" */
17735         I32 reg_flags;
17736
17737         assert(! invert);
17738         /* Only one level of recursion allowed */
17739         assert(RExC_copy_start_in_constructed == RExC_precomp);
17740
17741 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
17742            because too confusing */
17743         if (invert) {
17744             sv_catpvs(substitute_parse, "(?:");
17745         }
17746 #endif
17747
17748         /* Look at the longest folds first */
17749         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
17750                         cp_count > 0;
17751                         cp_count--)
17752         {
17753
17754             if (av_exists(multi_char_matches, cp_count)) {
17755                 AV** this_array_ptr;
17756                 SV* this_sequence;
17757
17758                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
17759                                                  cp_count, FALSE);
17760                 while ((this_sequence = av_pop(*this_array_ptr)) !=
17761                                                                 &PL_sv_undef)
17762                 {
17763                     if (! first_time) {
17764                         sv_catpvs(substitute_parse, "|");
17765                     }
17766                     first_time = FALSE;
17767
17768                     sv_catpv(substitute_parse, SvPVX(this_sequence));
17769                 }
17770             }
17771         }
17772
17773         /* If the character class contains anything else besides these
17774          * multi-character folds, have to include it in recursive parsing */
17775         if (element_count) {
17776             sv_catpvs(substitute_parse, "|[");
17777             constructed_prefix_len = SvCUR(substitute_parse);
17778             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
17779
17780             /* Put in a closing ']' only if not going off the end, as otherwise
17781              * we are adding something that really isn't there */
17782             if (RExC_parse < RExC_end) {
17783                 sv_catpvs(substitute_parse, "]");
17784             }
17785         }
17786
17787         sv_catpvs(substitute_parse, ")");
17788 #if 0
17789         if (invert) {
17790             /* This is a way to get the parse to skip forward a whole named
17791              * sequence instead of matching the 2nd character when it fails the
17792              * first */
17793             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
17794         }
17795 #endif
17796
17797         /* Set up the data structure so that any errors will be properly
17798          * reported.  See the comments at the definition of
17799          * REPORT_LOCATION_ARGS for details */
17800         RExC_copy_start_in_input = (char *) orig_parse;
17801         RExC_start = RExC_parse = SvPV(substitute_parse, len);
17802         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
17803         RExC_end = RExC_parse + len;
17804         RExC_in_multi_char_class = 1;
17805
17806         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
17807
17808         *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
17809
17810         /* And restore so can parse the rest of the pattern */
17811         RExC_parse = save_parse;
17812         RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
17813         RExC_end = save_end;
17814         RExC_in_multi_char_class = 0;
17815         SvREFCNT_dec_NN(multi_char_matches);
17816         return ret;
17817     }
17818
17819     /* If folding, we calculate all characters that could fold to or from the
17820      * ones already on the list */
17821     if (cp_foldable_list) {
17822         if (FOLD) {
17823             UV start, end;      /* End points of code point ranges */
17824
17825             SV* fold_intersection = NULL;
17826             SV** use_list;
17827
17828             /* Our calculated list will be for Unicode rules.  For locale
17829              * matching, we have to keep a separate list that is consulted at
17830              * runtime only when the locale indicates Unicode rules (and we
17831              * don't include potential matches in the ASCII/Latin1 range, as
17832              * any code point could fold to any other, based on the run-time
17833              * locale).   For non-locale, we just use the general list */
17834             if (LOC) {
17835                 use_list = &only_utf8_locale_list;
17836             }
17837             else {
17838                 use_list = &cp_list;
17839             }
17840
17841             /* Only the characters in this class that participate in folds need
17842              * be checked.  Get the intersection of this class and all the
17843              * possible characters that are foldable.  This can quickly narrow
17844              * down a large class */
17845             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
17846                                   &fold_intersection);
17847
17848             /* Now look at the foldable characters in this class individually */
17849             invlist_iterinit(fold_intersection);
17850             while (invlist_iternext(fold_intersection, &start, &end)) {
17851                 UV j;
17852                 UV folded;
17853
17854                 /* Look at every character in the range */
17855                 for (j = start; j <= end; j++) {
17856                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17857                     STRLEN foldlen;
17858                     unsigned int k;
17859                     Size_t folds_count;
17860                     unsigned int first_fold;
17861                     const unsigned int * remaining_folds;
17862
17863                     if (j < 256) {
17864
17865                         /* Under /l, we don't know what code points below 256
17866                          * fold to, except we do know the MICRO SIGN folds to
17867                          * an above-255 character if the locale is UTF-8, so we
17868                          * add it to the special list (in *use_list)  Otherwise
17869                          * we know now what things can match, though some folds
17870                          * are valid under /d only if the target is UTF-8.
17871                          * Those go in a separate list */
17872                         if (      IS_IN_SOME_FOLD_L1(j)
17873                             && ! (LOC && j != MICRO_SIGN))
17874                         {
17875
17876                             /* ASCII is always matched; non-ASCII is matched
17877                              * only under Unicode rules (which could happen
17878                              * under /l if the locale is a UTF-8 one */
17879                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
17880                                 *use_list = add_cp_to_invlist(*use_list,
17881                                                             PL_fold_latin1[j]);
17882                             }
17883                             else if (j != PL_fold_latin1[j]) {
17884                                 upper_latin1_only_utf8_matches
17885                                         = add_cp_to_invlist(
17886                                                 upper_latin1_only_utf8_matches,
17887                                                 PL_fold_latin1[j]);
17888                             }
17889                         }
17890
17891                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
17892                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
17893                         {
17894                             add_above_Latin1_folds(pRExC_state,
17895                                                    (U8) j,
17896                                                    use_list);
17897                         }
17898                         continue;
17899                     }
17900
17901                     /* Here is an above Latin1 character.  We don't have the
17902                      * rules hard-coded for it.  First, get its fold.  This is
17903                      * the simple fold, as the multi-character folds have been
17904                      * handled earlier and separated out */
17905                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
17906                                                         (ASCII_FOLD_RESTRICTED)
17907                                                         ? FOLD_FLAGS_NOMIX_ASCII
17908                                                         : 0);
17909
17910                     /* Single character fold of above Latin1.  Add everything
17911                      * in its fold closure to the list that this node should
17912                      * match. */
17913                     folds_count = _inverse_folds(folded, &first_fold,
17914                                                     &remaining_folds);
17915                     for (k = 0; k <= folds_count; k++) {
17916                         UV c = (k == 0)     /* First time through use itself */
17917                                 ? folded
17918                                 : (k == 1)  /* 2nd time use, the first fold */
17919                                    ? first_fold
17920
17921                                      /* Then the remaining ones */
17922                                    : remaining_folds[k-2];
17923
17924                         /* /aa doesn't allow folds between ASCII and non- */
17925                         if ((   ASCII_FOLD_RESTRICTED
17926                             && (isASCII(c) != isASCII(j))))
17927                         {
17928                             continue;
17929                         }
17930
17931                         /* Folds under /l which cross the 255/256 boundary are
17932                          * added to a separate list.  (These are valid only
17933                          * when the locale is UTF-8.) */
17934                         if (c < 256 && LOC) {
17935                             *use_list = add_cp_to_invlist(*use_list, c);
17936                             continue;
17937                         }
17938
17939                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
17940                         {
17941                             cp_list = add_cp_to_invlist(cp_list, c);
17942                         }
17943                         else {
17944                             /* Similarly folds involving non-ascii Latin1
17945                              * characters under /d are added to their list */
17946                             upper_latin1_only_utf8_matches
17947                                     = add_cp_to_invlist(
17948                                                 upper_latin1_only_utf8_matches,
17949                                                 c);
17950                         }
17951                     }
17952                 }
17953             }
17954             SvREFCNT_dec_NN(fold_intersection);
17955         }
17956
17957         /* Now that we have finished adding all the folds, there is no reason
17958          * to keep the foldable list separate */
17959         _invlist_union(cp_list, cp_foldable_list, &cp_list);
17960         SvREFCNT_dec_NN(cp_foldable_list);
17961     }
17962
17963     /* And combine the result (if any) with any inversion lists from posix
17964      * classes.  The lists are kept separate up to now because we don't want to
17965      * fold the classes (folding of those is automatically handled by the swash
17966      * fetching code) */
17967     if (simple_posixes) {   /* These are the classes known to be unaffected by
17968                                /a, /aa, and /d */
17969         if (cp_list) {
17970             _invlist_union(cp_list, simple_posixes, &cp_list);
17971             SvREFCNT_dec_NN(simple_posixes);
17972         }
17973         else {
17974             cp_list = simple_posixes;
17975         }
17976     }
17977     if (posixes || nposixes) {
17978         if (! DEPENDS_SEMANTICS) {
17979
17980             /* For everything but /d, we can just add the current 'posixes' and
17981              * 'nposixes' to the main list */
17982             if (posixes) {
17983                 if (cp_list) {
17984                     _invlist_union(cp_list, posixes, &cp_list);
17985                     SvREFCNT_dec_NN(posixes);
17986                 }
17987                 else {
17988                     cp_list = posixes;
17989                 }
17990             }
17991             if (nposixes) {
17992                 if (cp_list) {
17993                     _invlist_union(cp_list, nposixes, &cp_list);
17994                     SvREFCNT_dec_NN(nposixes);
17995                 }
17996                 else {
17997                     cp_list = nposixes;
17998                 }
17999             }
18000         }
18001         else {
18002             /* Under /d, things like \w match upper Latin1 characters only if
18003              * the target string is in UTF-8.  But things like \W match all the
18004              * upper Latin1 characters if the target string is not in UTF-8.
18005              *
18006              * Handle the case with something like \W separately */
18007             if (nposixes) {
18008                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18009
18010                 /* A complemented posix class matches all upper Latin1
18011                  * characters if not in UTF-8.  And it matches just certain
18012                  * ones when in UTF-8.  That means those certain ones are
18013                  * matched regardless, so can just be added to the
18014                  * unconditional list */
18015                 if (cp_list) {
18016                     _invlist_union(cp_list, nposixes, &cp_list);
18017                     SvREFCNT_dec_NN(nposixes);
18018                     nposixes = NULL;
18019                 }
18020                 else {
18021                     cp_list = nposixes;
18022                 }
18023
18024                 /* Likewise for 'posixes' */
18025                 _invlist_union(posixes, cp_list, &cp_list);
18026
18027                 /* Likewise for anything else in the range that matched only
18028                  * under UTF-8 */
18029                 if (upper_latin1_only_utf8_matches) {
18030                     _invlist_union(cp_list,
18031                                    upper_latin1_only_utf8_matches,
18032                                    &cp_list);
18033                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18034                     upper_latin1_only_utf8_matches = NULL;
18035                 }
18036
18037                 /* If we don't match all the upper Latin1 characters regardless
18038                  * of UTF-8ness, we have to set a flag to match the rest when
18039                  * not in UTF-8 */
18040                 _invlist_subtract(only_non_utf8_list, cp_list,
18041                                   &only_non_utf8_list);
18042                 if (_invlist_len(only_non_utf8_list) != 0) {
18043                     anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18044                 }
18045                 SvREFCNT_dec_NN(only_non_utf8_list);
18046             }
18047             else {
18048                 /* Here there were no complemented posix classes.  That means
18049                  * the upper Latin1 characters in 'posixes' match only when the
18050                  * target string is in UTF-8.  So we have to add them to the
18051                  * list of those types of code points, while adding the
18052                  * remainder to the unconditional list.
18053                  *
18054                  * First calculate what they are */
18055                 SV* nonascii_but_latin1_properties = NULL;
18056                 _invlist_intersection(posixes, PL_UpperLatin1,
18057                                       &nonascii_but_latin1_properties);
18058
18059                 /* And add them to the final list of such characters. */
18060                 _invlist_union(upper_latin1_only_utf8_matches,
18061                                nonascii_but_latin1_properties,
18062                                &upper_latin1_only_utf8_matches);
18063
18064                 /* Remove them from what now becomes the unconditional list */
18065                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18066                                   &posixes);
18067
18068                 /* And add those unconditional ones to the final list */
18069                 if (cp_list) {
18070                     _invlist_union(cp_list, posixes, &cp_list);
18071                     SvREFCNT_dec_NN(posixes);
18072                     posixes = NULL;
18073                 }
18074                 else {
18075                     cp_list = posixes;
18076                 }
18077
18078                 SvREFCNT_dec(nonascii_but_latin1_properties);
18079
18080                 /* Get rid of any characters from the conditional list that we
18081                  * now know are matched unconditionally, which may make that
18082                  * list empty */
18083                 _invlist_subtract(upper_latin1_only_utf8_matches,
18084                                   cp_list,
18085                                   &upper_latin1_only_utf8_matches);
18086                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18087                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18088                     upper_latin1_only_utf8_matches = NULL;
18089                 }
18090             }
18091         }
18092     }
18093
18094     /* And combine the result (if any) with any inversion list from properties.
18095      * The lists are kept separate up to now so that we can distinguish the two
18096      * in regards to matching above-Unicode.  A run-time warning is generated
18097      * if a Unicode property is matched against a non-Unicode code point. But,
18098      * we allow user-defined properties to match anything, without any warning,
18099      * and we also suppress the warning if there is a portion of the character
18100      * class that isn't a Unicode property, and which matches above Unicode, \W
18101      * or [\x{110000}] for example.
18102      * (Note that in this case, unlike the Posix one above, there is no
18103      * <upper_latin1_only_utf8_matches>, because having a Unicode property
18104      * forces Unicode semantics */
18105     if (properties) {
18106         if (cp_list) {
18107
18108             /* If it matters to the final outcome, see if a non-property
18109              * component of the class matches above Unicode.  If so, the
18110              * warning gets suppressed.  This is true even if just a single
18111              * such code point is specified, as, though not strictly correct if
18112              * another such code point is matched against, the fact that they
18113              * are using above-Unicode code points indicates they should know
18114              * the issues involved */
18115             if (warn_super) {
18116                 warn_super = ! (invert
18117                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18118             }
18119
18120             _invlist_union(properties, cp_list, &cp_list);
18121             SvREFCNT_dec_NN(properties);
18122         }
18123         else {
18124             cp_list = properties;
18125         }
18126
18127         if (warn_super) {
18128             anyof_flags
18129              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18130
18131             /* Because an ANYOF node is the only one that warns, this node
18132              * can't be optimized into something else */
18133             optimizable = FALSE;
18134         }
18135     }
18136
18137     /* Here, we have calculated what code points should be in the character
18138      * class.
18139      *
18140      * Now we can see about various optimizations.  Fold calculation (which we
18141      * did above) needs to take place before inversion.  Otherwise /[^k]/i
18142      * would invert to include K, which under /i would match k, which it
18143      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
18144      * folded until runtime */
18145
18146     /* If we didn't do folding, it's because some information isn't available
18147      * until runtime; set the run-time fold flag for these.  (We don't have to
18148      * worry about properties folding, as that is taken care of by the swash
18149      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
18150      * locales, or the class matches at least one 0-255 range code point */
18151     if (LOC && FOLD) {
18152
18153         /* Some things on the list might be unconditionally included because of
18154          * other components.  Remove them, and clean up the list if it goes to
18155          * 0 elements */
18156         if (only_utf8_locale_list && cp_list) {
18157             _invlist_subtract(only_utf8_locale_list, cp_list,
18158                               &only_utf8_locale_list);
18159
18160             if (_invlist_len(only_utf8_locale_list) == 0) {
18161                 SvREFCNT_dec_NN(only_utf8_locale_list);
18162                 only_utf8_locale_list = NULL;
18163             }
18164         }
18165         if (only_utf8_locale_list) {
18166             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18167             anyof_flags
18168                  |= ANYOFL_FOLD
18169                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18170         }
18171         else if (cp_list) { /* Look to see if a 0-255 code point is in list */
18172             UV start, end;
18173             invlist_iterinit(cp_list);
18174             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
18175                 anyof_flags |= ANYOFL_FOLD;
18176                 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18177             }
18178             invlist_iterfinish(cp_list);
18179         }
18180     }
18181     else if (   DEPENDS_SEMANTICS
18182              && (    upper_latin1_only_utf8_matches
18183                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18184     {
18185         RExC_seen_d_op = TRUE;
18186         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18187     }
18188
18189     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18190      * compile time. */
18191     if (     cp_list
18192         &&   invert
18193         && ! has_runtime_dependency)
18194     {
18195         _invlist_invert(cp_list);
18196
18197         /* Any swash can't be used as-is, because we've inverted things */
18198         if (swash) {
18199             SvREFCNT_dec_NN(swash);
18200             swash = NULL;
18201         }
18202
18203         invert = FALSE;
18204     }
18205
18206     if (ret_invlist) {
18207         *ret_invlist = cp_list;
18208         SvREFCNT_dec(swash);
18209
18210         return RExC_emit;
18211     }
18212
18213     /* All possible optimizations below still have these characteristics.
18214      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18215      * routine) */
18216     *flagp |= HASWIDTH|SIMPLE;
18217
18218     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18219         RExC_contains_locale = 1;
18220     }
18221
18222     /* Some character classes are equivalent to other nodes.  Such nodes take
18223      * up less room, and some nodes require fewer operations to execute, than
18224      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
18225      * improve efficiency. */
18226
18227     if (optimizable) {
18228         PERL_UINT_FAST8_T i;
18229         Size_t partial_cp_count = 0;
18230         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
18231         UV   end[MAX_FOLD_FROMS+1] = { 0 };
18232
18233         if (cp_list) { /* Count the code points in enough ranges that we would
18234                           see all the ones possible in any fold in this version
18235                           of Unicode */
18236
18237             invlist_iterinit(cp_list);
18238             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
18239                 if (invlist_iternext(cp_list, &start[i], &end[i])) {
18240                     partial_cp_count += end[i] - start[i] + 1;
18241                 }
18242             }
18243
18244             invlist_iterfinish(cp_list);
18245         }
18246
18247         /* If we know at compile time that this matches every possible code
18248          * point, any run-time dependencies don't matter */
18249         if (start[0] == 0 && end[0] == UV_MAX) {
18250             if (invert) {
18251                 ret = reganode(pRExC_state, OPFAIL, 0);
18252             }
18253             else {
18254                 ret = reg_node(pRExC_state, SANY);
18255                 MARK_NAUGHTY(1);
18256             }
18257             goto not_anyof;
18258         }
18259
18260         /* Similarly, for /l posix classes, if both a class and its
18261          * complement match, any run-time dependencies don't matter */
18262         if (posixl) {
18263             for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
18264                                                         namedclass += 2)
18265             {
18266                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
18267                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
18268                 {
18269                     if (invert) {
18270                         ret = reganode(pRExC_state, OPFAIL, 0);
18271                     }
18272                     else {
18273                         ret = reg_node(pRExC_state, SANY);
18274                         MARK_NAUGHTY(1);
18275                     }
18276                     goto not_anyof;
18277                 }
18278             }
18279             /* For well-behaved locales, some classes are subsets of others,
18280              * so complementing the subset and including the non-complemented
18281              * superset should match everything, like [\D[:alnum:]], and
18282              * [[:^alpha:][:alnum:]], but some implementations of locales are
18283              * buggy, and khw thinks its a bad idea to have optimization change
18284              * behavior, even if it avoids an OS bug in a given case */
18285
18286 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
18287
18288             /* If is a single posix /l class, can optimize to just that op.
18289              * Such a node will not match anything in the Latin1 range, as that
18290              * is not determinable until runtime, but will match whatever the
18291              * class does outside that range.  (Note that some classes won't
18292              * match anything outside the range, like [:ascii:]) */
18293             if (    isSINGLE_BIT_SET(posixl)
18294                 && (partial_cp_count == 0 || start[0] > 255))
18295             {
18296                 U8 classnum;
18297                 SV * class_above_latin1 = NULL;
18298                 bool already_inverted;
18299                 bool are_equivalent;
18300
18301                 /* Compute which bit is set, which is the same thing as, e.g.,
18302                  * ANYOF_CNTRL.  From
18303                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
18304                  * */
18305                 static const int MultiplyDeBruijnBitPosition2[32] =
18306                     {
18307                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
18308                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
18309                     };
18310
18311                 namedclass = MultiplyDeBruijnBitPosition2[(posixl
18312                                                           * 0x077CB531U) >> 27];
18313                 classnum = namedclass_to_classnum(namedclass);
18314
18315                 /* The named classes are such that the inverted number is one
18316                  * larger than the non-inverted one */
18317                 already_inverted = namedclass
18318                                  - classnum_to_namedclass(classnum);
18319
18320                 /* Create an inversion list of the official property, inverted
18321                  * if the constructed node list is inverted, and restricted to
18322                  * only the above latin1 code points, which are the only ones
18323                  * known at compile time */
18324                 _invlist_intersection_maybe_complement_2nd(
18325                                                     PL_AboveLatin1,
18326                                                     PL_XPosix_ptrs[classnum],
18327                                                     already_inverted,
18328                                                     &class_above_latin1);
18329                 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
18330                                                                         FALSE);
18331                 SvREFCNT_dec_NN(class_above_latin1);
18332
18333                 if (are_equivalent) {
18334
18335                     /* Resolve the run-time inversion flag with this possibly
18336                      * inverted class */
18337                     invert = invert ^ already_inverted;
18338
18339                     ret = reg_node(pRExC_state,
18340                                    POSIXL + invert * (NPOSIXL - POSIXL));
18341                     FLAGS(REGNODE_p(ret)) = classnum;
18342                     goto not_anyof;
18343                 }
18344             }
18345         }
18346
18347         /* khw can't think of any other possible transformation involving
18348          * these. */
18349         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
18350             goto is_anyof;
18351         }
18352
18353         if (! has_runtime_dependency) {
18354
18355             /* If the list is empty, nothing matches.  This happens, for
18356              * example, when a Unicode property that doesn't match anything is
18357              * the only element in the character class (perluniprops.pod notes
18358              * such properties). */
18359             if (partial_cp_count == 0) {
18360                 assert (! invert);
18361                 ret = reganode(pRExC_state, OPFAIL, 0);
18362                 goto not_anyof;
18363             }
18364
18365             /* If matches everything but \n */
18366             if (   start[0] == 0 && end[0] == '\n' - 1
18367                 && start[1] == '\n' + 1 && end[1] == UV_MAX)
18368             {
18369                 assert (! invert);
18370                 ret = reg_node(pRExC_state, REG_ANY);
18371                 MARK_NAUGHTY(1);
18372                 goto not_anyof;
18373             }
18374         }
18375
18376         /* Next see if can optimize classes that contain just a few code points
18377          * into an EXACTish node.  The reason to do this is to let the
18378          * optimizer join this node with adjacent EXACTish ones.
18379          *
18380          * An EXACTFish node can be generated even if not under /i, and vice
18381          * versa.  But care must be taken.  An EXACTFish node has to be such
18382          * that it only matches precisely the code points in the class, but we
18383          * want to generate the least restrictive one that does that, to
18384          * increase the odds of being able to join with an adjacent node.  For
18385          * example, if the class contains [kK], we have to make it an EXACTFAA
18386          * node to prevent the KELVIN SIGN from matching.  Whether we are under
18387          * /i or not is irrelevant in this case.  Less obvious is the pattern
18388          * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
18389          * supposed to match the single character U+0149 LATIN SMALL LETTER N
18390          * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
18391          * that includes \X{02BC}, there is a multi-char fold that does, and so
18392          * the node generated for it must be an EXACTFish one.  On the other
18393          * hand qr/:/i should generate a plain EXACT node since the colon
18394          * participates in no fold whatsoever, and having it EXACT tells the
18395          * optimizer the target string cannot match unless it has a colon in
18396          * it.
18397          *
18398          * We don't typically generate an EXACTish node if doing so would
18399          * require changing the pattern to UTF-8, as that affects /d and
18400          * otherwise is slower.  However, under /i, not changing to UTF-8 can
18401          * miss some potential multi-character folds.  We calculate the
18402          * EXACTish node, and then decide if something would be missed if we
18403          * don't upgrade */
18404         if (   ! posixl
18405             && ! invert
18406
18407                 /* Only try if there are no more code points in the class than
18408                  * in the max possible fold */
18409             &&   partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
18410
18411             && (start[0] < 256 || UTF || FOLD))
18412         {
18413             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
18414             {
18415                 /* We can always make a single code point class into an
18416                  * EXACTish node. */
18417
18418                 if (LOC) {
18419
18420                     /* Here is /l:  Use EXACTL, except /li indicates EXACTFL,
18421                      * as that means there is a fold not known until runtime so
18422                      * shows as only a single code point here. */
18423                     op = (FOLD) ? EXACTFL : EXACTL;
18424                 }
18425                 else if (! FOLD) { /* Not /l and not /i */
18426                     op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
18427                 }
18428                 else if (start[0] < 256) { /* /i, not /l, and the code point is
18429                                               small */
18430
18431                     /* Under /i, it gets a little tricky.  A code point that
18432                      * doesn't participate in a fold should be an EXACT node.
18433                      * We know this one isn't the result of a simple fold, or
18434                      * there'd be more than one code point in the list, but it
18435                      * could be part of a multi- character fold.  In that case
18436                      * we better not create an EXACT node, as we would wrongly
18437                      * be telling the optimizer that this code point must be in
18438                      * the target string, and that is wrong.  This is because
18439                      * if the sequence around this code point forms a
18440                      * multi-char fold, what needs to be in the string could be
18441                      * the code point that folds to the sequence.
18442                      *
18443                      * This handles the case of below-255 code points, as we
18444                      * have an easy look up for those.  The next clause handles
18445                      * the above-256 one */
18446                     op = IS_IN_SOME_FOLD_L1(start[0])
18447                          ? EXACTFU
18448                          : EXACT;
18449                 }
18450                 else {  /* /i, larger code point.  Since we are under /i, and
18451                            have just this code point, we know that it can't
18452                            fold to something else, so PL_InMultiCharFold
18453                            applies to it */
18454                     op = _invlist_contains_cp(PL_InMultiCharFold,
18455                                               start[0])
18456                          ? EXACTFU_ONLY8
18457                          : EXACT_ONLY8;
18458                 }
18459
18460                 value = start[0];
18461             }
18462             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
18463                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
18464             {
18465                 /* Here, the only runtime dependency, if any, is from /d, and
18466                  * the class matches more than one code point, and the lowest
18467                  * code point participates in some fold.  It might be that the
18468                  * other code points are /i equivalent to this one, and hence
18469                  * they would representable by an EXACTFish node.  Above, we
18470                  * eliminated classes that contain too many code points to be
18471                  * EXACTFish, with the test for MAX_FOLD_FROMS
18472                  *
18473                  * First, special case the ASCII fold pairs, like 'B' and 'b'.
18474                  * We do this because we have EXACTFAA at our disposal for the
18475                  * ASCII range */
18476                 if (partial_cp_count == 2 && isASCII(start[0])) {
18477
18478                     /* The only ASCII characters that participate in folds are
18479                      * alphabetics */
18480                     assert(isALPHA(start[0]));
18481                     if (   end[0] == start[0]   /* First range is a single
18482                                                    character, so 2nd exists */
18483                         && isALPHA_FOLD_EQ(start[0], start[1]))
18484                     {
18485
18486                         /* Here, is part of an ASCII fold pair */
18487
18488                         if (   ASCII_FOLD_RESTRICTED
18489                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
18490                         {
18491                             /* If the second clause just above was true, it
18492                              * means we can't be under /i, or else the list
18493                              * would have included more than this fold pair.
18494                              * Therefore we have to exclude the possibility of
18495                              * whatever else it is that folds to these, by
18496                              * using EXACTFAA */
18497                             op = EXACTFAA;
18498                         }
18499                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
18500
18501                             /* Here, there's no simple fold that start[0] is part
18502                              * of, but there is a multi-character one.  If we
18503                              * are not under /i, we want to exclude that
18504                              * possibility; if under /i, we want to include it
18505                              * */
18506                             op = (FOLD) ? EXACTFU : EXACTFAA;
18507                         }
18508                         else {
18509
18510                             /* Here, the only possible fold start[0] particpates in
18511                              * is with start[1].  /i or not isn't relevant */
18512                             op = EXACTFU;
18513                         }
18514
18515                         value = toFOLD(start[0]);
18516                     }
18517                 }
18518                 else if (  ! upper_latin1_only_utf8_matches
18519                          || (   _invlist_len(upper_latin1_only_utf8_matches)
18520                                                                           == 2
18521                              && PL_fold_latin1[
18522                                invlist_highest(upper_latin1_only_utf8_matches)]
18523                              == start[0]))
18524                 {
18525                     /* Here, the smallest character is non-ascii or there are
18526                      * more than 2 code points matched by this node.  Also, we
18527                      * either don't have /d UTF-8 dependent matches, or if we
18528                      * do, they look like they could be a single character that
18529                      * is the fold of the lowest one in the always-match list.
18530                      * This test quickly excludes most of the false positives
18531                      * when there are /d UTF-8 depdendent matches.  These are
18532                      * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
18533                      * SMALL LETTER A WITH GRAVE iff the target string is
18534                      * UTF-8.  (We don't have to worry above about exceeding
18535                      * the array bounds of PL_fold_latin1[] because any code
18536                      * point in 'upper_latin1_only_utf8_matches' is below 256.)
18537                      *
18538                      * EXACTFAA would apply only to pairs (hence exactly 2 code
18539                      * points) in the ASCII range, so we can't use it here to
18540                      * artificially restrict the fold domain, so we check if
18541                      * the class does or does not match some EXACTFish node.
18542                      * Further, if we aren't under /i, and and the folded-to
18543                      * character is part of a multi-character fold, we can't do
18544                      * this optimization, as the sequence around it could be
18545                      * that multi-character fold, and we don't here know the
18546                      * context, so we have to assume it is that multi-char
18547                      * fold, to prevent potential bugs.
18548                      *
18549                      * To do the general case, we first find the fold of the
18550                      * lowest code point (which may be higher than the lowest
18551                      * one), then find everything that folds to it.  (The data
18552                      * structure we have only maps from the folded code points,
18553                      * so we have to do the earlier step.) */
18554
18555                     Size_t foldlen;
18556                     U8 foldbuf[UTF8_MAXBYTES_CASE];
18557                     UV folded = _to_uni_fold_flags(start[0],
18558                                                         foldbuf, &foldlen, 0);
18559                     unsigned int first_fold;
18560                     const unsigned int * remaining_folds;
18561                     Size_t folds_to_this_cp_count = _inverse_folds(
18562                                                             folded,
18563                                                             &first_fold,
18564                                                             &remaining_folds);
18565                     Size_t folds_count = folds_to_this_cp_count + 1;
18566                     SV * fold_list = _new_invlist(folds_count);
18567                     unsigned int i;
18568
18569                     /* If there are UTF-8 dependent matches, create a temporary
18570                      * list of what this node matches, including them. */
18571                     SV * all_cp_list = NULL;
18572                     SV ** use_this_list = &cp_list;
18573
18574                     if (upper_latin1_only_utf8_matches) {
18575                         all_cp_list = _new_invlist(0);
18576                         use_this_list = &all_cp_list;
18577                         _invlist_union(cp_list,
18578                                        upper_latin1_only_utf8_matches,
18579                                        use_this_list);
18580                     }
18581
18582                     /* Having gotten everything that participates in the fold
18583                      * containing the lowest code point, we turn that into an
18584                      * inversion list, making sure everything is included. */
18585                     fold_list = add_cp_to_invlist(fold_list, start[0]);
18586                     fold_list = add_cp_to_invlist(fold_list, folded);
18587                     fold_list = add_cp_to_invlist(fold_list, first_fold);
18588                     for (i = 0; i < folds_to_this_cp_count - 1; i++) {
18589                         fold_list = add_cp_to_invlist(fold_list,
18590                                                         remaining_folds[i]);
18591                     }
18592
18593                     /* If the fold list is identical to what's in this ANYOF
18594                      * node, the node can be represented by an EXACTFish one
18595                      * instead */
18596                     if (_invlistEQ(*use_this_list, fold_list,
18597                                    0 /* Don't complement */ )
18598                     ) {
18599
18600                         /* But, we have to be careful, as mentioned above.
18601                          * Just the right sequence of characters could match
18602                          * this if it is part of a multi-character fold.  That
18603                          * IS what we want if we are under /i.  But it ISN'T
18604                          * what we want if not under /i, as it could match when
18605                          * it shouldn't.  So, when we aren't under /i and this
18606                          * character participates in a multi-char fold, we
18607                          * don't optimize into an EXACTFish node.  So, for each
18608                          * case below we have to check if we are folding
18609                          * and if not, if it is not part of a multi-char fold.
18610                          * */
18611                         if (start[0] > 255) {    /* Highish code point */
18612                             if (FOLD || ! _invlist_contains_cp(
18613                                             PL_InMultiCharFold, folded))
18614                             {
18615                                 op = (LOC)
18616                                      ? EXACTFLU8
18617                                      : (ASCII_FOLD_RESTRICTED)
18618                                        ? EXACTFAA
18619                                        : EXACTFU_ONLY8;
18620                                 value = folded;
18621                             }
18622                         }   /* Below, the lowest code point < 256 */
18623                         else if (    FOLD
18624                                  &&  folded == 's'
18625                                  &&  DEPENDS_SEMANTICS)
18626                         {   /* An EXACTF node containing a single character
18627                                 's', can be an EXACTFU if it doesn't get
18628                                 joined with an adjacent 's' */
18629                             op = EXACTFU_S_EDGE;
18630                             value = folded;
18631                         }
18632                         else if (    FOLD
18633                                 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
18634                         {
18635                             if (upper_latin1_only_utf8_matches) {
18636                                 op = EXACTF;
18637
18638                                 /* We can't use the fold, as that only matches
18639                                  * under UTF-8 */
18640                                 value = start[0];
18641                             }
18642                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
18643                                      && ! UTF)
18644                             {   /* EXACTFUP is a special node for this
18645                                    character */
18646                                 op = (ASCII_FOLD_RESTRICTED)
18647                                      ? EXACTFAA
18648                                      : EXACTFUP;
18649                                 value = MICRO_SIGN;
18650                             }
18651                             else if (     ASCII_FOLD_RESTRICTED
18652                                      && ! isASCII(start[0]))
18653                             {   /* For ASCII under /iaa, we can use EXACTFU
18654                                    below */
18655                                 op = EXACTFAA;
18656                                 value = folded;
18657                             }
18658                             else {
18659                                 op = EXACTFU;
18660                                 value = folded;
18661                             }
18662                         }
18663                     }
18664
18665                     SvREFCNT_dec_NN(fold_list);
18666                     SvREFCNT_dec(all_cp_list);
18667                 }
18668             }
18669
18670             if (op != END) {
18671
18672                 /* Here, we have calculated what EXACTish node we would use.
18673                  * But we don't use it if it would require converting the
18674                  * pattern to UTF-8, unless not using it could cause us to miss
18675                  * some folds (hence be buggy) */
18676
18677                 if (! UTF && value > 255) {
18678                     SV * in_multis = NULL;
18679
18680                     assert(FOLD);
18681
18682                     /* If there is no code point that is part of a multi-char
18683                      * fold, then there aren't any matches, so we don't do this
18684                      * optimization.  Otherwise, it could match depending on
18685                      * the context around us, so we do upgrade */
18686                     _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
18687                     if (UNLIKELY(_invlist_len(in_multis) != 0)) {
18688                         REQUIRE_UTF8(flagp);
18689                     }
18690                     else {
18691                         op = END;
18692                     }
18693                 }
18694
18695                 if (op != END) {
18696                     U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
18697
18698                     ret = regnode_guts(pRExC_state, op, len, "exact");
18699                     FILL_NODE(ret, op);
18700                     RExC_emit += 1 + STR_SZ(len);
18701                     STR_LEN(REGNODE_p(ret)) = len;
18702                     if (len == 1) {
18703                         *STRING(REGNODE_p(ret)) = value;
18704                     }
18705                     else {
18706                         uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
18707                     }
18708                     goto not_anyof;
18709                 }
18710             }
18711         }
18712
18713         if (! has_runtime_dependency) {
18714
18715             /* See if this can be turned into an ANYOFM node.  Think about the
18716              * bit patterns in two different bytes.  In some positions, the
18717              * bits in each will be 1; and in other positions both will be 0;
18718              * and in some positions the bit will be 1 in one byte, and 0 in
18719              * the other.  Let 'n' be the number of positions where the bits
18720              * differ.  We create a mask which has exactly 'n' 0 bits, each in
18721              * a position where the two bytes differ.  Now take the set of all
18722              * bytes that when ANDed with the mask yield the same result.  That
18723              * set has 2**n elements, and is representable by just two 8 bit
18724              * numbers: the result and the mask.  Importantly, matching the set
18725              * can be vectorized by creating a word full of the result bytes,
18726              * and a word full of the mask bytes, yielding a significant speed
18727              * up.  Here, see if this node matches such a set.  As a concrete
18728              * example consider [01], and the byte representing '0' which is
18729              * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
18730              * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
18731              * 0x30.  Any other bytes ANDed yield something else.  So [01],
18732              * which is a common usage, is optimizable into ANYOFM, and can
18733              * benefit from the speed up.  We can only do this on UTF-8
18734              * invariant bytes, because they have the same bit patterns under
18735              * UTF-8 as not. */
18736             PERL_UINT_FAST8_T inverted = 0;
18737 #ifdef EBCDIC
18738             const PERL_UINT_FAST8_T max_permissible = 0xFF;
18739 #else
18740             const PERL_UINT_FAST8_T max_permissible = 0x7F;
18741 #endif
18742             /* If doesn't fit the criteria for ANYOFM, invert and try again.
18743              * If that works we will instead later generate an NANYOFM, and
18744              * invert back when through */
18745             if (invlist_highest(cp_list) > max_permissible) {
18746                 _invlist_invert(cp_list);
18747                 inverted = 1;
18748             }
18749
18750             if (invlist_highest(cp_list) <= max_permissible) {
18751                 UV this_start, this_end;
18752                 UV lowest_cp = UV_MAX;  /* inited to suppress compiler warn */
18753                 U8 bits_differing = 0;
18754                 Size_t full_cp_count = 0;
18755                 bool first_time = TRUE;
18756
18757                 /* Go through the bytes and find the bit positions that differ
18758                  * */
18759                 invlist_iterinit(cp_list);
18760                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
18761                     unsigned int i = this_start;
18762
18763                     if (first_time) {
18764                         if (! UVCHR_IS_INVARIANT(i)) {
18765                             goto done_anyofm;
18766                         }
18767
18768                         first_time = FALSE;
18769                         lowest_cp = this_start;
18770
18771                         /* We have set up the code point to compare with.
18772                          * Don't compare it with itself */
18773                         i++;
18774                     }
18775
18776                     /* Find the bit positions that differ from the lowest code
18777                      * point in the node.  Keep track of all such positions by
18778                      * OR'ing */
18779                     for (; i <= this_end; i++) {
18780                         if (! UVCHR_IS_INVARIANT(i)) {
18781                             goto done_anyofm;
18782                         }
18783
18784                         bits_differing  |= i ^ lowest_cp;
18785                     }
18786
18787                     full_cp_count += this_end - this_start + 1;
18788                 }
18789                 invlist_iterfinish(cp_list);
18790
18791                 /* At the end of the loop, we count how many bits differ from
18792                  * the bits in lowest code point, call the count 'd'.  If the
18793                  * set we found contains 2**d elements, it is the closure of
18794                  * all code points that differ only in those bit positions.  To
18795                  * convince yourself of that, first note that the number in the
18796                  * closure must be a power of 2, which we test for.  The only
18797                  * way we could have that count and it be some differing set,
18798                  * is if we got some code points that don't differ from the
18799                  * lowest code point in any position, but do differ from each
18800                  * other in some other position.  That means one code point has
18801                  * a 1 in that position, and another has a 0.  But that would
18802                  * mean that one of them differs from the lowest code point in
18803                  * that position, which possibility we've already excluded.  */
18804                 if (  (inverted || full_cp_count > 1)
18805                     && full_cp_count == 1U << PL_bitcount[bits_differing])
18806                 {
18807                     U8 ANYOFM_mask;
18808
18809                     op = ANYOFM + inverted;;
18810
18811                     /* We need to make the bits that differ be 0's */
18812                     ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
18813
18814                     /* The argument is the lowest code point */
18815                     ret = reganode(pRExC_state, op, lowest_cp);
18816                     FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
18817                 }
18818             }
18819           done_anyofm:
18820
18821             if (inverted) {
18822                 _invlist_invert(cp_list);
18823             }
18824
18825             if (op != END) {
18826                 goto not_anyof;
18827             }
18828         }
18829
18830         if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
18831             PERL_UINT_FAST8_T type;
18832             SV * intersection = NULL;
18833             SV* d_invlist = NULL;
18834
18835             /* See if this matches any of the POSIX classes.  The POSIXA and
18836              * POSIXD ones are about the same speed as ANYOF ops, but take less
18837              * room; the ones that have above-Latin1 code point matches are
18838              * somewhat faster than ANYOF.  */
18839
18840             for (type = POSIXA; type >= POSIXD; type--) {
18841                 int posix_class;
18842
18843                 if (type == POSIXL) {   /* But not /l posix classes */
18844                     continue;
18845                 }
18846
18847                 for (posix_class = 0;
18848                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
18849                      posix_class++)
18850                 {
18851                     SV** our_code_points = &cp_list;
18852                     SV** official_code_points;
18853                     int try_inverted;
18854
18855                     if (type == POSIXA) {
18856                         official_code_points = &PL_Posix_ptrs[posix_class];
18857                     }
18858                     else {
18859                         official_code_points = &PL_XPosix_ptrs[posix_class];
18860                     }
18861
18862                     /* Skip non-existent classes of this type.  e.g. \v only
18863                      * has an entry in PL_XPosix_ptrs */
18864                     if (! *official_code_points) {
18865                         continue;
18866                     }
18867
18868                     /* Try both the regular class, and its inversion */
18869                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
18870                         bool this_inverted = invert ^ try_inverted;
18871
18872                         if (type != POSIXD) {
18873
18874                             /* This class that isn't /d can't match if we have
18875                              * /d dependencies */
18876                             if (has_runtime_dependency
18877                                                     & HAS_D_RUNTIME_DEPENDENCY)
18878                             {
18879                                 continue;
18880                             }
18881                         }
18882                         else /* is /d */ if (! this_inverted) {
18883
18884                             /* /d classes don't match anything non-ASCII below
18885                              * 256 unconditionally (which cp_list contains) */
18886                             _invlist_intersection(cp_list, PL_UpperLatin1,
18887                                                            &intersection);
18888                             if (_invlist_len(intersection) != 0) {
18889                                 continue;
18890                             }
18891
18892                             SvREFCNT_dec(d_invlist);
18893                             d_invlist = invlist_clone(cp_list, NULL);
18894
18895                             /* But under UTF-8 it turns into using /u rules.
18896                              * Add the things it matches under these conditions
18897                              * so that we check below that these are identical
18898                              * to what the tested class should match */
18899                             if (upper_latin1_only_utf8_matches) {
18900                                 _invlist_union(
18901                                             d_invlist,
18902                                             upper_latin1_only_utf8_matches,
18903                                             &d_invlist);
18904                             }
18905                             our_code_points = &d_invlist;
18906                         }
18907                         else {  /* POSIXD, inverted.  If this doesn't have this
18908                                    flag set, it isn't /d. */
18909                             if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
18910                             {
18911                                 continue;
18912                             }
18913                             our_code_points = &cp_list;
18914                         }
18915
18916                         /* Here, have weeded out some things.  We want to see
18917                          * if the list of characters this node contains
18918                          * ('*our_code_points') precisely matches those of the
18919                          * class we are currently checking against
18920                          * ('*official_code_points'). */
18921                         if (_invlistEQ(*our_code_points,
18922                                        *official_code_points,
18923                                        try_inverted))
18924                         {
18925                             /* Here, they precisely match.  Optimize this ANYOF
18926                              * node into its equivalent POSIX one of the
18927                              * correct type, possibly inverted */
18928                             ret = reg_node(pRExC_state, (try_inverted)
18929                                                         ? type + NPOSIXA
18930                                                                 - POSIXA
18931                                                         : type);
18932                             FLAGS(REGNODE_p(ret)) = posix_class;
18933                             SvREFCNT_dec(d_invlist);
18934                             SvREFCNT_dec(intersection);
18935                             goto not_anyof;
18936                         }
18937                     }
18938                 }
18939             }
18940             SvREFCNT_dec(d_invlist);
18941             SvREFCNT_dec(intersection);
18942         }
18943
18944         /* If didn't find an optimization and there is no need for a
18945         * bitmap, optimize to indicate that */
18946         if (     start[0] >= NUM_ANYOF_CODE_POINTS
18947             && ! LOC
18948             && ! upper_latin1_only_utf8_matches)
18949         {
18950             op = ANYOFH;
18951         }
18952     }   /* End of seeing if can optimize it into a different node */
18953
18954   is_anyof: /* It's going to be an ANYOF node. */
18955     if (op != ANYOFH) {
18956         op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
18957              ? ANYOFD
18958              : ((posixl)
18959                 ? ANYOFPOSIXL
18960                 : ((LOC)
18961                    ? ANYOFL
18962                    : ANYOF));
18963     }
18964
18965     ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
18966     FILL_NODE(ret, op);        /* We set the argument later */
18967     RExC_emit += 1 + regarglen[op];
18968     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
18969
18970     /* Here, <cp_list> contains all the code points we can determine at
18971      * compile time that match under all conditions.  Go through it, and
18972      * for things that belong in the bitmap, put them there, and delete from
18973      * <cp_list>.  While we are at it, see if everything above 255 is in the
18974      * list, and if so, set a flag to speed up execution */
18975
18976     populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
18977
18978     if (posixl) {
18979         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
18980     }
18981
18982     if (invert) {
18983         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
18984     }
18985
18986     /* Here, the bitmap has been populated with all the Latin1 code points that
18987      * always match.  Can now add to the overall list those that match only
18988      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
18989      * */
18990     if (upper_latin1_only_utf8_matches) {
18991         if (cp_list) {
18992             _invlist_union(cp_list,
18993                            upper_latin1_only_utf8_matches,
18994                            &cp_list);
18995             SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18996         }
18997         else {
18998             cp_list = upper_latin1_only_utf8_matches;
18999         }
19000         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19001     }
19002
19003     /* If there is a swash and more than one element, we can't use the swash in
19004      * the optimization below. */
19005     if (swash && element_count > 1) {
19006         SvREFCNT_dec_NN(swash);
19007         swash = NULL;
19008     }
19009
19010     /* Note that the optimization of using 'swash' if it is the only thing in
19011      * the class doesn't have us change swash at all, so it can include things
19012      * that are also in the bitmap; otherwise we have purposely deleted that
19013      * duplicate information */
19014     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19015                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19016                    ? listsv : NULL,
19017                   only_utf8_locale_list,
19018                   swash, cBOOL(has_runtime_dependency
19019                                                 & HAS_USER_DEFINED_PROPERTY));
19020     return ret;
19021
19022   not_anyof:
19023
19024     /* Here, the node is getting optimized into something that's not an ANYOF
19025      * one.  Finish up. */
19026
19027     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19028                                            RExC_parse - orig_parse);;
19029     SvREFCNT_dec(cp_list);;
19030     return ret;
19031 }
19032
19033 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19034
19035 STATIC void
19036 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19037                 regnode* const node,
19038                 SV* const cp_list,
19039                 SV* const runtime_defns,
19040                 SV* const only_utf8_locale_list,
19041                 SV* const swash,
19042                 const bool has_user_defined_property)
19043 {
19044     /* Sets the arg field of an ANYOF-type node 'node', using information about
19045      * the node passed-in.  If there is nothing outside the node's bitmap, the
19046      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
19047      * the count returned by add_data(), having allocated and stored an array,
19048      * av, that that count references, as follows:
19049      *  av[0] stores the character class description in its textual form.
19050      *        This is used later (regexec.c:Perl_regclass_swash()) to
19051      *        initialize the appropriate swash, and is also useful for dumping
19052      *        the regnode.  This is set to &PL_sv_undef if the textual
19053      *        description is not needed at run-time (as happens if the other
19054      *        elements completely define the class)
19055      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
19056      *        computed from av[0].  But if no further computation need be done,
19057      *        the swash is stored here now (and av[0] is &PL_sv_undef).
19058      *  av[2] stores the inversion list of code points that match only if the
19059      *        current locale is UTF-8
19060      *  av[3] stores the cp_list inversion list for use in addition or instead
19061      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
19062      *        (Otherwise everything needed is already in av[0] and av[1])
19063      *  av[4] is set if any component of the class is from a user-defined
19064      *        property; used only if av[3] exists */
19065
19066     UV n;
19067
19068     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19069
19070     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19071         assert(! (ANYOF_FLAGS(node)
19072                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19073         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19074     }
19075     else {
19076         AV * const av = newAV();
19077         SV *rv;
19078
19079         av_store(av, 0, (runtime_defns)
19080                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
19081         if (swash) {
19082             assert(cp_list);
19083             av_store(av, 1, swash);
19084             SvREFCNT_dec_NN(cp_list);
19085         }
19086         else {
19087             av_store(av, 1, &PL_sv_undef);
19088             if (cp_list) {
19089                 av_store(av, 3, cp_list);
19090                 av_store(av, 4, newSVuv(has_user_defined_property));
19091             }
19092         }
19093
19094         if (only_utf8_locale_list) {
19095             av_store(av, 2, only_utf8_locale_list);
19096         }
19097         else {
19098             av_store(av, 2, &PL_sv_undef);
19099         }
19100
19101         rv = newRV_noinc(MUTABLE_SV(av));
19102         n = add_data(pRExC_state, STR_WITH_LEN("s"));
19103         RExC_rxi->data->data[n] = (void*)rv;
19104         ARG_SET(node, n);
19105     }
19106 }
19107
19108 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
19109 SV *
19110 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
19111                                         const regnode* node,
19112                                         bool doinit,
19113                                         SV** listsvp,
19114                                         SV** only_utf8_locale_ptr,
19115                                         SV** output_invlist)
19116
19117 {
19118     /* For internal core use only.
19119      * Returns the swash for the input 'node' in the regex 'prog'.
19120      * If <doinit> is 'true', will attempt to create the swash if not already
19121      *    done.
19122      * If <listsvp> is non-null, will return the printable contents of the
19123      *    swash.  This can be used to get debugging information even before the
19124      *    swash exists, by calling this function with 'doinit' set to false, in
19125      *    which case the components that will be used to eventually create the
19126      *    swash are returned  (in a printable form).
19127      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
19128      *    store an inversion list of code points that should match only if the
19129      *    execution-time locale is a UTF-8 one.
19130      * If <output_invlist> is not NULL, it is where this routine is to store an
19131      *    inversion list of the code points that would be instead returned in
19132      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
19133      *    when this parameter is used, is just the non-code point data that
19134      *    will go into creating the swash.  This currently should be just
19135      *    user-defined properties whose definitions were not known at compile
19136      *    time.  Using this parameter allows for easier manipulation of the
19137      *    swash's data by the caller.  It is illegal to call this function with
19138      *    this parameter set, but not <listsvp>
19139      *
19140      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
19141      * that, in spite of this function's name, the swash it returns may include
19142      * the bitmap data as well */
19143
19144     SV *sw  = NULL;
19145     SV *si  = NULL;         /* Input swash initialization string */
19146     SV* invlist = NULL;
19147
19148     RXi_GET_DECL(prog, progi);
19149     const struct reg_data * const data = prog ? progi->data : NULL;
19150
19151     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
19152     assert(! output_invlist || listsvp);
19153
19154     if (data && data->count) {
19155         const U32 n = ARG(node);
19156
19157         if (data->what[n] == 's') {
19158             SV * const rv = MUTABLE_SV(data->data[n]);
19159             AV * const av = MUTABLE_AV(SvRV(rv));
19160             SV **const ary = AvARRAY(av);
19161             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
19162
19163             si = *ary;  /* ary[0] = the string to initialize the swash with */
19164
19165             if (av_tindex_skip_len_mg(av) >= 2) {
19166                 if (only_utf8_locale_ptr
19167                     && ary[2]
19168                     && ary[2] != &PL_sv_undef)
19169                 {
19170                     *only_utf8_locale_ptr = ary[2];
19171                 }
19172                 else {
19173                     assert(only_utf8_locale_ptr);
19174                     *only_utf8_locale_ptr = NULL;
19175                 }
19176
19177                 /* Elements 3 and 4 are either both present or both absent. [3]
19178                  * is any inversion list generated at compile time; [4]
19179                  * indicates if that inversion list has any user-defined
19180                  * properties in it. */
19181                 if (av_tindex_skip_len_mg(av) >= 3) {
19182                     invlist = ary[3];
19183                     if (SvUV(ary[4])) {
19184                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
19185                     }
19186                 }
19187                 else {
19188                     invlist = NULL;
19189                 }
19190             }
19191
19192             /* Element [1] is reserved for the set-up swash.  If already there,
19193              * return it; if not, create it and store it there */
19194             if (ary[1] && SvROK(ary[1])) {
19195                 sw = ary[1];
19196             }
19197             else if (doinit && ((si && si != &PL_sv_undef)
19198                                  || (invlist && invlist != &PL_sv_undef))) {
19199                 assert(si);
19200                 sw = _core_swash_init("utf8", /* the utf8 package */
19201                                       "", /* nameless */
19202                                       si,
19203                                       1, /* binary */
19204                                       0, /* not from tr/// */
19205                                       invlist,
19206                                       &swash_init_flags);
19207                 (void)av_store(av, 1, sw);
19208             }
19209         }
19210     }
19211
19212     /* If requested, return a printable version of what this swash matches */
19213     if (listsvp) {
19214         SV* matches_string = NULL;
19215
19216         /* The swash should be used, if possible, to get the data, as it
19217          * contains the resolved data.  But this function can be called at
19218          * compile-time, before everything gets resolved, in which case we
19219          * return the currently best available information, which is the string
19220          * that will eventually be used to do that resolving, 'si' */
19221         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
19222             && (si && si != &PL_sv_undef))
19223         {
19224             /* Here, we only have 'si' (and possibly some passed-in data in
19225              * 'invlist', which is handled below)  If the caller only wants
19226              * 'si', use that.  */
19227             if (! output_invlist) {
19228                 matches_string = newSVsv(si);
19229             }
19230             else {
19231                 /* But if the caller wants an inversion list of the node, we
19232                  * need to parse 'si' and place as much as possible in the
19233                  * desired output inversion list, making 'matches_string' only
19234                  * contain the currently unresolvable things */
19235                 const char *si_string = SvPVX(si);
19236                 STRLEN remaining = SvCUR(si);
19237                 UV prev_cp = 0;
19238                 U8 count = 0;
19239
19240                 /* Ignore everything before the first new-line */
19241                 while (*si_string != '\n' && remaining > 0) {
19242                     si_string++;
19243                     remaining--;
19244                 }
19245                 assert(remaining > 0);
19246
19247                 si_string++;
19248                 remaining--;
19249
19250                 while (remaining > 0) {
19251
19252                     /* The data consists of just strings defining user-defined
19253                      * property names, but in prior incarnations, and perhaps
19254                      * somehow from pluggable regex engines, it could still
19255                      * hold hex code point definitions.  Each component of a
19256                      * range would be separated by a tab, and each range by a
19257                      * new-line.  If these are found, instead add them to the
19258                      * inversion list */
19259                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
19260                                      |PERL_SCAN_SILENT_NON_PORTABLE;
19261                     STRLEN len = remaining;
19262                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
19263
19264                     /* If the hex decode routine found something, it should go
19265                      * up to the next \n */
19266                     if (   *(si_string + len) == '\n') {
19267                         if (count) {    /* 2nd code point on line */
19268                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
19269                         }
19270                         else {
19271                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
19272                         }
19273                         count = 0;
19274                         goto prepare_for_next_iteration;
19275                     }
19276
19277                     /* If the hex decode was instead for the lower range limit,
19278                      * save it, and go parse the upper range limit */
19279                     if (*(si_string + len) == '\t') {
19280                         assert(count == 0);
19281
19282                         prev_cp = cp;
19283                         count = 1;
19284                       prepare_for_next_iteration:
19285                         si_string += len + 1;
19286                         remaining -= len + 1;
19287                         continue;
19288                     }
19289
19290                     /* Here, didn't find a legal hex number.  Just add it from
19291                      * here to the next \n */
19292
19293                     remaining -= len;
19294                     while (*(si_string + len) != '\n' && remaining > 0) {
19295                         remaining--;
19296                         len++;
19297                     }
19298                     if (*(si_string + len) == '\n') {
19299                         len++;
19300                         remaining--;
19301                     }
19302                     if (matches_string) {
19303                         sv_catpvn(matches_string, si_string, len - 1);
19304                     }
19305                     else {
19306                         matches_string = newSVpvn(si_string, len - 1);
19307                     }
19308                     si_string += len;
19309                     sv_catpvs(matches_string, " ");
19310                 } /* end of loop through the text */
19311
19312                 assert(matches_string);
19313                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
19314                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
19315                 }
19316             } /* end of has an 'si' but no swash */
19317         }
19318
19319         /* If we have a swash in place, its equivalent inversion list was above
19320          * placed into 'invlist'.  If not, this variable may contain a stored
19321          * inversion list which is information beyond what is in 'si' */
19322         if (invlist) {
19323
19324             /* Again, if the caller doesn't want the output inversion list, put
19325              * everything in 'matches-string' */
19326             if (! output_invlist) {
19327                 if ( ! matches_string) {
19328                     matches_string = newSVpvs("\n");
19329                 }
19330                 sv_catsv(matches_string, invlist_contents(invlist,
19331                                                   TRUE /* traditional style */
19332                                                   ));
19333             }
19334             else if (! *output_invlist) {
19335                 *output_invlist = invlist_clone(invlist, NULL);
19336             }
19337             else {
19338                 _invlist_union(*output_invlist, invlist, output_invlist);
19339             }
19340         }
19341
19342         *listsvp = matches_string;
19343     }
19344
19345     return sw;
19346 }
19347 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
19348
19349 /* reg_skipcomment()
19350
19351    Absorbs an /x style # comment from the input stream,
19352    returning a pointer to the first character beyond the comment, or if the
19353    comment terminates the pattern without anything following it, this returns
19354    one past the final character of the pattern (in other words, RExC_end) and
19355    sets the REG_RUN_ON_COMMENT_SEEN flag.
19356
19357    Note it's the callers responsibility to ensure that we are
19358    actually in /x mode
19359
19360 */
19361
19362 PERL_STATIC_INLINE char*
19363 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
19364 {
19365     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
19366
19367     assert(*p == '#');
19368
19369     while (p < RExC_end) {
19370         if (*(++p) == '\n') {
19371             return p+1;
19372         }
19373     }
19374
19375     /* we ran off the end of the pattern without ending the comment, so we have
19376      * to add an \n when wrapping */
19377     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
19378     return p;
19379 }
19380
19381 STATIC void
19382 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
19383                                 char ** p,
19384                                 const bool force_to_xmod
19385                          )
19386 {
19387     /* If the text at the current parse position '*p' is a '(?#...)' comment,
19388      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
19389      * is /x whitespace, advance '*p' so that on exit it points to the first
19390      * byte past all such white space and comments */
19391
19392     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
19393
19394     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
19395
19396     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
19397
19398     for (;;) {
19399         if (RExC_end - (*p) >= 3
19400             && *(*p)     == '('
19401             && *(*p + 1) == '?'
19402             && *(*p + 2) == '#')
19403         {
19404             while (*(*p) != ')') {
19405                 if ((*p) == RExC_end)
19406                     FAIL("Sequence (?#... not terminated");
19407                 (*p)++;
19408             }
19409             (*p)++;
19410             continue;
19411         }
19412
19413         if (use_xmod) {
19414             const char * save_p = *p;
19415             while ((*p) < RExC_end) {
19416                 STRLEN len;
19417                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
19418                     (*p) += len;
19419                 }
19420                 else if (*(*p) == '#') {
19421                     (*p) = reg_skipcomment(pRExC_state, (*p));
19422                 }
19423                 else {
19424                     break;
19425                 }
19426             }
19427             if (*p != save_p) {
19428                 continue;
19429             }
19430         }
19431
19432         break;
19433     }
19434
19435     return;
19436 }
19437
19438 /* nextchar()
19439
19440    Advances the parse position by one byte, unless that byte is the beginning
19441    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
19442    those two cases, the parse position is advanced beyond all such comments and
19443    white space.
19444
19445    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
19446 */
19447
19448 STATIC void
19449 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
19450 {
19451     PERL_ARGS_ASSERT_NEXTCHAR;
19452
19453     if (RExC_parse < RExC_end) {
19454         assert(   ! UTF
19455                || UTF8_IS_INVARIANT(*RExC_parse)
19456                || UTF8_IS_START(*RExC_parse));
19457
19458         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
19459
19460         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
19461                                 FALSE /* Don't force /x */ );
19462     }
19463 }
19464
19465 STATIC void
19466 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
19467 {
19468     /* 'size' is the delta to add or subtract from the current memory allocated
19469      * to the regex engine being constructed */
19470
19471     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
19472
19473     RExC_size += size;
19474
19475     Renewc(RExC_rxi,
19476            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
19477                                                 /* +1 for REG_MAGIC */
19478            char,
19479            regexp_internal);
19480     if ( RExC_rxi == NULL )
19481         FAIL("Regexp out of space");
19482     RXi_SET(RExC_rx, RExC_rxi);
19483
19484     RExC_emit_start = RExC_rxi->program;
19485     if (size > 0) {
19486         Zero(REGNODE_p(RExC_emit), size, regnode);
19487     }
19488
19489 #ifdef RE_TRACK_PATTERN_OFFSETS
19490     Renew(RExC_offsets, 2*RExC_size+1, U32);
19491     if (size > 0) {
19492         Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
19493     }
19494     RExC_offsets[0] = RExC_size;
19495 #endif
19496 }
19497
19498 STATIC regnode_offset
19499 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
19500 {
19501     /* Allocate a regnode for 'op', with 'extra_size' extra space.  It aligns
19502      * and increments RExC_size and RExC_emit
19503      *
19504      * It returns the regnode's offset into the regex engine program */
19505
19506     const regnode_offset ret = RExC_emit;
19507
19508     GET_RE_DEBUG_FLAGS_DECL;
19509
19510     PERL_ARGS_ASSERT_REGNODE_GUTS;
19511
19512     SIZE_ALIGN(RExC_size);
19513     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
19514     NODE_ALIGN_FILL(REGNODE_p(ret));
19515 #ifndef RE_TRACK_PATTERN_OFFSETS
19516     PERL_UNUSED_ARG(name);
19517     PERL_UNUSED_ARG(op);
19518 #else
19519     assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
19520
19521     if (RExC_offsets) {         /* MJD */
19522         MJD_OFFSET_DEBUG(
19523               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
19524               name, __LINE__,
19525               PL_reg_name[op],
19526               (UV)(RExC_emit) > RExC_offsets[0]
19527                 ? "Overwriting end of array!\n" : "OK",
19528               (UV)(RExC_emit),
19529               (UV)(RExC_parse - RExC_start),
19530               (UV)RExC_offsets[0]));
19531         Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
19532     }
19533 #endif
19534     return(ret);
19535 }
19536
19537 /*
19538 - reg_node - emit a node
19539 */
19540 STATIC regnode_offset /* Location. */
19541 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
19542 {
19543     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
19544     regnode_offset ptr = ret;
19545
19546     PERL_ARGS_ASSERT_REG_NODE;
19547
19548     assert(regarglen[op] == 0);
19549
19550     FILL_ADVANCE_NODE(ptr, op);
19551     RExC_emit = ptr;
19552     return(ret);
19553 }
19554
19555 /*
19556 - reganode - emit a node with an argument
19557 */
19558 STATIC regnode_offset /* Location. */
19559 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
19560 {
19561     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
19562     regnode_offset ptr = ret;
19563
19564     PERL_ARGS_ASSERT_REGANODE;
19565
19566     /* ANYOF are special cased to allow non-length 1 args */
19567     assert(regarglen[op] == 1);
19568
19569     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
19570     RExC_emit = ptr;
19571     return(ret);
19572 }
19573
19574 STATIC regnode_offset
19575 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
19576 {
19577     /* emit a node with U32 and I32 arguments */
19578
19579     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
19580     regnode_offset ptr = ret;
19581
19582     PERL_ARGS_ASSERT_REG2LANODE;
19583
19584     assert(regarglen[op] == 2);
19585
19586     FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
19587     RExC_emit = ptr;
19588     return(ret);
19589 }
19590
19591 /*
19592 - reginsert - insert an operator in front of already-emitted operand
19593 *
19594 * That means that on exit 'operand' is the offset of the newly inserted
19595 * operator, and the original operand has been relocated.
19596 *
19597 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
19598 * set up NEXT_OFF() of the inserted node if needed. Something like this:
19599 *
19600 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
19601 *   NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
19602 *
19603 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
19604 */
19605 STATIC void
19606 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
19607                   const regnode_offset operand, const U32 depth)
19608 {
19609     regnode *src;
19610     regnode *dst;
19611     regnode *place;
19612     const int offset = regarglen[(U8)op];
19613     const int size = NODE_STEP_REGNODE + offset;
19614     GET_RE_DEBUG_FLAGS_DECL;
19615
19616     PERL_ARGS_ASSERT_REGINSERT;
19617     PERL_UNUSED_CONTEXT;
19618     PERL_UNUSED_ARG(depth);
19619 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
19620     DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
19621     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
19622                                     studying. If this is wrong then we need to adjust RExC_recurse
19623                                     below like we do with RExC_open_parens/RExC_close_parens. */
19624     change_engine_size(pRExC_state, (Ptrdiff_t) size);
19625     src = REGNODE_p(RExC_emit);
19626     RExC_emit += size;
19627     dst = REGNODE_p(RExC_emit);
19628     if (RExC_open_parens) {
19629         int paren;
19630         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
19631         /* remember that RExC_npar is rex->nparens + 1,
19632          * iow it is 1 more than the number of parens seen in
19633          * the pattern so far. */
19634         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
19635             /* note, RExC_open_parens[0] is the start of the
19636              * regex, it can't move. RExC_close_parens[0] is the end
19637              * of the regex, it *can* move. */
19638             if ( paren && RExC_open_parens[paren] >= operand ) {
19639                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
19640                 RExC_open_parens[paren] += size;
19641             } else {
19642                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
19643             }
19644             if ( RExC_close_parens[paren] >= operand ) {
19645                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
19646                 RExC_close_parens[paren] += size;
19647             } else {
19648                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
19649             }
19650         }
19651     }
19652     if (RExC_end_op)
19653         RExC_end_op += size;
19654
19655     while (src > REGNODE_p(operand)) {
19656         StructCopy(--src, --dst, regnode);
19657 #ifdef RE_TRACK_PATTERN_OFFSETS
19658         if (RExC_offsets) {     /* MJD 20010112 */
19659             MJD_OFFSET_DEBUG(
19660                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
19661                   "reginsert",
19662                   __LINE__,
19663                   PL_reg_name[op],
19664                   (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
19665                     ? "Overwriting end of array!\n" : "OK",
19666                   (UV)REGNODE_OFFSET(src),
19667                   (UV)REGNODE_OFFSET(dst),
19668                   (UV)RExC_offsets[0]));
19669             Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
19670             Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
19671         }
19672 #endif
19673     }
19674
19675     place = REGNODE_p(operand); /* Op node, where operand used to be. */
19676 #ifdef RE_TRACK_PATTERN_OFFSETS
19677     if (RExC_offsets) {         /* MJD */
19678         MJD_OFFSET_DEBUG(
19679               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
19680               "reginsert",
19681               __LINE__,
19682               PL_reg_name[op],
19683               (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
19684               ? "Overwriting end of array!\n" : "OK",
19685               (UV)REGNODE_OFFSET(place),
19686               (UV)(RExC_parse - RExC_start),
19687               (UV)RExC_offsets[0]));
19688         Set_Node_Offset(place, RExC_parse);
19689         Set_Node_Length(place, 1);
19690     }
19691 #endif
19692     src = NEXTOPER(place);
19693     FLAGS(place) = 0;
19694     FILL_NODE(operand, op);
19695
19696     /* Zero out any arguments in the new node */
19697     Zero(src, offset, regnode);
19698 }
19699
19700 /*
19701 - regtail - set the next-pointer at the end of a node chain of p to val.
19702 - SEE ALSO: regtail_study
19703 */
19704 STATIC void
19705 S_regtail(pTHX_ RExC_state_t * pRExC_state,
19706                 const regnode_offset p,
19707                 const regnode_offset val,
19708                 const U32 depth)
19709 {
19710     regnode_offset scan;
19711     GET_RE_DEBUG_FLAGS_DECL;
19712
19713     PERL_ARGS_ASSERT_REGTAIL;
19714 #ifndef DEBUGGING
19715     PERL_UNUSED_ARG(depth);
19716 #endif
19717
19718     /* Find last node. */
19719     scan = (regnode_offset) p;
19720     for (;;) {
19721         regnode * const temp = regnext(REGNODE_p(scan));
19722         DEBUG_PARSE_r({
19723             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
19724             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19725             Perl_re_printf( aTHX_  "~ %s (%d) %s %s\n",
19726                 SvPV_nolen_const(RExC_mysv), scan,
19727                     (temp == NULL ? "->" : ""),
19728                     (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
19729             );
19730         });
19731         if (temp == NULL)
19732             break;
19733         scan = REGNODE_OFFSET(temp);
19734     }
19735
19736     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19737         ARG_SET(REGNODE_p(scan), val - scan);
19738     }
19739     else {
19740         NEXT_OFF(REGNODE_p(scan)) = val - scan;
19741     }
19742 }
19743
19744 #ifdef DEBUGGING
19745 /*
19746 - regtail_study - set the next-pointer at the end of a node chain of p to val.
19747 - Look for optimizable sequences at the same time.
19748 - currently only looks for EXACT chains.
19749
19750 This is experimental code. The idea is to use this routine to perform
19751 in place optimizations on branches and groups as they are constructed,
19752 with the long term intention of removing optimization from study_chunk so
19753 that it is purely analytical.
19754
19755 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
19756 to control which is which.
19757
19758 */
19759 /* TODO: All four parms should be const */
19760
19761 STATIC U8
19762 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
19763                       const regnode_offset val, U32 depth)
19764 {
19765     regnode_offset scan;
19766     U8 exact = PSEUDO;
19767 #ifdef EXPERIMENTAL_INPLACESCAN
19768     I32 min = 0;
19769 #endif
19770     GET_RE_DEBUG_FLAGS_DECL;
19771
19772     PERL_ARGS_ASSERT_REGTAIL_STUDY;
19773
19774
19775     /* Find last node. */
19776
19777     scan = p;
19778     for (;;) {
19779         regnode * const temp = regnext(REGNODE_p(scan));
19780 #ifdef EXPERIMENTAL_INPLACESCAN
19781         if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
19782             bool unfolded_multi_char;   /* Unexamined in this routine */
19783             if (join_exact(pRExC_state, scan, &min,
19784                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
19785                 return EXACT;
19786         }
19787 #endif
19788         if ( exact ) {
19789             switch (OP(REGNODE_p(scan))) {
19790                 case EXACT:
19791                 case EXACT_ONLY8:
19792                 case EXACTL:
19793                 case EXACTF:
19794                 case EXACTFU_S_EDGE:
19795                 case EXACTFAA_NO_TRIE:
19796                 case EXACTFAA:
19797                 case EXACTFU:
19798                 case EXACTFU_ONLY8:
19799                 case EXACTFLU8:
19800                 case EXACTFUP:
19801                 case EXACTFL:
19802                         if( exact == PSEUDO )
19803                             exact= OP(REGNODE_p(scan));
19804                         else if ( exact != OP(REGNODE_p(scan)) )
19805                             exact= 0;
19806                 case NOTHING:
19807                     break;
19808                 default:
19809                     exact= 0;
19810             }
19811         }
19812         DEBUG_PARSE_r({
19813             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
19814             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19815             Perl_re_printf( aTHX_  "~ %s (%d) -> %s\n",
19816                 SvPV_nolen_const(RExC_mysv),
19817                 scan,
19818                 PL_reg_name[exact]);
19819         });
19820         if (temp == NULL)
19821             break;
19822         scan = REGNODE_OFFSET(temp);
19823     }
19824     DEBUG_PARSE_r({
19825         DEBUG_PARSE_MSG("");
19826         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
19827         Perl_re_printf( aTHX_
19828                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
19829                       SvPV_nolen_const(RExC_mysv),
19830                       (IV)val,
19831                       (IV)(val - scan)
19832         );
19833     });
19834     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19835         ARG_SET(REGNODE_p(scan), val - scan);
19836     }
19837     else {
19838         NEXT_OFF(REGNODE_p(scan)) = val - scan;
19839     }
19840
19841     return exact;
19842 }
19843 #endif
19844
19845 STATIC SV*
19846 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
19847
19848     /* Returns an inversion list of all the code points matched by the
19849      * ANYOFM/NANYOFM node 'n' */
19850
19851     SV * cp_list = _new_invlist(-1);
19852     const U8 lowest = (U8) ARG(n);
19853     unsigned int i;
19854     U8 count = 0;
19855     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
19856
19857     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
19858
19859     /* Starting with the lowest code point, any code point that ANDed with the
19860      * mask yields the lowest code point is in the set */
19861     for (i = lowest; i <= 0xFF; i++) {
19862         if ((i & FLAGS(n)) == ARG(n)) {
19863             cp_list = add_cp_to_invlist(cp_list, i);
19864             count++;
19865
19866             /* We know how many code points (a power of two) that are in the
19867              * set.  No use looking once we've got that number */
19868             if (count >= needed) break;
19869         }
19870     }
19871
19872     if (OP(n) == NANYOFM) {
19873         _invlist_invert(cp_list);
19874     }
19875     return cp_list;
19876 }
19877
19878 /*
19879  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
19880  */
19881 #ifdef DEBUGGING
19882
19883 static void
19884 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
19885 {
19886     int bit;
19887     int set=0;
19888
19889     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19890
19891     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
19892         if (flags & (1<<bit)) {
19893             if (!set++ && lead)
19894                 Perl_re_printf( aTHX_  "%s", lead);
19895             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
19896         }
19897     }
19898     if (lead)  {
19899         if (set)
19900             Perl_re_printf( aTHX_  "\n");
19901         else
19902             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
19903     }
19904 }
19905
19906 static void
19907 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
19908 {
19909     int bit;
19910     int set=0;
19911     regex_charset cs;
19912
19913     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19914
19915     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
19916         if (flags & (1<<bit)) {
19917             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
19918                 continue;
19919             }
19920             if (!set++ && lead)
19921                 Perl_re_printf( aTHX_  "%s", lead);
19922             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
19923         }
19924     }
19925     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
19926             if (!set++ && lead) {
19927                 Perl_re_printf( aTHX_  "%s", lead);
19928             }
19929             switch (cs) {
19930                 case REGEX_UNICODE_CHARSET:
19931                     Perl_re_printf( aTHX_  "UNICODE");
19932                     break;
19933                 case REGEX_LOCALE_CHARSET:
19934                     Perl_re_printf( aTHX_  "LOCALE");
19935                     break;
19936                 case REGEX_ASCII_RESTRICTED_CHARSET:
19937                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
19938                     break;
19939                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
19940                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
19941                     break;
19942                 default:
19943                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
19944                     break;
19945             }
19946     }
19947     if (lead)  {
19948         if (set)
19949             Perl_re_printf( aTHX_  "\n");
19950         else
19951             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
19952     }
19953 }
19954 #endif
19955
19956 void
19957 Perl_regdump(pTHX_ const regexp *r)
19958 {
19959 #ifdef DEBUGGING
19960     int i;
19961     SV * const sv = sv_newmortal();
19962     SV *dsv= sv_newmortal();
19963     RXi_GET_DECL(r, ri);
19964     GET_RE_DEBUG_FLAGS_DECL;
19965
19966     PERL_ARGS_ASSERT_REGDUMP;
19967
19968     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
19969
19970     /* Header fields of interest. */
19971     for (i = 0; i < 2; i++) {
19972         if (r->substrs->data[i].substr) {
19973             RE_PV_QUOTED_DECL(s, 0, dsv,
19974                             SvPVX_const(r->substrs->data[i].substr),
19975                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
19976                             PL_dump_re_max_len);
19977             Perl_re_printf( aTHX_
19978                           "%s %s%s at %" IVdf "..%" UVuf " ",
19979                           i ? "floating" : "anchored",
19980                           s,
19981                           RE_SV_TAIL(r->substrs->data[i].substr),
19982                           (IV)r->substrs->data[i].min_offset,
19983                           (UV)r->substrs->data[i].max_offset);
19984         }
19985         else if (r->substrs->data[i].utf8_substr) {
19986             RE_PV_QUOTED_DECL(s, 1, dsv,
19987                             SvPVX_const(r->substrs->data[i].utf8_substr),
19988                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
19989                             30);
19990             Perl_re_printf( aTHX_
19991                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
19992                           i ? "floating" : "anchored",
19993                           s,
19994                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
19995                           (IV)r->substrs->data[i].min_offset,
19996                           (UV)r->substrs->data[i].max_offset);
19997         }
19998     }
19999
20000     if (r->check_substr || r->check_utf8)
20001         Perl_re_printf( aTHX_
20002                       (const char *)
20003                       (   r->check_substr == r->substrs->data[1].substr
20004                        && r->check_utf8   == r->substrs->data[1].utf8_substr
20005                        ? "(checking floating" : "(checking anchored"));
20006     if (r->intflags & PREGf_NOSCAN)
20007         Perl_re_printf( aTHX_  " noscan");
20008     if (r->extflags & RXf_CHECK_ALL)
20009         Perl_re_printf( aTHX_  " isall");
20010     if (r->check_substr || r->check_utf8)
20011         Perl_re_printf( aTHX_  ") ");
20012
20013     if (ri->regstclass) {
20014         regprop(r, sv, ri->regstclass, NULL, NULL);
20015         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
20016     }
20017     if (r->intflags & PREGf_ANCH) {
20018         Perl_re_printf( aTHX_  "anchored");
20019         if (r->intflags & PREGf_ANCH_MBOL)
20020             Perl_re_printf( aTHX_  "(MBOL)");
20021         if (r->intflags & PREGf_ANCH_SBOL)
20022             Perl_re_printf( aTHX_  "(SBOL)");
20023         if (r->intflags & PREGf_ANCH_GPOS)
20024             Perl_re_printf( aTHX_  "(GPOS)");
20025         Perl_re_printf( aTHX_ " ");
20026     }
20027     if (r->intflags & PREGf_GPOS_SEEN)
20028         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
20029     if (r->intflags & PREGf_SKIP)
20030         Perl_re_printf( aTHX_  "plus ");
20031     if (r->intflags & PREGf_IMPLICIT)
20032         Perl_re_printf( aTHX_  "implicit ");
20033     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
20034     if (r->extflags & RXf_EVAL_SEEN)
20035         Perl_re_printf( aTHX_  "with eval ");
20036     Perl_re_printf( aTHX_  "\n");
20037     DEBUG_FLAGS_r({
20038         regdump_extflags("r->extflags: ", r->extflags);
20039         regdump_intflags("r->intflags: ", r->intflags);
20040     });
20041 #else
20042     PERL_ARGS_ASSERT_REGDUMP;
20043     PERL_UNUSED_CONTEXT;
20044     PERL_UNUSED_ARG(r);
20045 #endif  /* DEBUGGING */
20046 }
20047
20048 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
20049 #ifdef DEBUGGING
20050
20051 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
20052      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
20053      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
20054      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
20055      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
20056      || _CC_VERTSPACE != 15
20057 #   error Need to adjust order of anyofs[]
20058 #  endif
20059 static const char * const anyofs[] = {
20060     "\\w",
20061     "\\W",
20062     "\\d",
20063     "\\D",
20064     "[:alpha:]",
20065     "[:^alpha:]",
20066     "[:lower:]",
20067     "[:^lower:]",
20068     "[:upper:]",
20069     "[:^upper:]",
20070     "[:punct:]",
20071     "[:^punct:]",
20072     "[:print:]",
20073     "[:^print:]",
20074     "[:alnum:]",
20075     "[:^alnum:]",
20076     "[:graph:]",
20077     "[:^graph:]",
20078     "[:cased:]",
20079     "[:^cased:]",
20080     "\\s",
20081     "\\S",
20082     "[:blank:]",
20083     "[:^blank:]",
20084     "[:xdigit:]",
20085     "[:^xdigit:]",
20086     "[:cntrl:]",
20087     "[:^cntrl:]",
20088     "[:ascii:]",
20089     "[:^ascii:]",
20090     "\\v",
20091     "\\V"
20092 };
20093 #endif
20094
20095 /*
20096 - regprop - printable representation of opcode, with run time support
20097 */
20098
20099 void
20100 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
20101 {
20102 #ifdef DEBUGGING
20103     int k;
20104     RXi_GET_DECL(prog, progi);
20105     GET_RE_DEBUG_FLAGS_DECL;
20106
20107     PERL_ARGS_ASSERT_REGPROP;
20108
20109     SvPVCLEAR(sv);
20110
20111     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
20112         /* It would be nice to FAIL() here, but this may be called from
20113            regexec.c, and it would be hard to supply pRExC_state. */
20114         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
20115                                               (int)OP(o), (int)REGNODE_MAX);
20116     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
20117
20118     k = PL_regkind[OP(o)];
20119
20120     if (k == EXACT) {
20121         sv_catpvs(sv, " ");
20122         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
20123          * is a crude hack but it may be the best for now since
20124          * we have no flag "this EXACTish node was UTF-8"
20125          * --jhi */
20126         pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
20127                   PL_colors[0], PL_colors[1],
20128                   PERL_PV_ESCAPE_UNI_DETECT |
20129                   PERL_PV_ESCAPE_NONASCII   |
20130                   PERL_PV_PRETTY_ELLIPSES   |
20131                   PERL_PV_PRETTY_LTGT       |
20132                   PERL_PV_PRETTY_NOCLEAR
20133                   );
20134     } else if (k == TRIE) {
20135         /* print the details of the trie in dumpuntil instead, as
20136          * progi->data isn't available here */
20137         const char op = OP(o);
20138         const U32 n = ARG(o);
20139         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
20140                (reg_ac_data *)progi->data->data[n] :
20141                NULL;
20142         const reg_trie_data * const trie
20143             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
20144
20145         Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
20146         DEBUG_TRIE_COMPILE_r({
20147           if (trie->jump)
20148             sv_catpvs(sv, "(JUMP)");
20149           Perl_sv_catpvf(aTHX_ sv,
20150             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
20151             (UV)trie->startstate,
20152             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
20153             (UV)trie->wordcount,
20154             (UV)trie->minlen,
20155             (UV)trie->maxlen,
20156             (UV)TRIE_CHARCOUNT(trie),
20157             (UV)trie->uniquecharcount
20158           );
20159         });
20160         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
20161             sv_catpvs(sv, "[");
20162             (void) put_charclass_bitmap_innards(sv,
20163                                                 ((IS_ANYOF_TRIE(op))
20164                                                  ? ANYOF_BITMAP(o)
20165                                                  : TRIE_BITMAP(trie)),
20166                                                 NULL,
20167                                                 NULL,
20168                                                 NULL,
20169                                                 FALSE
20170                                                );
20171             sv_catpvs(sv, "]");
20172         }
20173     } else if (k == CURLY) {
20174         U32 lo = ARG1(o), hi = ARG2(o);
20175         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
20176             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
20177         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
20178         if (hi == REG_INFTY)
20179             sv_catpvs(sv, "INFTY");
20180         else
20181             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
20182         sv_catpvs(sv, "}");
20183     }
20184     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
20185         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
20186     else if (k == REF || k == OPEN || k == CLOSE
20187              || k == GROUPP || OP(o)==ACCEPT)
20188     {
20189         AV *name_list= NULL;
20190         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
20191         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
20192         if ( RXp_PAREN_NAMES(prog) ) {
20193             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20194         } else if ( pRExC_state ) {
20195             name_list= RExC_paren_name_list;
20196         }
20197         if (name_list) {
20198             if ( k != REF || (OP(o) < NREF)) {
20199                 SV **name= av_fetch(name_list, parno, 0 );
20200                 if (name)
20201                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20202             }
20203             else {
20204                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
20205                 I32 *nums=(I32*)SvPVX(sv_dat);
20206                 SV **name= av_fetch(name_list, nums[0], 0 );
20207                 I32 n;
20208                 if (name) {
20209                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
20210                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
20211                                     (n ? "," : ""), (IV)nums[n]);
20212                     }
20213                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20214                 }
20215             }
20216         }
20217         if ( k == REF && reginfo) {
20218             U32 n = ARG(o);  /* which paren pair */
20219             I32 ln = prog->offs[n].start;
20220             if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
20221                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
20222             else if (ln == prog->offs[n].end)
20223                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
20224             else {
20225                 const char *s = reginfo->strbeg + ln;
20226                 Perl_sv_catpvf(aTHX_ sv, ": ");
20227                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
20228                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
20229             }
20230         }
20231     } else if (k == GOSUB) {
20232         AV *name_list= NULL;
20233         if ( RXp_PAREN_NAMES(prog) ) {
20234             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20235         } else if ( pRExC_state ) {
20236             name_list= RExC_paren_name_list;
20237         }
20238
20239         /* Paren and offset */
20240         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
20241                 (int)((o + (int)ARG2L(o)) - progi->program) );
20242         if (name_list) {
20243             SV **name= av_fetch(name_list, ARG(o), 0 );
20244             if (name)
20245                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20246         }
20247     }
20248     else if (k == LOGICAL)
20249         /* 2: embedded, otherwise 1 */
20250         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
20251     else if (k == ANYOF) {
20252         const U8 flags = ANYOF_FLAGS(o);
20253         bool do_sep = FALSE;    /* Do we need to separate various components of
20254                                    the output? */
20255         /* Set if there is still an unresolved user-defined property */
20256         SV *unresolved                = NULL;
20257
20258         /* Things that are ignored except when the runtime locale is UTF-8 */
20259         SV *only_utf8_locale_invlist = NULL;
20260
20261         /* Code points that don't fit in the bitmap */
20262         SV *nonbitmap_invlist = NULL;
20263
20264         /* And things that aren't in the bitmap, but are small enough to be */
20265         SV* bitmap_range_not_in_bitmap = NULL;
20266
20267         const bool inverted = flags & ANYOF_INVERT;
20268
20269         if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
20270             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
20271                 sv_catpvs(sv, "{utf8-locale-reqd}");
20272             }
20273             if (flags & ANYOFL_FOLD) {
20274                 sv_catpvs(sv, "{i}");
20275             }
20276         }
20277
20278         /* If there is stuff outside the bitmap, get it */
20279         if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
20280             (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
20281                                                 &unresolved,
20282                                                 &only_utf8_locale_invlist,
20283                                                 &nonbitmap_invlist);
20284             /* The non-bitmap data may contain stuff that could fit in the
20285              * bitmap.  This could come from a user-defined property being
20286              * finally resolved when this call was done; or much more likely
20287              * because there are matches that require UTF-8 to be valid, and so
20288              * aren't in the bitmap.  This is teased apart later */
20289             _invlist_intersection(nonbitmap_invlist,
20290                                   PL_InBitmap,
20291                                   &bitmap_range_not_in_bitmap);
20292             /* Leave just the things that don't fit into the bitmap */
20293             _invlist_subtract(nonbitmap_invlist,
20294                               PL_InBitmap,
20295                               &nonbitmap_invlist);
20296         }
20297
20298         /* Obey this flag to add all above-the-bitmap code points */
20299         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
20300             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20301                                                       NUM_ANYOF_CODE_POINTS,
20302                                                       UV_MAX);
20303         }
20304
20305         /* Ready to start outputting.  First, the initial left bracket */
20306         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20307
20308         if (OP(o) != ANYOFH) {
20309             /* Then all the things that could fit in the bitmap */
20310             do_sep = put_charclass_bitmap_innards(sv,
20311                                                   ANYOF_BITMAP(o),
20312                                                   bitmap_range_not_in_bitmap,
20313                                                   only_utf8_locale_invlist,
20314                                                   o,
20315
20316                                                   /* Can't try inverting for a
20317                                                    * better display if there
20318                                                    * are things that haven't
20319                                                    * been resolved */
20320                                                   unresolved != NULL);
20321             SvREFCNT_dec(bitmap_range_not_in_bitmap);
20322
20323             /* If there are user-defined properties which haven't been defined
20324              * yet, output them.  If the result is not to be inverted, it is
20325              * clearest to output them in a separate [] from the bitmap range
20326              * stuff.  If the result is to be complemented, we have to show
20327              * everything in one [], as the inversion applies to the whole
20328              * thing.  Use {braces} to separate them from anything in the
20329              * bitmap and anything above the bitmap. */
20330             if (unresolved) {
20331                 if (inverted) {
20332                     if (! do_sep) { /* If didn't output anything in the bitmap
20333                                      */
20334                         sv_catpvs(sv, "^");
20335                     }
20336                     sv_catpvs(sv, "{");
20337                 }
20338                 else if (do_sep) {
20339                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
20340                                                       PL_colors[0]);
20341                 }
20342                 sv_catsv(sv, unresolved);
20343                 if (inverted) {
20344                     sv_catpvs(sv, "}");
20345                 }
20346                 do_sep = ! inverted;
20347             }
20348         }
20349
20350         /* And, finally, add the above-the-bitmap stuff */
20351         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
20352             SV* contents;
20353
20354             /* See if truncation size is overridden */
20355             const STRLEN dump_len = (PL_dump_re_max_len > 256)
20356                                     ? PL_dump_re_max_len
20357                                     : 256;
20358
20359             /* This is output in a separate [] */
20360             if (do_sep) {
20361                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
20362             }
20363
20364             /* And, for easy of understanding, it is shown in the
20365              * uncomplemented form if possible.  The one exception being if
20366              * there are unresolved items, where the inversion has to be
20367              * delayed until runtime */
20368             if (inverted && ! unresolved) {
20369                 _invlist_invert(nonbitmap_invlist);
20370                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
20371             }
20372
20373             contents = invlist_contents(nonbitmap_invlist,
20374                                         FALSE /* output suitable for catsv */
20375                                        );
20376
20377             /* If the output is shorter than the permissible maximum, just do it. */
20378             if (SvCUR(contents) <= dump_len) {
20379                 sv_catsv(sv, contents);
20380             }
20381             else {
20382                 const char * contents_string = SvPVX(contents);
20383                 STRLEN i = dump_len;
20384
20385                 /* Otherwise, start at the permissible max and work back to the
20386                  * first break possibility */
20387                 while (i > 0 && contents_string[i] != ' ') {
20388                     i--;
20389                 }
20390                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
20391                                        find a legal break */
20392                     i = dump_len;
20393                 }
20394
20395                 sv_catpvn(sv, contents_string, i);
20396                 sv_catpvs(sv, "...");
20397             }
20398
20399             SvREFCNT_dec_NN(contents);
20400             SvREFCNT_dec_NN(nonbitmap_invlist);
20401         }
20402
20403         /* And finally the matching, closing ']' */
20404         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20405
20406         SvREFCNT_dec(unresolved);
20407     }
20408     else if (k == ANYOFM) {
20409         SV * cp_list = get_ANYOFM_contents(o);
20410
20411         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20412         if (OP(o) == NANYOFM) {
20413             _invlist_invert(cp_list);
20414         }
20415
20416         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
20417         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20418
20419         SvREFCNT_dec(cp_list);
20420     }
20421     else if (k == POSIXD || k == NPOSIXD) {
20422         U8 index = FLAGS(o) * 2;
20423         if (index < C_ARRAY_LENGTH(anyofs)) {
20424             if (*anyofs[index] != '[')  {
20425                 sv_catpvs(sv, "[");
20426             }
20427             sv_catpv(sv, anyofs[index]);
20428             if (*anyofs[index] != '[')  {
20429                 sv_catpvs(sv, "]");
20430             }
20431         }
20432         else {
20433             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
20434         }
20435     }
20436     else if (k == BOUND || k == NBOUND) {
20437         /* Must be synced with order of 'bound_type' in regcomp.h */
20438         const char * const bounds[] = {
20439             "",      /* Traditional */
20440             "{gcb}",
20441             "{lb}",
20442             "{sb}",
20443             "{wb}"
20444         };
20445         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
20446         sv_catpv(sv, bounds[FLAGS(o)]);
20447     }
20448     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
20449         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
20450     else if (OP(o) == SBOL)
20451         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
20452
20453     /* add on the verb argument if there is one */
20454     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
20455         if ( ARG(o) )
20456             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
20457                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
20458         else
20459             sv_catpvs(sv, ":NULL");
20460     }
20461 #else
20462     PERL_UNUSED_CONTEXT;
20463     PERL_UNUSED_ARG(sv);
20464     PERL_UNUSED_ARG(o);
20465     PERL_UNUSED_ARG(prog);
20466     PERL_UNUSED_ARG(reginfo);
20467     PERL_UNUSED_ARG(pRExC_state);
20468 #endif  /* DEBUGGING */
20469 }
20470
20471
20472
20473 SV *
20474 Perl_re_intuit_string(pTHX_ REGEXP * const r)
20475 {                               /* Assume that RE_INTUIT is set */
20476     struct regexp *const prog = ReANY(r);
20477     GET_RE_DEBUG_FLAGS_DECL;
20478
20479     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
20480     PERL_UNUSED_CONTEXT;
20481
20482     DEBUG_COMPILE_r(
20483         {
20484             const char * const s = SvPV_nolen_const(RX_UTF8(r)
20485                       ? prog->check_utf8 : prog->check_substr);
20486
20487             if (!PL_colorset) reginitcolors();
20488             Perl_re_printf( aTHX_
20489                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
20490                       PL_colors[4],
20491                       RX_UTF8(r) ? "utf8 " : "",
20492                       PL_colors[5], PL_colors[0],
20493                       s,
20494                       PL_colors[1],
20495                       (strlen(s) > PL_dump_re_max_len ? "..." : ""));
20496         } );
20497
20498     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
20499     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
20500 }
20501
20502 /*
20503    pregfree()
20504
20505    handles refcounting and freeing the perl core regexp structure. When
20506    it is necessary to actually free the structure the first thing it
20507    does is call the 'free' method of the regexp_engine associated to
20508    the regexp, allowing the handling of the void *pprivate; member
20509    first. (This routine is not overridable by extensions, which is why
20510    the extensions free is called first.)
20511
20512    See regdupe and regdupe_internal if you change anything here.
20513 */
20514 #ifndef PERL_IN_XSUB_RE
20515 void
20516 Perl_pregfree(pTHX_ REGEXP *r)
20517 {
20518     SvREFCNT_dec(r);
20519 }
20520
20521 void
20522 Perl_pregfree2(pTHX_ REGEXP *rx)
20523 {
20524     struct regexp *const r = ReANY(rx);
20525     GET_RE_DEBUG_FLAGS_DECL;
20526
20527     PERL_ARGS_ASSERT_PREGFREE2;
20528
20529     if (! r)
20530         return;
20531
20532     if (r->mother_re) {
20533         ReREFCNT_dec(r->mother_re);
20534     } else {
20535         CALLREGFREE_PVT(rx); /* free the private data */
20536         SvREFCNT_dec(RXp_PAREN_NAMES(r));
20537     }
20538     if (r->substrs) {
20539         int i;
20540         for (i = 0; i < 2; i++) {
20541             SvREFCNT_dec(r->substrs->data[i].substr);
20542             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
20543         }
20544         Safefree(r->substrs);
20545     }
20546     RX_MATCH_COPY_FREE(rx);
20547 #ifdef PERL_ANY_COW
20548     SvREFCNT_dec(r->saved_copy);
20549 #endif
20550     Safefree(r->offs);
20551     SvREFCNT_dec(r->qr_anoncv);
20552     if (r->recurse_locinput)
20553         Safefree(r->recurse_locinput);
20554 }
20555
20556
20557 /*  reg_temp_copy()
20558
20559     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
20560     except that dsv will be created if NULL.
20561
20562     This function is used in two main ways. First to implement
20563         $r = qr/....; $s = $$r;
20564
20565     Secondly, it is used as a hacky workaround to the structural issue of
20566     match results
20567     being stored in the regexp structure which is in turn stored in
20568     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
20569     could be PL_curpm in multiple contexts, and could require multiple
20570     result sets being associated with the pattern simultaneously, such
20571     as when doing a recursive match with (??{$qr})
20572
20573     The solution is to make a lightweight copy of the regexp structure
20574     when a qr// is returned from the code executed by (??{$qr}) this
20575     lightweight copy doesn't actually own any of its data except for
20576     the starp/end and the actual regexp structure itself.
20577
20578 */
20579
20580
20581 REGEXP *
20582 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
20583 {
20584     struct regexp *drx;
20585     struct regexp *const srx = ReANY(ssv);
20586     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
20587
20588     PERL_ARGS_ASSERT_REG_TEMP_COPY;
20589
20590     if (!dsv)
20591         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
20592     else {
20593         SvOK_off((SV *)dsv);
20594         if (islv) {
20595             /* For PVLVs, the head (sv_any) points to an XPVLV, while
20596              * the LV's xpvlenu_rx will point to a regexp body, which
20597              * we allocate here */
20598             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
20599             assert(!SvPVX(dsv));
20600             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
20601             temp->sv_any = NULL;
20602             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
20603             SvREFCNT_dec_NN(temp);
20604             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
20605                ing below will not set it. */
20606             SvCUR_set(dsv, SvCUR(ssv));
20607         }
20608     }
20609     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
20610        sv_force_normal(sv) is called.  */
20611     SvFAKE_on(dsv);
20612     drx = ReANY(dsv);
20613
20614     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
20615     SvPV_set(dsv, RX_WRAPPED(ssv));
20616     /* We share the same string buffer as the original regexp, on which we
20617        hold a reference count, incremented when mother_re is set below.
20618        The string pointer is copied here, being part of the regexp struct.
20619      */
20620     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
20621            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
20622     if (!islv)
20623         SvLEN_set(dsv, 0);
20624     if (srx->offs) {
20625         const I32 npar = srx->nparens+1;
20626         Newx(drx->offs, npar, regexp_paren_pair);
20627         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
20628     }
20629     if (srx->substrs) {
20630         int i;
20631         Newx(drx->substrs, 1, struct reg_substr_data);
20632         StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
20633
20634         for (i = 0; i < 2; i++) {
20635             SvREFCNT_inc_void(drx->substrs->data[i].substr);
20636             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
20637         }
20638
20639         /* check_substr and check_utf8, if non-NULL, point to either their
20640            anchored or float namesakes, and don't hold a second reference.  */
20641     }
20642     RX_MATCH_COPIED_off(dsv);
20643 #ifdef PERL_ANY_COW
20644     drx->saved_copy = NULL;
20645 #endif
20646     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
20647     SvREFCNT_inc_void(drx->qr_anoncv);
20648     if (srx->recurse_locinput)
20649         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
20650
20651     return dsv;
20652 }
20653 #endif
20654
20655
20656 /* regfree_internal()
20657
20658    Free the private data in a regexp. This is overloadable by
20659    extensions. Perl takes care of the regexp structure in pregfree(),
20660    this covers the *pprivate pointer which technically perl doesn't
20661    know about, however of course we have to handle the
20662    regexp_internal structure when no extension is in use.
20663
20664    Note this is called before freeing anything in the regexp
20665    structure.
20666  */
20667
20668 void
20669 Perl_regfree_internal(pTHX_ REGEXP * const rx)
20670 {
20671     struct regexp *const r = ReANY(rx);
20672     RXi_GET_DECL(r, ri);
20673     GET_RE_DEBUG_FLAGS_DECL;
20674
20675     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
20676
20677     if (! ri) {
20678         return;
20679     }
20680
20681     DEBUG_COMPILE_r({
20682         if (!PL_colorset)
20683             reginitcolors();
20684         {
20685             SV *dsv= sv_newmortal();
20686             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
20687                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
20688             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
20689                 PL_colors[4], PL_colors[5], s);
20690         }
20691     });
20692
20693 #ifdef RE_TRACK_PATTERN_OFFSETS
20694     if (ri->u.offsets)
20695         Safefree(ri->u.offsets);             /* 20010421 MJD */
20696 #endif
20697     if (ri->code_blocks)
20698         S_free_codeblocks(aTHX_ ri->code_blocks);
20699
20700     if (ri->data) {
20701         int n = ri->data->count;
20702
20703         while (--n >= 0) {
20704           /* If you add a ->what type here, update the comment in regcomp.h */
20705             switch (ri->data->what[n]) {
20706             case 'a':
20707             case 'r':
20708             case 's':
20709             case 'S':
20710             case 'u':
20711                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
20712                 break;
20713             case 'f':
20714                 Safefree(ri->data->data[n]);
20715                 break;
20716             case 'l':
20717             case 'L':
20718                 break;
20719             case 'T':
20720                 { /* Aho Corasick add-on structure for a trie node.
20721                      Used in stclass optimization only */
20722                     U32 refcount;
20723                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
20724 #ifdef USE_ITHREADS
20725                     dVAR;
20726 #endif
20727                     OP_REFCNT_LOCK;
20728                     refcount = --aho->refcount;
20729                     OP_REFCNT_UNLOCK;
20730                     if ( !refcount ) {
20731                         PerlMemShared_free(aho->states);
20732                         PerlMemShared_free(aho->fail);
20733                          /* do this last!!!! */
20734                         PerlMemShared_free(ri->data->data[n]);
20735                         /* we should only ever get called once, so
20736                          * assert as much, and also guard the free
20737                          * which /might/ happen twice. At the least
20738                          * it will make code anlyzers happy and it
20739                          * doesn't cost much. - Yves */
20740                         assert(ri->regstclass);
20741                         if (ri->regstclass) {
20742                             PerlMemShared_free(ri->regstclass);
20743                             ri->regstclass = 0;
20744                         }
20745                     }
20746                 }
20747                 break;
20748             case 't':
20749                 {
20750                     /* trie structure. */
20751                     U32 refcount;
20752                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
20753 #ifdef USE_ITHREADS
20754                     dVAR;
20755 #endif
20756                     OP_REFCNT_LOCK;
20757                     refcount = --trie->refcount;
20758                     OP_REFCNT_UNLOCK;
20759                     if ( !refcount ) {
20760                         PerlMemShared_free(trie->charmap);
20761                         PerlMemShared_free(trie->states);
20762                         PerlMemShared_free(trie->trans);
20763                         if (trie->bitmap)
20764                             PerlMemShared_free(trie->bitmap);
20765                         if (trie->jump)
20766                             PerlMemShared_free(trie->jump);
20767                         PerlMemShared_free(trie->wordinfo);
20768                         /* do this last!!!! */
20769                         PerlMemShared_free(ri->data->data[n]);
20770                     }
20771                 }
20772                 break;
20773             default:
20774                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
20775                                                     ri->data->what[n]);
20776             }
20777         }
20778         Safefree(ri->data->what);
20779         Safefree(ri->data);
20780     }
20781
20782     Safefree(ri);
20783 }
20784
20785 #define av_dup_inc(s, t)        MUTABLE_AV(sv_dup_inc((const SV *)s, t))
20786 #define hv_dup_inc(s, t)        MUTABLE_HV(sv_dup_inc((const SV *)s, t))
20787 #define SAVEPVN(p, n)   ((p) ? savepvn(p, n) : NULL)
20788
20789 /*
20790    re_dup_guts - duplicate a regexp.
20791
20792    This routine is expected to clone a given regexp structure. It is only
20793    compiled under USE_ITHREADS.
20794
20795    After all of the core data stored in struct regexp is duplicated
20796    the regexp_engine.dupe method is used to copy any private data
20797    stored in the *pprivate pointer. This allows extensions to handle
20798    any duplication it needs to do.
20799
20800    See pregfree() and regfree_internal() if you change anything here.
20801 */
20802 #if defined(USE_ITHREADS)
20803 #ifndef PERL_IN_XSUB_RE
20804 void
20805 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
20806 {
20807     dVAR;
20808     I32 npar;
20809     const struct regexp *r = ReANY(sstr);
20810     struct regexp *ret = ReANY(dstr);
20811
20812     PERL_ARGS_ASSERT_RE_DUP_GUTS;
20813
20814     npar = r->nparens+1;
20815     Newx(ret->offs, npar, regexp_paren_pair);
20816     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
20817
20818     if (ret->substrs) {
20819         /* Do it this way to avoid reading from *r after the StructCopy().
20820            That way, if any of the sv_dup_inc()s dislodge *r from the L1
20821            cache, it doesn't matter.  */
20822         int i;
20823         const bool anchored = r->check_substr
20824             ? r->check_substr == r->substrs->data[0].substr
20825             : r->check_utf8   == r->substrs->data[0].utf8_substr;
20826         Newx(ret->substrs, 1, struct reg_substr_data);
20827         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
20828
20829         for (i = 0; i < 2; i++) {
20830             ret->substrs->data[i].substr =
20831                         sv_dup_inc(ret->substrs->data[i].substr, param);
20832             ret->substrs->data[i].utf8_substr =
20833                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
20834         }
20835
20836         /* check_substr and check_utf8, if non-NULL, point to either their
20837            anchored or float namesakes, and don't hold a second reference.  */
20838
20839         if (ret->check_substr) {
20840             if (anchored) {
20841                 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
20842
20843                 ret->check_substr = ret->substrs->data[0].substr;
20844                 ret->check_utf8   = ret->substrs->data[0].utf8_substr;
20845             } else {
20846                 assert(r->check_substr == r->substrs->data[1].substr);
20847                 assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
20848
20849                 ret->check_substr = ret->substrs->data[1].substr;
20850                 ret->check_utf8   = ret->substrs->data[1].utf8_substr;
20851             }
20852         } else if (ret->check_utf8) {
20853             if (anchored) {
20854                 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
20855             } else {
20856                 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
20857             }
20858         }
20859     }
20860
20861     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
20862     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
20863     if (r->recurse_locinput)
20864         Newx(ret->recurse_locinput, r->nparens + 1, char *);
20865
20866     if (ret->pprivate)
20867         RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
20868
20869     if (RX_MATCH_COPIED(dstr))
20870         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
20871     else
20872         ret->subbeg = NULL;
20873 #ifdef PERL_ANY_COW
20874     ret->saved_copy = NULL;
20875 #endif
20876
20877     /* Whether mother_re be set or no, we need to copy the string.  We
20878        cannot refrain from copying it when the storage points directly to
20879        our mother regexp, because that's
20880                1: a buffer in a different thread
20881                2: something we no longer hold a reference on
20882                so we need to copy it locally.  */
20883     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
20884     ret->mother_re   = NULL;
20885 }
20886 #endif /* PERL_IN_XSUB_RE */
20887
20888 /*
20889    regdupe_internal()
20890
20891    This is the internal complement to regdupe() which is used to copy
20892    the structure pointed to by the *pprivate pointer in the regexp.
20893    This is the core version of the extension overridable cloning hook.
20894    The regexp structure being duplicated will be copied by perl prior
20895    to this and will be provided as the regexp *r argument, however
20896    with the /old/ structures pprivate pointer value. Thus this routine
20897    may override any copying normally done by perl.
20898
20899    It returns a pointer to the new regexp_internal structure.
20900 */
20901
20902 void *
20903 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
20904 {
20905     dVAR;
20906     struct regexp *const r = ReANY(rx);
20907     regexp_internal *reti;
20908     int len;
20909     RXi_GET_DECL(r, ri);
20910
20911     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
20912
20913     len = ProgLen(ri);
20914
20915     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
20916           char, regexp_internal);
20917     Copy(ri->program, reti->program, len+1, regnode);
20918
20919
20920     if (ri->code_blocks) {
20921         int n;
20922         Newx(reti->code_blocks, 1, struct reg_code_blocks);
20923         Newx(reti->code_blocks->cb, ri->code_blocks->count,
20924                     struct reg_code_block);
20925         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
20926              ri->code_blocks->count, struct reg_code_block);
20927         for (n = 0; n < ri->code_blocks->count; n++)
20928              reti->code_blocks->cb[n].src_regex = (REGEXP*)
20929                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
20930         reti->code_blocks->count = ri->code_blocks->count;
20931         reti->code_blocks->refcnt = 1;
20932     }
20933     else
20934         reti->code_blocks = NULL;
20935
20936     reti->regstclass = NULL;
20937
20938     if (ri->data) {
20939         struct reg_data *d;
20940         const int count = ri->data->count;
20941         int i;
20942
20943         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
20944                 char, struct reg_data);
20945         Newx(d->what, count, U8);
20946
20947         d->count = count;
20948         for (i = 0; i < count; i++) {
20949             d->what[i] = ri->data->what[i];
20950             switch (d->what[i]) {
20951                 /* see also regcomp.h and regfree_internal() */
20952             case 'a': /* actually an AV, but the dup function is identical.
20953                          values seem to be "plain sv's" generally. */
20954             case 'r': /* a compiled regex (but still just another SV) */
20955             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
20956                          this use case should go away, the code could have used
20957                          'a' instead - see S_set_ANYOF_arg() for array contents. */
20958             case 'S': /* actually an SV, but the dup function is identical.  */
20959             case 'u': /* actually an HV, but the dup function is identical.
20960                          values are "plain sv's" */
20961                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
20962                 break;
20963             case 'f':
20964                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
20965                  * patterns which could start with several different things. Pre-TRIE
20966                  * this was more important than it is now, however this still helps
20967                  * in some places, for instance /x?a+/ might produce a SSC equivalent
20968                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
20969                  * in regexec.c
20970                  */
20971                 /* This is cheating. */
20972                 Newx(d->data[i], 1, regnode_ssc);
20973                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
20974                 reti->regstclass = (regnode*)d->data[i];
20975                 break;
20976             case 'T':
20977                 /* AHO-CORASICK fail table */
20978                 /* Trie stclasses are readonly and can thus be shared
20979                  * without duplication. We free the stclass in pregfree
20980                  * when the corresponding reg_ac_data struct is freed.
20981                  */
20982                 reti->regstclass= ri->regstclass;
20983                 /* FALLTHROUGH */
20984             case 't':
20985                 /* TRIE transition table */
20986                 OP_REFCNT_LOCK;
20987                 ((reg_trie_data*)ri->data->data[i])->refcount++;
20988                 OP_REFCNT_UNLOCK;
20989                 /* FALLTHROUGH */
20990             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
20991             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
20992                          is not from another regexp */
20993                 d->data[i] = ri->data->data[i];
20994                 break;
20995             default:
20996                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
20997                                                            ri->data->what[i]);
20998             }
20999         }
21000
21001         reti->data = d;
21002     }
21003     else
21004         reti->data = NULL;
21005
21006     reti->name_list_idx = ri->name_list_idx;
21007
21008 #ifdef RE_TRACK_PATTERN_OFFSETS
21009     if (ri->u.offsets) {
21010         Newx(reti->u.offsets, 2*len+1, U32);
21011         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
21012     }
21013 #else
21014     SetProgLen(reti, len);
21015 #endif
21016
21017     return (void*)reti;
21018 }
21019
21020 #endif    /* USE_ITHREADS */
21021
21022 #ifndef PERL_IN_XSUB_RE
21023
21024 /*
21025  - regnext - dig the "next" pointer out of a node
21026  */
21027 regnode *
21028 Perl_regnext(pTHX_ regnode *p)
21029 {
21030     I32 offset;
21031
21032     if (!p)
21033         return(NULL);
21034
21035     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
21036         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
21037                                                 (int)OP(p), (int)REGNODE_MAX);
21038     }
21039
21040     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
21041     if (offset == 0)
21042         return(NULL);
21043
21044     return(p+offset);
21045 }
21046
21047 #endif
21048
21049 STATIC void
21050 S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
21051 {
21052     va_list args;
21053     STRLEN l1 = strlen(pat1);
21054     STRLEN l2 = strlen(pat2);
21055     char buf[512];
21056     SV *msv;
21057     const char *message;
21058
21059     PERL_ARGS_ASSERT_RE_CROAK2;
21060
21061     if (l1 > 510)
21062         l1 = 510;
21063     if (l1 + l2 > 510)
21064         l2 = 510 - l1;
21065     Copy(pat1, buf, l1 , char);
21066     Copy(pat2, buf + l1, l2 , char);
21067     buf[l1 + l2] = '\n';
21068     buf[l1 + l2 + 1] = '\0';
21069     va_start(args, pat2);
21070     msv = vmess(buf, &args);
21071     va_end(args);
21072     message = SvPV_const(msv, l1);
21073     if (l1 > 512)
21074         l1 = 512;
21075     Copy(message, buf, l1 , char);
21076     /* l1-1 to avoid \n */
21077     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
21078 }
21079
21080 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
21081
21082 #ifndef PERL_IN_XSUB_RE
21083 void
21084 Perl_save_re_context(pTHX)
21085 {
21086     I32 nparens = -1;
21087     I32 i;
21088
21089     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
21090
21091     if (PL_curpm) {
21092         const REGEXP * const rx = PM_GETRE(PL_curpm);
21093         if (rx)
21094             nparens = RX_NPARENS(rx);
21095     }
21096
21097     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
21098      * that PL_curpm will be null, but that utf8.pm and the modules it
21099      * loads will only use $1..$3.
21100      * The t/porting/re_context.t test file checks this assumption.
21101      */
21102     if (nparens == -1)
21103         nparens = 3;
21104
21105     for (i = 1; i <= nparens; i++) {
21106         char digits[TYPE_CHARS(long)];
21107         const STRLEN len = my_snprintf(digits, sizeof(digits),
21108                                        "%lu", (long)i);
21109         GV *const *const gvp
21110             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
21111
21112         if (gvp) {
21113             GV * const gv = *gvp;
21114             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
21115                 save_scalar(gv);
21116         }
21117     }
21118 }
21119 #endif
21120
21121 #ifdef DEBUGGING
21122
21123 STATIC void
21124 S_put_code_point(pTHX_ SV *sv, UV c)
21125 {
21126     PERL_ARGS_ASSERT_PUT_CODE_POINT;
21127
21128     if (c > 255) {
21129         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
21130     }
21131     else if (isPRINT(c)) {
21132         const char string = (char) c;
21133
21134         /* We use {phrase} as metanotation in the class, so also escape literal
21135          * braces */
21136         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
21137             sv_catpvs(sv, "\\");
21138         sv_catpvn(sv, &string, 1);
21139     }
21140     else if (isMNEMONIC_CNTRL(c)) {
21141         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
21142     }
21143     else {
21144         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
21145     }
21146 }
21147
21148 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
21149
21150 STATIC void
21151 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
21152 {
21153     /* Appends to 'sv' a displayable version of the range of code points from
21154      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
21155      * that have them, when they occur at the beginning or end of the range.
21156      * It uses hex to output the remaining code points, unless 'allow_literals'
21157      * is true, in which case the printable ASCII ones are output as-is (though
21158      * some of these will be escaped by put_code_point()).
21159      *
21160      * NOTE:  This is designed only for printing ranges of code points that fit
21161      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
21162      */
21163
21164     const unsigned int min_range_count = 3;
21165
21166     assert(start <= end);
21167
21168     PERL_ARGS_ASSERT_PUT_RANGE;
21169
21170     while (start <= end) {
21171         UV this_end;
21172         const char * format;
21173
21174         if (end - start < min_range_count) {
21175
21176             /* Output chars individually when they occur in short ranges */
21177             for (; start <= end; start++) {
21178                 put_code_point(sv, start);
21179             }
21180             break;
21181         }
21182
21183         /* If permitted by the input options, and there is a possibility that
21184          * this range contains a printable literal, look to see if there is
21185          * one. */
21186         if (allow_literals && start <= MAX_PRINT_A) {
21187
21188             /* If the character at the beginning of the range isn't an ASCII
21189              * printable, effectively split the range into two parts:
21190              *  1) the portion before the first such printable,
21191              *  2) the rest
21192              * and output them separately. */
21193             if (! isPRINT_A(start)) {
21194                 UV temp_end = start + 1;
21195
21196                 /* There is no point looking beyond the final possible
21197                  * printable, in MAX_PRINT_A */
21198                 UV max = MIN(end, MAX_PRINT_A);
21199
21200                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
21201                     temp_end++;
21202                 }
21203
21204                 /* Here, temp_end points to one beyond the first printable if
21205                  * found, or to one beyond 'max' if not.  If none found, make
21206                  * sure that we use the entire range */
21207                 if (temp_end > MAX_PRINT_A) {
21208                     temp_end = end + 1;
21209                 }
21210
21211                 /* Output the first part of the split range: the part that
21212                  * doesn't have printables, with the parameter set to not look
21213                  * for literals (otherwise we would infinitely recurse) */
21214                 put_range(sv, start, temp_end - 1, FALSE);
21215
21216                 /* The 2nd part of the range (if any) starts here. */
21217                 start = temp_end;
21218
21219                 /* We do a continue, instead of dropping down, because even if
21220                  * the 2nd part is non-empty, it could be so short that we want
21221                  * to output it as individual characters, as tested for at the
21222                  * top of this loop.  */
21223                 continue;
21224             }
21225
21226             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
21227              * output a sub-range of just the digits or letters, then process
21228              * the remaining portion as usual. */
21229             if (isALPHANUMERIC_A(start)) {
21230                 UV mask = (isDIGIT_A(start))
21231                            ? _CC_DIGIT
21232                              : isUPPER_A(start)
21233                                ? _CC_UPPER
21234                                : _CC_LOWER;
21235                 UV temp_end = start + 1;
21236
21237                 /* Find the end of the sub-range that includes just the
21238                  * characters in the same class as the first character in it */
21239                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
21240                     temp_end++;
21241                 }
21242                 temp_end--;
21243
21244                 /* For short ranges, don't duplicate the code above to output
21245                  * them; just call recursively */
21246                 if (temp_end - start < min_range_count) {
21247                     put_range(sv, start, temp_end, FALSE);
21248                 }
21249                 else {  /* Output as a range */
21250                     put_code_point(sv, start);
21251                     sv_catpvs(sv, "-");
21252                     put_code_point(sv, temp_end);
21253                 }
21254                 start = temp_end + 1;
21255                 continue;
21256             }
21257
21258             /* We output any other printables as individual characters */
21259             if (isPUNCT_A(start) || isSPACE_A(start)) {
21260                 while (start <= end && (isPUNCT_A(start)
21261                                         || isSPACE_A(start)))
21262                 {
21263                     put_code_point(sv, start);
21264                     start++;
21265                 }
21266                 continue;
21267             }
21268         } /* End of looking for literals */
21269
21270         /* Here is not to output as a literal.  Some control characters have
21271          * mnemonic names.  Split off any of those at the beginning and end of
21272          * the range to print mnemonically.  It isn't possible for many of
21273          * these to be in a row, so this won't overwhelm with output */
21274         if (   start <= end
21275             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
21276         {
21277             while (isMNEMONIC_CNTRL(start) && start <= end) {
21278                 put_code_point(sv, start);
21279                 start++;
21280             }
21281
21282             /* If this didn't take care of the whole range ... */
21283             if (start <= end) {
21284
21285                 /* Look backwards from the end to find the final non-mnemonic
21286                  * */
21287                 UV temp_end = end;
21288                 while (isMNEMONIC_CNTRL(temp_end)) {
21289                     temp_end--;
21290                 }
21291
21292                 /* And separately output the interior range that doesn't start
21293                  * or end with mnemonics */
21294                 put_range(sv, start, temp_end, FALSE);
21295
21296                 /* Then output the mnemonic trailing controls */
21297                 start = temp_end + 1;
21298                 while (start <= end) {
21299                     put_code_point(sv, start);
21300                     start++;
21301                 }
21302                 break;
21303             }
21304         }
21305
21306         /* As a final resort, output the range or subrange as hex. */
21307
21308         this_end = (end < NUM_ANYOF_CODE_POINTS)
21309                     ? end
21310                     : NUM_ANYOF_CODE_POINTS - 1;
21311 #if NUM_ANYOF_CODE_POINTS > 256
21312         format = (this_end < 256)
21313                  ? "\\x%02" UVXf "-\\x%02" UVXf
21314                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
21315 #else
21316         format = "\\x%02" UVXf "-\\x%02" UVXf;
21317 #endif
21318         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
21319         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
21320         GCC_DIAG_RESTORE_STMT;
21321         break;
21322     }
21323 }
21324
21325 STATIC void
21326 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
21327 {
21328     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
21329      * 'invlist' */
21330
21331     UV start, end;
21332     bool allow_literals = TRUE;
21333
21334     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
21335
21336     /* Generally, it is more readable if printable characters are output as
21337      * literals, but if a range (nearly) spans all of them, it's best to output
21338      * it as a single range.  This code will use a single range if all but 2
21339      * ASCII printables are in it */
21340     invlist_iterinit(invlist);
21341     while (invlist_iternext(invlist, &start, &end)) {
21342
21343         /* If the range starts beyond the final printable, it doesn't have any
21344          * in it */
21345         if (start > MAX_PRINT_A) {
21346             break;
21347         }
21348
21349         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
21350          * all but two, the range must start and end no later than 2 from
21351          * either end */
21352         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
21353             if (end > MAX_PRINT_A) {
21354                 end = MAX_PRINT_A;
21355             }
21356             if (start < ' ') {
21357                 start = ' ';
21358             }
21359             if (end - start >= MAX_PRINT_A - ' ' - 2) {
21360                 allow_literals = FALSE;
21361             }
21362             break;
21363         }
21364     }
21365     invlist_iterfinish(invlist);
21366
21367     /* Here we have figured things out.  Output each range */
21368     invlist_iterinit(invlist);
21369     while (invlist_iternext(invlist, &start, &end)) {
21370         if (start >= NUM_ANYOF_CODE_POINTS) {
21371             break;
21372         }
21373         put_range(sv, start, end, allow_literals);
21374     }
21375     invlist_iterfinish(invlist);
21376
21377     return;
21378 }
21379
21380 STATIC SV*
21381 S_put_charclass_bitmap_innards_common(pTHX_
21382         SV* invlist,            /* The bitmap */
21383         SV* posixes,            /* Under /l, things like [:word:], \S */
21384         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
21385         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
21386         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
21387         const bool invert       /* Is the result to be inverted? */
21388 )
21389 {
21390     /* Create and return an SV containing a displayable version of the bitmap
21391      * and associated information determined by the input parameters.  If the
21392      * output would have been only the inversion indicator '^', NULL is instead
21393      * returned. */
21394
21395     SV * output;
21396
21397     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
21398
21399     if (invert) {
21400         output = newSVpvs("^");
21401     }
21402     else {
21403         output = newSVpvs("");
21404     }
21405
21406     /* First, the code points in the bitmap that are unconditionally there */
21407     put_charclass_bitmap_innards_invlist(output, invlist);
21408
21409     /* Traditionally, these have been placed after the main code points */
21410     if (posixes) {
21411         sv_catsv(output, posixes);
21412     }
21413
21414     if (only_utf8 && _invlist_len(only_utf8)) {
21415         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
21416         put_charclass_bitmap_innards_invlist(output, only_utf8);
21417     }
21418
21419     if (not_utf8 && _invlist_len(not_utf8)) {
21420         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
21421         put_charclass_bitmap_innards_invlist(output, not_utf8);
21422     }
21423
21424     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
21425         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
21426         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
21427
21428         /* This is the only list in this routine that can legally contain code
21429          * points outside the bitmap range.  The call just above to
21430          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
21431          * output them here.  There's about a half-dozen possible, and none in
21432          * contiguous ranges longer than 2 */
21433         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21434             UV start, end;
21435             SV* above_bitmap = NULL;
21436
21437             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
21438
21439             invlist_iterinit(above_bitmap);
21440             while (invlist_iternext(above_bitmap, &start, &end)) {
21441                 UV i;
21442
21443                 for (i = start; i <= end; i++) {
21444                     put_code_point(output, i);
21445                 }
21446             }
21447             invlist_iterfinish(above_bitmap);
21448             SvREFCNT_dec_NN(above_bitmap);
21449         }
21450     }
21451
21452     if (invert && SvCUR(output) == 1) {
21453         return NULL;
21454     }
21455
21456     return output;
21457 }
21458
21459 STATIC bool
21460 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
21461                                      char *bitmap,
21462                                      SV *nonbitmap_invlist,
21463                                      SV *only_utf8_locale_invlist,
21464                                      const regnode * const node,
21465                                      const bool force_as_is_display)
21466 {
21467     /* Appends to 'sv' a displayable version of the innards of the bracketed
21468      * character class defined by the other arguments:
21469      *  'bitmap' points to the bitmap, or NULL if to ignore that.
21470      *  'nonbitmap_invlist' is an inversion list of the code points that are in
21471      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
21472      *      none.  The reasons for this could be that they require some
21473      *      condition such as the target string being or not being in UTF-8
21474      *      (under /d), or because they came from a user-defined property that
21475      *      was not resolved at the time of the regex compilation (under /u)
21476      *  'only_utf8_locale_invlist' is an inversion list of the code points that
21477      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
21478      *  'node' is the regex pattern ANYOF node.  It is needed only when the
21479      *      above two parameters are not null, and is passed so that this
21480      *      routine can tease apart the various reasons for them.
21481      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
21482      *      to invert things to see if that leads to a cleaner display.  If
21483      *      FALSE, this routine is free to use its judgment about doing this.
21484      *
21485      * It returns TRUE if there was actually something output.  (It may be that
21486      * the bitmap, etc is empty.)
21487      *
21488      * When called for outputting the bitmap of a non-ANYOF node, just pass the
21489      * bitmap, with the succeeding parameters set to NULL, and the final one to
21490      * FALSE.
21491      */
21492
21493     /* In general, it tries to display the 'cleanest' representation of the
21494      * innards, choosing whether to display them inverted or not, regardless of
21495      * whether the class itself is to be inverted.  However,  there are some
21496      * cases where it can't try inverting, as what actually matches isn't known
21497      * until runtime, and hence the inversion isn't either. */
21498     bool inverting_allowed = ! force_as_is_display;
21499
21500     int i;
21501     STRLEN orig_sv_cur = SvCUR(sv);
21502
21503     SV* invlist;            /* Inversion list we accumulate of code points that
21504                                are unconditionally matched */
21505     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
21506                                UTF-8 */
21507     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
21508                              */
21509     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
21510     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
21511                                        is UTF-8 */
21512
21513     SV* as_is_display;      /* The output string when we take the inputs
21514                                literally */
21515     SV* inverted_display;   /* The output string when we invert the inputs */
21516
21517     U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
21518
21519     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
21520                                                    to match? */
21521     /* We are biased in favor of displaying things without them being inverted,
21522      * as that is generally easier to understand */
21523     const int bias = 5;
21524
21525     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
21526
21527     /* Start off with whatever code points are passed in.  (We clone, so we
21528      * don't change the caller's list) */
21529     if (nonbitmap_invlist) {
21530         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
21531         invlist = invlist_clone(nonbitmap_invlist, NULL);
21532     }
21533     else {  /* Worst case size is every other code point is matched */
21534         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
21535     }
21536
21537     if (flags) {
21538         if (OP(node) == ANYOFD) {
21539
21540             /* This flag indicates that the code points below 0x100 in the
21541              * nonbitmap list are precisely the ones that match only when the
21542              * target is UTF-8 (they should all be non-ASCII). */
21543             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
21544             {
21545                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
21546                 _invlist_subtract(invlist, only_utf8, &invlist);
21547             }
21548
21549             /* And this flag for matching all non-ASCII 0xFF and below */
21550             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
21551             {
21552                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
21553             }
21554         }
21555         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
21556
21557             /* If either of these flags are set, what matches isn't
21558              * determinable except during execution, so don't know enough here
21559              * to invert */
21560             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
21561                 inverting_allowed = FALSE;
21562             }
21563
21564             /* What the posix classes match also varies at runtime, so these
21565              * will be output symbolically. */
21566             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
21567                 int i;
21568
21569                 posixes = newSVpvs("");
21570                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
21571                     if (ANYOF_POSIXL_TEST(node, i)) {
21572                         sv_catpv(posixes, anyofs[i]);
21573                     }
21574                 }
21575             }
21576         }
21577     }
21578
21579     /* Accumulate the bit map into the unconditional match list */
21580     if (bitmap) {
21581         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
21582             if (BITMAP_TEST(bitmap, i)) {
21583                 int start = i++;
21584                 for (;
21585                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
21586                      i++)
21587                 { /* empty */ }
21588                 invlist = _add_range_to_invlist(invlist, start, i-1);
21589             }
21590         }
21591     }
21592
21593     /* Make sure that the conditional match lists don't have anything in them
21594      * that match unconditionally; otherwise the output is quite confusing.
21595      * This could happen if the code that populates these misses some
21596      * duplication. */
21597     if (only_utf8) {
21598         _invlist_subtract(only_utf8, invlist, &only_utf8);
21599     }
21600     if (not_utf8) {
21601         _invlist_subtract(not_utf8, invlist, &not_utf8);
21602     }
21603
21604     if (only_utf8_locale_invlist) {
21605
21606         /* Since this list is passed in, we have to make a copy before
21607          * modifying it */
21608         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
21609
21610         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
21611
21612         /* And, it can get really weird for us to try outputting an inverted
21613          * form of this list when it has things above the bitmap, so don't even
21614          * try */
21615         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21616             inverting_allowed = FALSE;
21617         }
21618     }
21619
21620     /* Calculate what the output would be if we take the input as-is */
21621     as_is_display = put_charclass_bitmap_innards_common(invlist,
21622                                                     posixes,
21623                                                     only_utf8,
21624                                                     not_utf8,
21625                                                     only_utf8_locale,
21626                                                     invert);
21627
21628     /* If have to take the output as-is, just do that */
21629     if (! inverting_allowed) {
21630         if (as_is_display) {
21631             sv_catsv(sv, as_is_display);
21632             SvREFCNT_dec_NN(as_is_display);
21633         }
21634     }
21635     else { /* But otherwise, create the output again on the inverted input, and
21636               use whichever version is shorter */
21637
21638         int inverted_bias, as_is_bias;
21639
21640         /* We will apply our bias to whichever of the the results doesn't have
21641          * the '^' */
21642         if (invert) {
21643             invert = FALSE;
21644             as_is_bias = bias;
21645             inverted_bias = 0;
21646         }
21647         else {
21648             invert = TRUE;
21649             as_is_bias = 0;
21650             inverted_bias = bias;
21651         }
21652
21653         /* Now invert each of the lists that contribute to the output,
21654          * excluding from the result things outside the possible range */
21655
21656         /* For the unconditional inversion list, we have to add in all the
21657          * conditional code points, so that when inverted, they will be gone
21658          * from it */
21659         _invlist_union(only_utf8, invlist, &invlist);
21660         _invlist_union(not_utf8, invlist, &invlist);
21661         _invlist_union(only_utf8_locale, invlist, &invlist);
21662         _invlist_invert(invlist);
21663         _invlist_intersection(invlist, PL_InBitmap, &invlist);
21664
21665         if (only_utf8) {
21666             _invlist_invert(only_utf8);
21667             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
21668         }
21669         else if (not_utf8) {
21670
21671             /* If a code point matches iff the target string is not in UTF-8,
21672              * then complementing the result has it not match iff not in UTF-8,
21673              * which is the same thing as matching iff it is UTF-8. */
21674             only_utf8 = not_utf8;
21675             not_utf8 = NULL;
21676         }
21677
21678         if (only_utf8_locale) {
21679             _invlist_invert(only_utf8_locale);
21680             _invlist_intersection(only_utf8_locale,
21681                                   PL_InBitmap,
21682                                   &only_utf8_locale);
21683         }
21684
21685         inverted_display = put_charclass_bitmap_innards_common(
21686                                             invlist,
21687                                             posixes,
21688                                             only_utf8,
21689                                             not_utf8,
21690                                             only_utf8_locale, invert);
21691
21692         /* Use the shortest representation, taking into account our bias
21693          * against showing it inverted */
21694         if (   inverted_display
21695             && (   ! as_is_display
21696                 || (  SvCUR(inverted_display) + inverted_bias
21697                     < SvCUR(as_is_display)    + as_is_bias)))
21698         {
21699             sv_catsv(sv, inverted_display);
21700         }
21701         else if (as_is_display) {
21702             sv_catsv(sv, as_is_display);
21703         }
21704
21705         SvREFCNT_dec(as_is_display);
21706         SvREFCNT_dec(inverted_display);
21707     }
21708
21709     SvREFCNT_dec_NN(invlist);
21710     SvREFCNT_dec(only_utf8);
21711     SvREFCNT_dec(not_utf8);
21712     SvREFCNT_dec(posixes);
21713     SvREFCNT_dec(only_utf8_locale);
21714
21715     return SvCUR(sv) > orig_sv_cur;
21716 }
21717
21718 #define CLEAR_OPTSTART                                                       \
21719     if (optstart) STMT_START {                                               \
21720         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
21721                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
21722         optstart=NULL;                                                       \
21723     } STMT_END
21724
21725 #define DUMPUNTIL(b,e)                                                       \
21726                     CLEAR_OPTSTART;                                          \
21727                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
21728
21729 STATIC const regnode *
21730 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
21731             const regnode *last, const regnode *plast,
21732             SV* sv, I32 indent, U32 depth)
21733 {
21734     U8 op = PSEUDO;     /* Arbitrary non-END op. */
21735     const regnode *next;
21736     const regnode *optstart= NULL;
21737
21738     RXi_GET_DECL(r, ri);
21739     GET_RE_DEBUG_FLAGS_DECL;
21740
21741     PERL_ARGS_ASSERT_DUMPUNTIL;
21742
21743 #ifdef DEBUG_DUMPUNTIL
21744     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
21745         last ? last-start : 0, plast ? plast-start : 0);
21746 #endif
21747
21748     if (plast && plast < last)
21749         last= plast;
21750
21751     while (PL_regkind[op] != END && (!last || node < last)) {
21752         assert(node);
21753         /* While that wasn't END last time... */
21754         NODE_ALIGN(node);
21755         op = OP(node);
21756         if (op == CLOSE || op == SRCLOSE || op == WHILEM)
21757             indent--;
21758         next = regnext((regnode *)node);
21759
21760         /* Where, what. */
21761         if (OP(node) == OPTIMIZED) {
21762             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
21763                 optstart = node;
21764             else
21765                 goto after_print;
21766         } else
21767             CLEAR_OPTSTART;
21768
21769         regprop(r, sv, node, NULL, NULL);
21770         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
21771                       (int)(2*indent + 1), "", SvPVX_const(sv));
21772
21773         if (OP(node) != OPTIMIZED) {
21774             if (next == NULL)           /* Next ptr. */
21775                 Perl_re_printf( aTHX_  " (0)");
21776             else if (PL_regkind[(U8)op] == BRANCH
21777                      && PL_regkind[OP(next)] != BRANCH )
21778                 Perl_re_printf( aTHX_  " (FAIL)");
21779             else
21780                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
21781             Perl_re_printf( aTHX_ "\n");
21782         }
21783
21784       after_print:
21785         if (PL_regkind[(U8)op] == BRANCHJ) {
21786             assert(next);
21787             {
21788                 const regnode *nnode = (OP(next) == LONGJMP
21789                                        ? regnext((regnode *)next)
21790                                        : next);
21791                 if (last && nnode > last)
21792                     nnode = last;
21793                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
21794             }
21795         }
21796         else if (PL_regkind[(U8)op] == BRANCH) {
21797             assert(next);
21798             DUMPUNTIL(NEXTOPER(node), next);
21799         }
21800         else if ( PL_regkind[(U8)op]  == TRIE ) {
21801             const regnode *this_trie = node;
21802             const char op = OP(node);
21803             const U32 n = ARG(node);
21804             const reg_ac_data * const ac = op>=AHOCORASICK ?
21805                (reg_ac_data *)ri->data->data[n] :
21806                NULL;
21807             const reg_trie_data * const trie =
21808                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
21809 #ifdef DEBUGGING
21810             AV *const trie_words
21811                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
21812 #endif
21813             const regnode *nextbranch= NULL;
21814             I32 word_idx;
21815             SvPVCLEAR(sv);
21816             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
21817                 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
21818
21819                 Perl_re_indentf( aTHX_  "%s ",
21820                     indent+3,
21821                     elem_ptr
21822                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
21823                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
21824                                 PL_colors[0], PL_colors[1],
21825                                 (SvUTF8(*elem_ptr)
21826                                  ? PERL_PV_ESCAPE_UNI
21827                                  : 0)
21828                                 | PERL_PV_PRETTY_ELLIPSES
21829                                 | PERL_PV_PRETTY_LTGT
21830                             )
21831                     : "???"
21832                 );
21833                 if (trie->jump) {
21834                     U16 dist= trie->jump[word_idx+1];
21835                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
21836                                (UV)((dist ? this_trie + dist : next) - start));
21837                     if (dist) {
21838                         if (!nextbranch)
21839                             nextbranch= this_trie + trie->jump[0];
21840                         DUMPUNTIL(this_trie + dist, nextbranch);
21841                     }
21842                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
21843                         nextbranch= regnext((regnode *)nextbranch);
21844                 } else {
21845                     Perl_re_printf( aTHX_  "\n");
21846                 }
21847             }
21848             if (last && next > last)
21849                 node= last;
21850             else
21851                 node= next;
21852         }
21853         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
21854             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
21855                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
21856         }
21857         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
21858             assert(next);
21859             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
21860         }
21861         else if ( op == PLUS || op == STAR) {
21862             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
21863         }
21864         else if (PL_regkind[(U8)op] == EXACT) {
21865             /* Literal string, where present. */
21866             node += NODE_SZ_STR(node) - 1;
21867             node = NEXTOPER(node);
21868         }
21869         else {
21870             node = NEXTOPER(node);
21871             node += regarglen[(U8)op];
21872         }
21873         if (op == CURLYX || op == OPEN || op == SROPEN)
21874             indent++;
21875     }
21876     CLEAR_OPTSTART;
21877 #ifdef DEBUG_DUMPUNTIL
21878     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
21879 #endif
21880     return node;
21881 }
21882
21883 #endif  /* DEBUGGING */
21884
21885 #ifndef PERL_IN_XSUB_RE
21886
21887 #include "uni_keywords.h"
21888
21889 void
21890 Perl_init_uniprops(pTHX)
21891 {
21892     /* Set up the inversion list global variables */
21893
21894     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
21895     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
21896     PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
21897     PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
21898     PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
21899     PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
21900     PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
21901     PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
21902     PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
21903     PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
21904     PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
21905     PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
21906     PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
21907     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
21908     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
21909     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
21910
21911     PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
21912     PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
21913     PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
21914     PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
21915     PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
21916     PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
21917     PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
21918     PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
21919     PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
21920     PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
21921     PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
21922     PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
21923     PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
21924     PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
21925     PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
21926     PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
21927
21928     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
21929     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
21930     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
21931     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
21932     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
21933
21934     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
21935     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
21936     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
21937
21938     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
21939
21940     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
21941     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
21942
21943     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
21944     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
21945
21946     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
21947     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
21948                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
21949     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
21950                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
21951     PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[
21952                                             UNI__PERL_NON_FINAL_FOLDS]);
21953
21954     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
21955     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
21956     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
21957     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
21958     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
21959     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
21960     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
21961     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
21962
21963 #ifdef UNI_XIDC
21964     /* The below are used only by deprecated functions.  They could be removed */
21965     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
21966     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
21967     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
21968 #endif
21969 }
21970
21971 SV *
21972 Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len,
21973                                 const bool to_fold, bool * invert)
21974 {
21975     /* Parse the interior meat of \p{} passed to this in 'name' with length
21976      * 'name_len', and return an inversion list if a property with 'name' is
21977      * found, or NULL if not.  'name' point to the input with leading and
21978      * trailing space trimmed.  'to_fold' indicates if /i is in effect.
21979      *
21980      * When the return is an inversion list, '*invert' will be set to a boolean
21981      * indicating if it should be inverted or not
21982      *
21983      * This currently doesn't handle all cases.  A NULL return indicates the
21984      * caller should try a different approach
21985      */
21986
21987     char* lookup_name;
21988     bool stricter = FALSE;
21989     bool is_nv_type = FALSE;         /* nv= or numeric_value=, or possibly one
21990                                         of the cjk numeric properties (though
21991                                         it requires extra effort to compile
21992                                         them) */
21993     unsigned int i;
21994     unsigned int j = 0, lookup_len;
21995     int equals_pos = -1;        /* Where the '=' is found, or negative if none */
21996     int slash_pos = -1;        /* Where the '/' is found, or negative if none */
21997     int table_index = 0;
21998     bool starts_with_In_or_Is = FALSE;
21999     Size_t lookup_offset = 0;
22000
22001     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
22002
22003     /* The input will be modified into 'lookup_name' */
22004     Newx(lookup_name, name_len, char);
22005     SAVEFREEPV(lookup_name);
22006
22007     /* Parse the input. */
22008     for (i = 0; i < name_len; i++) {
22009         char cur = name[i];
22010
22011         /* These characters can be freely ignored in most situations.  Later it
22012          * may turn out we shouldn't have ignored them, and we have to reparse,
22013          * but we don't have enough information yet to make that decision */
22014         if (cur == '-' || cur == '_' || isSPACE_A(cur)) {
22015             continue;
22016         }
22017
22018         /* Case differences are also ignored.  Our lookup routine assumes
22019          * everything is lowercase */
22020         if (isUPPER_A(cur)) {
22021             lookup_name[j++] = toLOWER(cur);
22022             continue;
22023         }
22024
22025         /* A double colon is either an error, or a package qualifier to a
22026          * subroutine user-defined property; neither of which do we currently
22027          * handle
22028          *
22029          * But a single colon is a synonym for '=' */
22030         if (cur == ':') {
22031             if (i < name_len - 1 && name[i+1] == ':') {
22032                 return NULL;
22033             }
22034             cur = '=';
22035         }
22036
22037         /* Otherwise, this character is part of the name. */
22038         lookup_name[j++] = cur;
22039
22040         /* Only the equals sign needs further processing */
22041         if (cur == '=') {
22042             equals_pos = j; /* Note where it occurred in the input */
22043             break;
22044         }
22045     }
22046
22047     /* Here, we are either done with the whole property name, if it was simple;
22048      * or are positioned just after the '=' if it is compound. */
22049
22050     if (equals_pos >= 0) {
22051         assert(! stricter); /* We shouldn't have set this yet */
22052
22053         /* Space immediately after the '=' is ignored */
22054         i++;
22055         for (; i < name_len; i++) {
22056             if (! isSPACE_A(name[i])) {
22057                 break;
22058             }
22059         }
22060
22061         /* Certain properties need special handling.  They may optionally be
22062          * prefixed by 'is'.  Ignore that prefix for the purposes of checking
22063          * if this is one of those properties */
22064         if (memBEGINPs(lookup_name, name_len, "is")) {
22065             lookup_offset = 2;
22066         }
22067
22068         /* Then check if it is one of these properties.  This is hard-coded
22069          * because easier this way, and the list is unlikely to change.  There
22070          * are several properties like this in the Unihan DB, which is unlikely
22071          * to be compiled, and they all end with 'numeric'.  The interiors
22072          * aren't checked for the precise property.  This would stop working if
22073          * a cjk property were to be created that ended with 'numeric' and
22074          * wasn't a numeric type */
22075         is_nv_type = memEQs(lookup_name + lookup_offset,
22076                        j - 1 - lookup_offset, "numericvalue")
22077                   || memEQs(lookup_name + lookup_offset,
22078                       j - 1 - lookup_offset, "nv")
22079                   || (   memENDPs(lookup_name + lookup_offset,
22080                             j - 1 - lookup_offset, "numeric")
22081                       && (   memBEGINPs(lookup_name + lookup_offset,
22082                                       j - 1 - lookup_offset, "cjk")
22083                           || memBEGINPs(lookup_name + lookup_offset,
22084                                       j - 1 - lookup_offset, "k")));
22085         if (   is_nv_type
22086             || memEQs(lookup_name + lookup_offset,
22087                       j - 1 - lookup_offset, "canonicalcombiningclass")
22088             || memEQs(lookup_name + lookup_offset,
22089                       j - 1 - lookup_offset, "ccc")
22090             || memEQs(lookup_name + lookup_offset,
22091                       j - 1 - lookup_offset, "age")
22092             || memEQs(lookup_name + lookup_offset,
22093                       j - 1 - lookup_offset, "in")
22094             || memEQs(lookup_name + lookup_offset,
22095                       j - 1 - lookup_offset, "presentin"))
22096         {
22097             unsigned int k;
22098
22099             /* What makes these properties special is that the stuff after the
22100              * '=' is a number.  Therefore, we can't throw away '-'
22101              * willy-nilly, as those could be a minus sign.  Other stricter
22102              * rules also apply.  However, these properties all can have the
22103              * rhs not be a number, in which case they contain at least one
22104              * alphabetic.  In those cases, the stricter rules don't apply.
22105              * But the numeric type properties can have the alphas [Ee] to
22106              * signify an exponent, and it is still a number with stricter
22107              * rules.  So look for an alpha that signifys not-strict */
22108             stricter = TRUE;
22109             for (k = i; k < name_len; k++) {
22110                 if (   isALPHA_A(name[k])
22111                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
22112                 {
22113                     stricter = FALSE;
22114                     break;
22115                 }
22116             }
22117         }
22118
22119         if (stricter) {
22120
22121             /* A number may have a leading '+' or '-'.  The latter is retained
22122              * */
22123             if (name[i] == '+') {
22124                 i++;
22125             }
22126             else if (name[i] == '-') {
22127                 lookup_name[j++] = '-';
22128                 i++;
22129             }
22130
22131             /* Skip leading zeros including single underscores separating the
22132              * zeros, or between the final leading zero and the first other
22133              * digit */
22134             for (; i < name_len - 1; i++) {
22135                 if (   name[i] != '0'
22136                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
22137                 {
22138                     break;
22139                 }
22140             }
22141         }
22142     }
22143     else {  /* No '=' */
22144
22145        /* We are now in a position to determine if this property should have
22146         * been parsed using stricter rules.  Only a few are like that, and
22147         * unlikely to change. */
22148         if (   memBEGINPs(lookup_name, j, "perl")
22149             && memNEs(lookup_name + 4, j - 4, "space")
22150             && memNEs(lookup_name + 4, j - 4, "word"))
22151         {
22152             stricter = TRUE;
22153
22154             /* We set the inputs back to 0 and the code below will reparse,
22155              * using strict */
22156             i = j = 0;
22157         }
22158     }
22159
22160     /* Here, we have either finished the property, or are positioned to parse
22161      * the remainder, and we know if stricter rules apply.  Finish out, if not
22162      * already done */
22163     for (; i < name_len; i++) {
22164         char cur = name[i];
22165
22166         /* In all instances, case differences are ignored, and we normalize to
22167          * lowercase */
22168         if (isUPPER_A(cur)) {
22169             lookup_name[j++] = toLOWER(cur);
22170             continue;
22171         }
22172
22173         /* An underscore is skipped, but not under strict rules unless it
22174          * separates two digits */
22175         if (cur == '_') {
22176             if (    stricter
22177                 && (     i == 0 || (int) i == equals_pos || i == name_len- 1
22178                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
22179             {
22180                 lookup_name[j++] = '_';
22181             }
22182             continue;
22183         }
22184
22185         /* Hyphens are skipped except under strict */
22186         if (cur == '-' && ! stricter) {
22187             continue;
22188         }
22189
22190         /* XXX Bug in documentation.  It says white space skipped adjacent to
22191          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
22192          * in a number */
22193         if (isSPACE_A(cur) && ! stricter) {
22194             continue;
22195         }
22196
22197         lookup_name[j++] = cur;
22198
22199         /* Unless this is a non-trailing slash, we are done with it */
22200         if (i >= name_len - 1 || cur != '/') {
22201             continue;
22202         }
22203
22204         slash_pos = j;
22205
22206         /* A slash in the 'numeric value' property indicates that what follows
22207          * is a denominator.  It can have a leading '+' and '0's that should be
22208          * skipped.  But we have never allowed a negative denominator, so treat
22209          * a minus like every other character.  (No need to rule out a second
22210          * '/', as that won't match anything anyway */
22211         if (is_nv_type) {
22212             i++;
22213             if (i < name_len && name[i] == '+') {
22214                 i++;
22215             }
22216
22217             /* Skip leading zeros including underscores separating digits */
22218             for (; i < name_len - 1; i++) {
22219                 if (   name[i] != '0'
22220                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
22221                 {
22222                     break;
22223                 }
22224             }
22225
22226             /* Store the first real character in the denominator */
22227             lookup_name[j++] = name[i];
22228         }
22229     }
22230
22231     /* Here are completely done parsing the input 'name', and 'lookup_name'
22232      * contains a copy, normalized.
22233      *
22234      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
22235      * different from without the underscores.  */
22236     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
22237            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
22238         && UNLIKELY(name[name_len-1] == '_'))
22239     {
22240         lookup_name[j++] = '&';
22241     }
22242     else if (name_len > 2 && name[0] == 'I' && (   name[1] == 'n'
22243                                                 || name[1] == 's'))
22244     {
22245
22246         /* Also, if the original input began with 'In' or 'Is', it could be a
22247          * subroutine call instead of a property names, which currently isn't
22248          * handled by this function.  Subroutine calls can't happen if there is
22249          * an '=' in the name */
22250         if (equals_pos < 0 && get_cvn_flags(name, name_len, GV_NOTQUAL) != NULL)
22251         {
22252             return NULL;
22253         }
22254
22255         starts_with_In_or_Is = TRUE;
22256     }
22257
22258     lookup_len = j;     /* Use a more mnemonic name starting here */
22259
22260     /* Get the index into our pointer table of the inversion list corresponding
22261      * to the property */
22262     table_index = match_uniprop((U8 *) lookup_name, lookup_len);
22263
22264     /* If it didn't find the property */
22265     if (table_index == 0) {
22266
22267         /* If didn't find the property, we try again stripping off any initial
22268          * 'In' or 'Is' */
22269         if (starts_with_In_or_Is) {
22270             lookup_name += 2;
22271             lookup_len -= 2;
22272             equals_pos -= 2;
22273             slash_pos -= 2;
22274
22275             table_index = match_uniprop((U8 *) lookup_name, lookup_len);
22276         }
22277
22278         if (table_index == 0) {
22279             char * canonical;
22280
22281             /* If not found, and not a numeric type property, isn't a legal
22282              * property */
22283             if (! is_nv_type) {
22284                 return NULL;
22285             }
22286
22287             /* But the numeric type properties need more work to decide.  What
22288              * we do is make sure we have the number in canonical form and look
22289              * that up. */
22290
22291             if (slash_pos < 0) {    /* No slash */
22292
22293                 /* When it isn't a rational, take the input, convert it to a
22294                  * NV, then create a canonical string representation of that
22295                  * NV. */
22296
22297                 NV value;
22298
22299                 /* Get the value */
22300                 if (my_atof3(lookup_name + equals_pos, &value,
22301                              lookup_len - equals_pos)
22302                           != lookup_name + lookup_len)
22303                 {
22304                     return NULL;
22305                 }
22306
22307                 /* If the value is an integer, the canonical value is integral */
22308                 if (Perl_ceil(value) == value) {
22309                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
22310                                                 equals_pos, lookup_name, value);
22311                 }
22312                 else {  /* Otherwise, it is %e with a known precision */
22313                     char * exp_ptr;
22314
22315                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
22316                                                 equals_pos, lookup_name,
22317                                                 PL_E_FORMAT_PRECISION, value);
22318
22319                     /* The exponent generated is expecting two digits, whereas
22320                      * %e on some systems will generate three.  Remove leading
22321                      * zeros in excess of 2 from the exponent.  We start
22322                      * looking for them after the '=' */
22323                     exp_ptr = strchr(canonical + equals_pos, 'e');
22324                     if (exp_ptr) {
22325                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
22326                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
22327
22328                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
22329
22330                         if (excess_exponent_len > 0) {
22331                             SSize_t leading_zeros = strspn(cur_ptr, "0");
22332                             SSize_t excess_leading_zeros
22333                                     = MIN(leading_zeros, excess_exponent_len);
22334                             if (excess_leading_zeros > 0) {
22335                                 Move(cur_ptr + excess_leading_zeros,
22336                                      cur_ptr,
22337                                      strlen(cur_ptr) - excess_leading_zeros
22338                                        + 1,  /* Copy the NUL as well */
22339                                      char);
22340                             }
22341                         }
22342                     }
22343                 }
22344             }
22345             else {  /* Has a slash.  Create a rational in canonical form  */
22346                 UV numerator, denominator, gcd, trial;
22347                 const char * end_ptr;
22348                 const char * sign = "";
22349
22350                 /* We can't just find the numerator, denominator, and do the
22351                  * division, then use the method above, because that is
22352                  * inexact.  And the input could be a rational that is within
22353                  * epsilon (given our precision) of a valid rational, and would
22354                  * then incorrectly compare valid.
22355                  *
22356                  * We're only interested in the part after the '=' */
22357                 const char * this_lookup_name = lookup_name + equals_pos;
22358                 lookup_len -= equals_pos;
22359                 slash_pos -= equals_pos;
22360
22361                 /* Handle any leading minus */
22362                 if (this_lookup_name[0] == '-') {
22363                     sign = "-";
22364                     this_lookup_name++;
22365                     lookup_len--;
22366                     slash_pos--;
22367                 }
22368
22369                 /* Convert the numerator to numeric */
22370                 end_ptr = this_lookup_name + slash_pos;
22371                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
22372                     return NULL;
22373                 }
22374
22375                 /* It better have included all characters before the slash */
22376                 if (*end_ptr != '/') {
22377                     return NULL;
22378                 }
22379
22380                 /* Set to look at just the denominator */
22381                 this_lookup_name += slash_pos;
22382                 lookup_len -= slash_pos;
22383                 end_ptr = this_lookup_name + lookup_len;
22384
22385                 /* Convert the denominator to numeric */
22386                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
22387                     return NULL;
22388                 }
22389
22390                 /* It better be the rest of the characters, and don't divide by
22391                  * 0 */
22392                 if (   end_ptr != this_lookup_name + lookup_len
22393                     || denominator == 0)
22394                 {
22395                     return NULL;
22396                 }
22397
22398                 /* Get the greatest common denominator using
22399                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
22400                 gcd = numerator;
22401                 trial = denominator;
22402                 while (trial != 0) {
22403                     UV temp = trial;
22404                     trial = gcd % trial;
22405                     gcd = temp;
22406                 }
22407
22408                 /* If already in lowest possible terms, we have already tried
22409                  * looking this up */
22410                 if (gcd == 1) {
22411                     return NULL;
22412                 }
22413
22414                 /* Reduce the rational, which should put it in canonical form.
22415                  * Then look it up */
22416                 numerator /= gcd;
22417                 denominator /= gcd;
22418
22419                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
22420                         equals_pos, lookup_name, sign, numerator, denominator);
22421             }
22422
22423             /* Here, we have the number in canonical form.  Try that */
22424             table_index = match_uniprop((U8 *) canonical, strlen(canonical));
22425             if (table_index == 0) {
22426                 return NULL;
22427             }
22428         }
22429     }
22430
22431     /* The return is an index into a table of ptrs.  A negative return
22432      * signifies that the real index is the absolute value, but the result
22433      * needs to be inverted */
22434     if (table_index < 0) {
22435         *invert = TRUE;
22436         table_index = -table_index;
22437     }
22438     else {
22439         *invert = FALSE;
22440     }
22441
22442     /* Out-of band indices indicate a deprecated property.  The proper index is
22443      * modulo it with the table size.  And dividing by the table size yields
22444      * an offset into a table constructed to contain the corresponding warning
22445      * message */
22446     if (table_index > MAX_UNI_KEYWORD_INDEX) {
22447         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
22448         table_index %= MAX_UNI_KEYWORD_INDEX;
22449         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
22450                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
22451                 (int) name_len, name, deprecated_property_msgs[warning_offset]);
22452     }
22453
22454     /* In a few properties, a different property is used under /i.  These are
22455      * unlikely to change, so are hard-coded here. */
22456     if (to_fold) {
22457         if (   table_index == UNI_XPOSIXUPPER
22458             || table_index == UNI_XPOSIXLOWER
22459             || table_index == UNI_TITLE)
22460         {
22461             table_index = UNI_CASED;
22462         }
22463         else if (   table_index == UNI_UPPERCASELETTER
22464                  || table_index == UNI_LOWERCASELETTER
22465 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
22466                  || table_index == UNI_TITLECASELETTER
22467 #  endif
22468         ) {
22469             table_index = UNI_CASEDLETTER;
22470         }
22471         else if (  table_index == UNI_POSIXUPPER
22472                 || table_index == UNI_POSIXLOWER)
22473         {
22474             table_index = UNI_POSIXALPHA;
22475         }
22476     }
22477
22478     /* Create and return the inversion list */
22479     return _new_invlist_C_array(uni_prop_ptrs[table_index]);
22480 }
22481
22482 #endif
22483
22484 /*
22485  * ex: set ts=8 sts=4 sw=4 et:
22486  */