regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #define REG_COMP_C
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 EXTERN_C const struct regexp_engine my_reg_engine;
  81 #else
  82 #  include "regcomp.h"
  83 #endif
  84
  85 #include "dquote_inline.h"
  86 #include "invlist_inline.h"
  87 #include "unicode_constants.h"
  88
  89 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  90  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  91 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  92  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  94 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  95
  96 #ifndef STATIC
  97 #define STATIC  static
  98 #endif
  99
 100 /* this is a chain of data about sub patterns we are processing that
 101    need to be handled separately/specially in study_chunk. Its so
 102    we can simulate recursion without losing state.  */
 103 struct scan_frame;
 104 typedef struct scan_frame {
 105     regnode *last_regnode;      /* last node to process in this frame */
 106     regnode *next_regnode;      /* next node to process when last is reached */
 107     U32 prev_recursed_depth;
 108     I32 stopparen;              /* what stopparen do we use */
 109
 110     struct scan_frame *this_prev_frame; /* this previous frame */
 111     struct scan_frame *prev_frame;      /* previous frame */
 112     struct scan_frame *next_frame;      /* next frame */
 113 } scan_frame;
 114
 115 /* Certain characters are output as a sequence with the first being a
 116  * backslash. */
 117 #define isBACKSLASHED_PUNCT(c)  strchr("-[]\\^", c)
 118
 119
 120 struct RExC_state_t {
 121     U32         flags;                  /* RXf_* are we folding, multilining? */
 122     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 123     char        *precomp;               /* uncompiled string. */
 124     char        *precomp_end;           /* pointer to end of uncompiled string. */
 125     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 126     regexp      *rx;                    /* perl core regexp structure */
 127     regexp_internal     *rxi;           /* internal data for regexp object
 128                                            pprivate field */
 129     char        *start;                 /* Start of input for compile */
 130     char        *end;                   /* End of input for compile */
 131     char        *parse;                 /* Input-scan pointer. */
 132     char        *copy_start;            /* start of copy of input within
 133                                            constructed parse string */
 134     char        *save_copy_start;       /* Provides one level of saving
 135                                            and restoring 'copy_start' */
 136     char        *copy_start_in_input;   /* Position in input string
 137                                            corresponding to copy_start */
 138     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 139     regnode     *emit_start;            /* Start of emitted-code area */
 140     regnode_offset emit;                /* Code-emit pointer */
 141     I32         naughty;                /* How bad is this pattern? */
 142     I32         sawback;                /* Did we see \1, ...? */
 143     U32         seen;
 144     SSize_t     size;                   /* Number of regnode equivalents in
 145                                            pattern */
 146
 147     /* position beyond 'precomp' of the warning message furthest away from
 148      * 'precomp'.  During the parse, no warnings are raised for any problems
 149      * earlier in the parse than this position.  This works if warnings are
 150      * raised the first time a given spot is parsed, and if only one
 151      * independent warning is raised for any given spot */
 152     Size_t      latest_warn_offset;
 153
 154     I32         npar;                   /* Capture buffer count so far in the
 155                                            parse, (OPEN) plus one. ("par" 0 is
 156                                            the whole pattern)*/
 157     I32         total_par;              /* During initial parse, is either 0,
 158                                            or -1; the latter indicating a
 159                                            reparse is needed.  After that pass,
 160                                            it is what 'npar' became after the
 161                                            pass.  Hence, it being > 0 indicates
 162                                            we are in a reparse situation */
 163     I32         nestroot;               /* root parens we are in - used by
 164                                            accept */
 165     I32         seen_zerolen;
 166     regnode_offset *open_parens;        /* offsets to open parens */
 167     regnode_offset *close_parens;       /* offsets to close parens */
 168     I32      parens_buf_size;           /* #slots malloced open/close_parens */
 169     regnode     *end_op;                /* END node in program */
 170     I32         utf8;           /* whether the pattern is utf8 or not */
 171     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 172                                 /* XXX use this for future optimisation of case
 173                                  * where pattern must be upgraded to utf8. */
 174     I32         uni_semantics;  /* If a d charset modifier should use unicode
 175                                    rules, even if the pattern is not in
 176                                    utf8 */
 177     HV          *paren_names;           /* Paren names */
 178
 179     regnode     **recurse;              /* Recurse regops */
 180     I32         recurse_count;          /* Number of recurse regops we have generated */
 181     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
 182                                            through */
 183     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 184     I32         in_lookbehind;
 185     I32         in_lookahead;
 186     I32         contains_locale;
 187     I32         override_recoding;
 188     I32         recode_x_to_native;
 189     I32         in_multi_char_class;
 190     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
 191                                             within pattern */
 192     int         code_index;             /* next code_blocks[] slot */
 193     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 194     scan_frame *frame_head;
 195     scan_frame *frame_last;
 196     U32         frame_count;
 197     AV         *warn_text;
 198     HV         *unlexed_names;
 199 #ifdef ADD_TO_REGEXEC
 200     char        *starttry;              /* -Dr: where regtry was called. */
 201 #define RExC_starttry   (pRExC_state->starttry)
 202 #endif
 203     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 204 #ifdef DEBUGGING
 205     const char  *lastparse;
 206     I32         lastnum;
 207     AV          *paren_name_list;       /* idx -> name */
 208     U32         study_chunk_recursed_count;
 209     SV          *mysv1;
 210     SV          *mysv2;
 211
 212 #define RExC_lastparse  (pRExC_state->lastparse)
 213 #define RExC_lastnum    (pRExC_state->lastnum)
 214 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 215 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
 216 #define RExC_mysv       (pRExC_state->mysv1)
 217 #define RExC_mysv1      (pRExC_state->mysv1)
 218 #define RExC_mysv2      (pRExC_state->mysv2)
 219
 220 #endif
 221     bool        seen_d_op;
 222     bool        strict;
 223     bool        study_started;
 224     bool        in_script_run;
 225     bool        use_BRANCHJ;
 226 };
 227
 228 #define RExC_flags      (pRExC_state->flags)
 229 #define RExC_pm_flags   (pRExC_state->pm_flags)
 230 #define RExC_precomp    (pRExC_state->precomp)
 231 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
 232 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
 233 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
 234 #define RExC_precomp_end (pRExC_state->precomp_end)
 235 #define RExC_rx_sv      (pRExC_state->rx_sv)
 236 #define RExC_rx         (pRExC_state->rx)
 237 #define RExC_rxi        (pRExC_state->rxi)
 238 #define RExC_start      (pRExC_state->start)
 239 #define RExC_end        (pRExC_state->end)
 240 #define RExC_parse      (pRExC_state->parse)
 241 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
 242 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 243 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
 244                                                    under /d from /u ? */
 245
 246 #ifdef RE_TRACK_PATTERN_OFFSETS
 247 #  define RExC_offsets  (RExC_rxi->u.offsets) /* I am not like the
 248                                                          others */
 249 #endif
 250 #define RExC_emit       (pRExC_state->emit)
 251 #define RExC_emit_start (pRExC_state->emit_start)
 252 #define RExC_sawback    (pRExC_state->sawback)
 253 #define RExC_seen       (pRExC_state->seen)
 254 #define RExC_size       (pRExC_state->size)
 255 #define RExC_maxlen        (pRExC_state->maxlen)
 256 #define RExC_npar       (pRExC_state->npar)
 257 #define RExC_total_parens       (pRExC_state->total_par)
 258 #define RExC_parens_buf_size    (pRExC_state->parens_buf_size)
 259 #define RExC_nestroot   (pRExC_state->nestroot)
 260 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 261 #define RExC_utf8       (pRExC_state->utf8)
 262 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 263 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 264 #define RExC_open_parens        (pRExC_state->open_parens)
 265 #define RExC_close_parens       (pRExC_state->close_parens)
 266 #define RExC_end_op     (pRExC_state->end_op)
 267 #define RExC_paren_names        (pRExC_state->paren_names)
 268 #define RExC_recurse    (pRExC_state->recurse)
 269 #define RExC_recurse_count      (pRExC_state->recurse_count)
 270 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 271 #define RExC_study_chunk_recursed_bytes  \
 272                                    (pRExC_state->study_chunk_recursed_bytes)
 273 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 274 #define RExC_in_lookahead       (pRExC_state->in_lookahead)
 275 #define RExC_contains_locale    (pRExC_state->contains_locale)
 276 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
 277
 278 #ifdef EBCDIC
 279 #  define SET_recode_x_to_native(x)                                         \
 280                     STMT_START { RExC_recode_x_to_native = (x); } STMT_END
 281 #else
 282 #  define SET_recode_x_to_native(x) NOOP
 283 #endif
 284
 285 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 286 #define RExC_frame_head (pRExC_state->frame_head)
 287 #define RExC_frame_last (pRExC_state->frame_last)
 288 #define RExC_frame_count (pRExC_state->frame_count)
 289 #define RExC_strict (pRExC_state->strict)
 290 #define RExC_study_started      (pRExC_state->study_started)
 291 #define RExC_warn_text (pRExC_state->warn_text)
 292 #define RExC_in_script_run      (pRExC_state->in_script_run)
 293 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
 294 #define RExC_unlexed_names (pRExC_state->unlexed_names)
 295
 296 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
 297  * a flag to disable back-off on the fixed/floating substrings - if it's
 298  * a high complexity pattern we assume the benefit of avoiding a full match
 299  * is worth the cost of checking for the substrings even if they rarely help.
 300  */
 301 #define RExC_naughty    (pRExC_state->naughty)
 302 #define TOO_NAUGHTY (10)
 303 #define MARK_NAUGHTY(add) \
 304     if (RExC_naughty < TOO_NAUGHTY) \
 305         RExC_naughty += (add)
 306 #define MARK_NAUGHTY_EXP(exp, add) \
 307     if (RExC_naughty < TOO_NAUGHTY) \
 308         RExC_naughty += RExC_naughty / (exp) + (add)
 309
 310 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 311 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 312         ((*s) == '{' && regcurly(s)))
 313
 314 /*
 315  * Flags to be passed up and down.
 316  */
 317 #define WORST           0       /* Worst case. */
 318 #define HASWIDTH        0x01    /* Known to not match null strings, could match
 319                                    non-null ones. */
 320
 321 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 322  * character.  (There needs to be a case: in the switch statement in regexec.c
 323  * for any node marked SIMPLE.)  Note that this is not the same thing as
 324  * REGNODE_SIMPLE */
 325 #define SIMPLE          0x02
 326 #define SPSTART         0x04    /* Starts with * or + */
 327 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 328 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 329 #define RESTART_PARSE   0x20    /* Need to redo the parse */
 330 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
 331                                    calcuate sizes as UTF-8 */
 332
 333 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 334
 335 /* whether trie related optimizations are enabled */
 336 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 337 #define TRIE_STUDY_OPT
 338 #define FULL_TRIE_STUDY
 339 #define TRIE_STCLASS
 340 #endif
 341
 342
 343
 344 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 345 #define PBITVAL(paren) (1 << ((paren) & 7))
 346 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 347 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 348 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 349
 350 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
 351                                      if (!UTF) {                           \
 352                                          *flagp = RESTART_PARSE|NEED_UTF8; \
 353                                          return 0;                         \
 354                                      }                                     \
 355                              } STMT_END
 356
 357 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
 358  * a flag that indicates we need to override /d with /u as a result of
 359  * something in the pattern.  It should only be used in regards to calling
 360  * set_regex_charset() or get_regex_charset() */
 361 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
 362     STMT_START {                                                            \
 363             if (DEPENDS_SEMANTICS) {                                        \
 364                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
 365                 RExC_uni_semantics = 1;                                     \
 366                 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
 367                     /* No need to restart the parse if we haven't seen      \
 368                      * anything that differs between /u and /d, and no need \
 369                      * to restart immediately if we're going to reparse     \
 370                      * anyway to count parens */                            \
 371                     *flagp |= RESTART_PARSE;                                \
 372                     return restart_retval;                                  \
 373                 }                                                           \
 374             }                                                               \
 375     } STMT_END
 376
 377 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
 378     STMT_START {                                                            \
 379                 RExC_use_BRANCHJ = 1;                                       \
 380                 *flagp |= RESTART_PARSE;                                    \
 381                 return restart_retval;                                      \
 382     } STMT_END
 383
 384 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
 385  * less.  After that, it must always be positive, because the whole re is
 386  * considered to be surrounded by virtual parens.  Setting it to negative
 387  * indicates there is some construct that needs to know the actual number of
 388  * parens to be properly handled.  And that means an extra pass will be
 389  * required after we've counted them all */
 390 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
 391 #define REQUIRE_PARENS_PASS                                                 \
 392     STMT_START {  /* No-op if have completed a pass */                      \
 393                     if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
 394     } STMT_END
 395 #define IN_PARENS_PASS (RExC_total_parens < 0)
 396
 397
 398 /* This is used to return failure (zero) early from the calling function if
 399  * various flags in 'flags' are set.  Two flags always cause a return:
 400  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
 401  * additional flags that should cause a return; 0 if none.  If the return will
 402  * be done, '*flagp' is first set to be all of the flags that caused the
 403  * return. */
 404 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
 405     STMT_START {                                                            \
 406             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
 407                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
 408                 return 0;                                                   \
 409             }                                                               \
 410     } STMT_END
 411
 412 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
 413
 414 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
 415                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
 416 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
 417                                     if (MUST_RESTART(*(flagp))) return 0
 418
 419 /* This converts the named class defined in regcomp.h to its equivalent class
 420  * number defined in handy.h. */
 421 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 422 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 423
 424 #define _invlist_union_complement_2nd(a, b, output) \
 425                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 426 #define _invlist_intersection_complement_2nd(a, b, output) \
 427                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 428
 429 /* We add a marker if we are deferring expansion of a potential user-defined
 430  * property until it is needed at runtime the first time it is encountered in a
 431  * pattern match.  This marker that shouldn't conflict with any that could be
 432  * in a legal name is appended to its name to indicate this.  There is a string
 433  * and character form */
 434 #define DEFERRED_PROP_EXPANSION_MARKERs  "~"
 435 #define DEFERRED_PROP_EXPANSION_MARKERc  '~'
 436
 437 /* About scan_data_t.
 438
 439   During optimisation we recurse through the regexp program performing
 440   various inplace (keyhole style) optimisations. In addition study_chunk
 441   and scan_commit populate this data structure with information about
 442   what strings MUST appear in the pattern. We look for the longest
 443   string that must appear at a fixed location, and we look for the
 444   longest string that may appear at a floating location. So for instance
 445   in the pattern:
 446
 447     /FOO[xX]A.*B[xX]BAR/
 448
 449   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 450   strings (because they follow a .* construct). study_chunk will identify
 451   both FOO and BAR as being the longest fixed and floating strings respectively.
 452
 453   The strings can be composites, for instance
 454
 455      /(f)(o)(o)/
 456
 457   will result in a composite fixed substring 'foo'.
 458
 459   For each string some basic information is maintained:
 460
 461   - min_offset
 462     This is the position the string must appear at, or not before.
 463     It also implicitly (when combined with minlenp) tells us how many
 464     characters must match before the string we are searching for.
 465     Likewise when combined with minlenp and the length of the string it
 466     tells us how many characters must appear after the string we have
 467     found.
 468
 469   - max_offset
 470     Only used for floating strings. This is the rightmost point that
 471     the string can appear at. If set to SSize_t_MAX it indicates that the
 472     string can occur infinitely far to the right.
 473     For fixed strings, it is equal to min_offset.
 474
 475   - minlenp
 476     A pointer to the minimum number of characters of the pattern that the
 477     string was found inside. This is important as in the case of positive
 478     lookahead or positive lookbehind we can have multiple patterns
 479     involved. Consider
 480
 481     /(?=FOO).*F/
 482
 483     The minimum length of the pattern overall is 3, the minimum length
 484     of the lookahead part is 3, but the minimum length of the part that
 485     will actually match is 1. So 'FOO's minimum length is 3, but the
 486     minimum length for the F is 1. This is important as the minimum length
 487     is used to determine offsets in front of and behind the string being
 488     looked for.  Since strings can be composites this is the length of the
 489     pattern at the time it was committed with a scan_commit. Note that
 490     the length is calculated by study_chunk, so that the minimum lengths
 491     are not known until the full pattern has been compiled, thus the
 492     pointer to the value.
 493
 494   - lookbehind
 495
 496     In the case of lookbehind the string being searched for can be
 497     offset past the start point of the final matching string.
 498     If this value was just blithely removed from the min_offset it would
 499     invalidate some of the calculations for how many chars must match
 500     before or after (as they are derived from min_offset and minlen and
 501     the length of the string being searched for).
 502     When the final pattern is compiled and the data is moved from the
 503     scan_data_t structure into the regexp structure the information
 504     about lookbehind is factored in, with the information that would
 505     have been lost precalculated in the end_shift field for the
 506     associated string.
 507
 508   The fields pos_min and pos_delta are used to store the minimum offset
 509   and the delta to the maximum offset at the current point in the pattern.
 510
 511 */
 512
 513 struct scan_data_substrs {
 514     SV      *str;       /* longest substring found in pattern */
 515     SSize_t min_offset; /* earliest point in string it can appear */
 516     SSize_t max_offset; /* latest point in string it can appear */
 517     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
 518     SSize_t lookbehind; /* is the pos of the string modified by LB */
 519     I32 flags;          /* per substring SF_* and SCF_* flags */
 520 };
 521
 522 typedef struct scan_data_t {
 523     /*I32 len_min;      unused */
 524     /*I32 len_delta;    unused */
 525     SSize_t pos_min;
 526     SSize_t pos_delta;
 527     SV *last_found;
 528     SSize_t last_end;       /* min value, <0 unless valid. */
 529     SSize_t last_start_min;
 530     SSize_t last_start_max;
 531     U8      cur_is_floating; /* whether the last_* values should be set as
 532                               * the next fixed (0) or floating (1)
 533                               * substring */
 534
 535     /* [0] is longest fixed substring so far, [1] is longest float so far */
 536     struct scan_data_substrs  substrs[2];
 537
 538     I32 flags;             /* common SF_* and SCF_* flags */
 539     I32 whilem_c;
 540     SSize_t *last_closep;
 541     regnode_ssc *start_class;
 542 } scan_data_t;
 543
 544 /*
 545  * Forward declarations for pregcomp()'s friends.
 546  */
 547
 548 static const scan_data_t zero_scan_data = {
 549     0, 0, NULL, 0, 0, 0, 0,
 550     {
 551         { NULL, 0, 0, 0, 0, 0 },
 552         { NULL, 0, 0, 0, 0, 0 },
 553     },
 554     0, 0, NULL, NULL
 555 };
 556
 557 /* study flags */
 558
 559 #define SF_BEFORE_SEOL          0x0001
 560 #define SF_BEFORE_MEOL          0x0002
 561 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 562
 563 #define SF_IS_INF               0x0040
 564 #define SF_HAS_PAR              0x0080
 565 #define SF_IN_PAR               0x0100
 566 #define SF_HAS_EVAL             0x0200
 567
 568
 569 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
 570  * longest substring in the pattern. When it is not set the optimiser keeps
 571  * track of position, but does not keep track of the actual strings seen,
 572  *
 573  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
 574  * /foo/i will not.
 575  *
 576  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
 577  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
 578  * turned off because of the alternation (BRANCH). */
 579 #define SCF_DO_SUBSTR           0x0400
 580
 581 #define SCF_DO_STCLASS_AND      0x0800
 582 #define SCF_DO_STCLASS_OR       0x1000
 583 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 584 #define SCF_WHILEM_VISITED_POS  0x2000
 585
 586 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 587 #define SCF_SEEN_ACCEPT         0x8000
 588 #define SCF_TRIE_DOING_RESTUDY 0x10000
 589 #define SCF_IN_DEFINE          0x20000
 590
 591
 592
 593
 594 #define UTF cBOOL(RExC_utf8)
 595
 596 /* The enums for all these are ordered so things work out correctly */
 597 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 598 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 599                                                      == REGEX_DEPENDS_CHARSET)
 600 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 601 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 602                                                      >= REGEX_UNICODE_CHARSET)
 603 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 604                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 605 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 606                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 607 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 608                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 609
 610 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 611
 612 /* For programs that want to be strictly Unicode compatible by dying if any
 613  * attempt is made to match a non-Unicode code point against a Unicode
 614  * property.  */
 615 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 616
 617 #define OOB_NAMEDCLASS          -1
 618
 619 /* There is no code point that is out-of-bounds, so this is problematic.  But
 620  * its only current use is to initialize a variable that is always set before
 621  * looked at. */
 622 #define OOB_UNICODE             0xDEADBEEF
 623
 624 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 625
 626
 627 /* length of regex to show in messages that don't mark a position within */
 628 #define RegexLengthToShowInErrorMessages 127
 629
 630 /*
 631  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 632  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 633  * op/pragma/warn/regcomp.
 634  */
 635 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 636 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 637
 638 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 639                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
 640
 641 /* The code in this file in places uses one level of recursion with parsing
 642  * rebased to an alternate string constructed by us in memory.  This can take
 643  * the form of something that is completely different from the input, or
 644  * something that uses the input as part of the alternate.  In the first case,
 645  * there should be no possibility of an error, as we are in complete control of
 646  * the alternate string.  But in the second case we don't completely control
 647  * the input portion, so there may be errors in that.  Here's an example:
 648  *      /[abc\x{DF}def]/ui
 649  * is handled specially because \x{df} folds to a sequence of more than one
 650  * character: 'ss'.  What is done is to create and parse an alternate string,
 651  * which looks like this:
 652  *      /(?:\x{DF}|[abc\x{DF}def])/ui
 653  * where it uses the input unchanged in the middle of something it constructs,
 654  * which is a branch for the DF outside the character class, and clustering
 655  * parens around the whole thing. (It knows enough to skip the DF inside the
 656  * class while in this substitute parse.) 'abc' and 'def' may have errors that
 657  * need to be reported.  The general situation looks like this:
 658  *
 659  *                                       |<------- identical ------>|
 660  *              sI                       tI               xI       eI
 661  * Input:       ---------------------------------------------------------------
 662  * Constructed:         ---------------------------------------------------
 663  *                      sC               tC               xC       eC     EC
 664  *                                       |<------- identical ------>|
 665  *
 666  * sI..eI   is the portion of the input pattern we are concerned with here.
 667  * sC..EC   is the constructed substitute parse string.
 668  *  sC..tC  is constructed by us
 669  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
 670  *          In the diagram, these are vertically aligned.
 671  *  eC..EC  is also constructed by us.
 672  * xC       is the position in the substitute parse string where we found a
 673  *          problem.
 674  * xI       is the position in the original pattern corresponding to xC.
 675  *
 676  * We want to display a message showing the real input string.  Thus we need to
 677  * translate from xC to xI.  We know that xC >= tC, since the portion of the
 678  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
 679  * get:
 680  *      xI = tI + (xC - tC)
 681  *
 682  * When the substitute parse is constructed, the code needs to set:
 683  *      RExC_start (sC)
 684  *      RExC_end (eC)
 685  *      RExC_copy_start_in_input  (tI)
 686  *      RExC_copy_start_in_constructed (tC)
 687  * and restore them when done.
 688  *
 689  * During normal processing of the input pattern, both
 690  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
 691  * sI, so that xC equals xI.
 692  */
 693
 694 #define sI              RExC_precomp
 695 #define eI              RExC_precomp_end
 696 #define sC              RExC_start
 697 #define eC              RExC_end
 698 #define tI              RExC_copy_start_in_input
 699 #define tC              RExC_copy_start_in_constructed
 700 #define xI(xC)          (tI + (xC - tC))
 701 #define xI_offset(xC)   (xI(xC) - sI)
 702
 703 #define REPORT_LOCATION_ARGS(xC)                                            \
 704     UTF8fARG(UTF,                                                           \
 705              (xI(xC) > eI) /* Don't run off end */                          \
 706               ? eI - sI   /* Length before the <--HERE */                   \
 707               : ((xI_offset(xC) >= 0)                                       \
 708                  ? xI_offset(xC)                                            \
 709                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
 710                                     IVdf " trying to output message for "   \
 711                                     " pattern %.*s",                        \
 712                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
 713                                     ((int) (eC - sC)), sC), 0)),            \
 714              sI),         /* The input pattern printed up to the <--HERE */ \
 715     UTF8fARG(UTF,                                                           \
 716              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
 717              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
 718
 719 /* Used to point after bad bytes for an error message, but avoid skipping
 720  * past a nul byte. */
 721 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
 722
 723 /* Set up to clean up after our imminent demise */
 724 #define PREPARE_TO_DIE                                                      \
 725     STMT_START {                                                            \
 726         if (RExC_rx_sv)                                                     \
 727             SAVEFREESV(RExC_rx_sv);                                         \
 728         if (RExC_open_parens)                                               \
 729             SAVEFREEPV(RExC_open_parens);                                   \
 730         if (RExC_close_parens)                                              \
 731             SAVEFREEPV(RExC_close_parens);                                  \
 732     } STMT_END
 733
 734 /*
 735  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 736  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 737  * "...".
 738  */
 739 #define _FAIL(code) STMT_START {                                        \
 740     const char *ellipses = "";                                          \
 741     IV len = RExC_precomp_end - RExC_precomp;                           \
 742                                                                         \
 743     PREPARE_TO_DIE;                                                     \
 744     if (len > RegexLengthToShowInErrorMessages) {                       \
 745         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 746         len = RegexLengthToShowInErrorMessages - 10;                    \
 747         ellipses = "...";                                               \
 748     }                                                                   \
 749     code;                                                               \
 750 } STMT_END
 751
 752 #define FAIL(msg) _FAIL(                            \
 753     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",         \
 754             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 755
 756 #define FAIL2(msg,arg) _FAIL(                       \
 757     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 758             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 759
 760 #define FAIL3(msg,arg1,arg2) _FAIL(                         \
 761     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
 762      arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 763
 764 /*
 765  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 766  */
 767 #define Simple_vFAIL(m) STMT_START {                                    \
 768     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 769             m, REPORT_LOCATION_ARGS(RExC_parse));                       \
 770 } STMT_END
 771
 772 /*
 773  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 774  */
 775 #define vFAIL(m) STMT_START {                           \
 776     PREPARE_TO_DIE;                                     \
 777     Simple_vFAIL(m);                                    \
 778 } STMT_END
 779
 780 /*
 781  * Like Simple_vFAIL(), but accepts two arguments.
 782  */
 783 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 784     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,              \
 785                       REPORT_LOCATION_ARGS(RExC_parse));        \
 786 } STMT_END
 787
 788 /*
 789  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 790  */
 791 #define vFAIL2(m,a1) STMT_START {                       \
 792     PREPARE_TO_DIE;                                     \
 793     Simple_vFAIL2(m, a1);                               \
 794 } STMT_END
 795
 796
 797 /*
 798  * Like Simple_vFAIL(), but accepts three arguments.
 799  */
 800 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 801     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 802             REPORT_LOCATION_ARGS(RExC_parse));                  \
 803 } STMT_END
 804
 805 /*
 806  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 807  */
 808 #define vFAIL3(m,a1,a2) STMT_START {                    \
 809     PREPARE_TO_DIE;                                     \
 810     Simple_vFAIL3(m, a1, a2);                           \
 811 } STMT_END
 812
 813 /*
 814  * Like Simple_vFAIL(), but accepts four arguments.
 815  */
 816 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 817     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,      \
 818             REPORT_LOCATION_ARGS(RExC_parse));                  \
 819 } STMT_END
 820
 821 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 822     PREPARE_TO_DIE;                                     \
 823     Simple_vFAIL4(m, a1, a2, a3);                       \
 824 } STMT_END
 825
 826 /* A specialized version of vFAIL2 that works with UTF8f */
 827 #define vFAIL2utf8f(m, a1) STMT_START {             \
 828     PREPARE_TO_DIE;                                 \
 829     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
 830             REPORT_LOCATION_ARGS(RExC_parse));      \
 831 } STMT_END
 832
 833 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
 834     PREPARE_TO_DIE;                                     \
 835     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 836             REPORT_LOCATION_ARGS(RExC_parse));          \
 837 } STMT_END
 838
 839 /* Setting this to NULL is a signal to not output warnings */
 840 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
 841     STMT_START {                                                            \
 842       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
 843       RExC_copy_start_in_constructed = NULL;                                \
 844     } STMT_END
 845 #define RESTORE_WARNINGS                                                    \
 846     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
 847
 848 /* Since a warning can be generated multiple times as the input is reparsed, we
 849  * output it the first time we come to that point in the parse, but suppress it
 850  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
 851  * generate any warnings */
 852 #define TO_OUTPUT_WARNINGS(loc)                                         \
 853   (   RExC_copy_start_in_constructed                                    \
 854    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
 855
 856 /* After we've emitted a warning, we save the position in the input so we don't
 857  * output it again */
 858 #define UPDATE_WARNINGS_LOC(loc)                                        \
 859     STMT_START {                                                        \
 860         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 861             RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
 862                                                        - RExC_precomp;  \
 863         }                                                               \
 864     } STMT_END
 865
 866 /* 'warns' is the output of the packWARNx macro used in 'code' */
 867 #define _WARN_HELPER(loc, warns, code)                                  \
 868     STMT_START {                                                        \
 869         if (! RExC_copy_start_in_constructed) {                         \
 870             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
 871                               " expected at '%s'",                      \
 872                               __FILE__, __LINE__, loc);                 \
 873         }                                                               \
 874         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
 875             if (ckDEAD(warns))                                          \
 876                 PREPARE_TO_DIE;                                         \
 877             code;                                                       \
 878             UPDATE_WARNINGS_LOC(loc);                                   \
 879         }                                                               \
 880     } STMT_END
 881
 882 /* m is not necessarily a "literal string", in this macro */
 883 #define reg_warn_non_literal_string(loc, m)                             \
 884     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 885                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 886                                        "%s" REPORT_LOCATION,            \
 887                                   m, REPORT_LOCATION_ARGS(loc)))
 888
 889 #define ckWARNreg(loc,m)                                                \
 890     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 891                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 892                                           m REPORT_LOCATION,            \
 893                                           REPORT_LOCATION_ARGS(loc)))
 894
 895 #define vWARN(loc, m)                                                   \
 896     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 897                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 898                                        m REPORT_LOCATION,               \
 899                                        REPORT_LOCATION_ARGS(loc)))      \
 900
 901 #define vWARN_dep(loc, m)                                               \
 902     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 903                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
 904                                        m REPORT_LOCATION,               \
 905                                        REPORT_LOCATION_ARGS(loc)))
 906
 907 #define ckWARNdep(loc,m)                                                \
 908     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
 909                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
 910                                             m REPORT_LOCATION,          \
 911                                             REPORT_LOCATION_ARGS(loc)))
 912
 913 #define ckWARNregdep(loc,m)                                                 \
 914     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
 915                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
 916                                                       WARN_REGEXP),         \
 917                                              m REPORT_LOCATION,             \
 918                                              REPORT_LOCATION_ARGS(loc)))
 919
 920 #define ckWARN2reg_d(loc,m, a1)                                             \
 921     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 922                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
 923                                             m REPORT_LOCATION,              \
 924                                             a1, REPORT_LOCATION_ARGS(loc)))
 925
 926 #define ckWARN2reg(loc, m, a1)                                              \
 927     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 928                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
 929                                           m REPORT_LOCATION,                \
 930                                           a1, REPORT_LOCATION_ARGS(loc)))
 931
 932 #define vWARN3(loc, m, a1, a2)                                              \
 933     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 934                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
 935                                        m REPORT_LOCATION,                   \
 936                                        a1, a2, REPORT_LOCATION_ARGS(loc)))
 937
 938 #define ckWARN3reg(loc, m, a1, a2)                                          \
 939     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
 940                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
 941                                           m REPORT_LOCATION,                \
 942                                           a1, a2,                           \
 943                                           REPORT_LOCATION_ARGS(loc)))
 944
 945 #define vWARN4(loc, m, a1, a2, a3)                                      \
 946     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 947                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 948                                        m REPORT_LOCATION,               \
 949                                        a1, a2, a3,                      \
 950                                        REPORT_LOCATION_ARGS(loc)))
 951
 952 #define ckWARN4reg(loc, m, a1, a2, a3)                                  \
 953     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 954                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
 955                                           m REPORT_LOCATION,            \
 956                                           a1, a2, a3,                   \
 957                                           REPORT_LOCATION_ARGS(loc)))
 958
 959 #define vWARN5(loc, m, a1, a2, a3, a4)                                  \
 960     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 961                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
 962                                        m REPORT_LOCATION,               \
 963                                        a1, a2, a3, a4,                  \
 964                                        REPORT_LOCATION_ARGS(loc)))
 965
 966 #define ckWARNexperimental(loc, class, m)                               \
 967     _WARN_HELPER(loc, packWARN(class),                                  \
 968                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
 969                                             m REPORT_LOCATION,          \
 970                                             REPORT_LOCATION_ARGS(loc)))
 971
 972 /* Convert between a pointer to a node and its offset from the beginning of the
 973  * program */
 974 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
 975 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
 976
 977 /* Macros for recording node offsets.   20001227 mjd@plover.com
 978  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 979  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 980  * Element 0 holds the number n.
 981  * Position is 1 indexed.
 982  */
 983 #ifndef RE_TRACK_PATTERN_OFFSETS
 984 #define Set_Node_Offset_To_R(offset,byte)
 985 #define Set_Node_Offset(node,byte)
 986 #define Set_Cur_Node_Offset
 987 #define Set_Node_Length_To_R(node,len)
 988 #define Set_Node_Length(node,len)
 989 #define Set_Node_Cur_Length(node,start)
 990 #define Node_Offset(n)
 991 #define Node_Length(n)
 992 #define Set_Node_Offset_Length(node,offset,len)
 993 #define ProgLen(ri) ri->u.proglen
 994 #define SetProgLen(ri,x) ri->u.proglen = x
 995 #define Track_Code(code)
 996 #else
 997 #define ProgLen(ri) ri->u.offsets[0]
 998 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 999 #define Set_Node_Offset_To_R(offset,byte) STMT_START {                  \
1000         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
1001                     __LINE__, (int)(offset), (int)(byte)));             \
1002         if((offset) < 0) {                                              \
1003             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
1004                                          (int)(offset));                \
1005         } else {                                                        \
1006             RExC_offsets[2*(offset)-1] = (byte);                        \
1007         }                                                               \
1008 } STMT_END
1009
1010 #define Set_Node_Offset(node,byte)                                      \
1011     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1012 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1013
1014 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
1015         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
1016                 __LINE__, (int)(node), (int)(len)));                    \
1017         if((node) < 0) {                                                \
1018             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
1019                                          (int)(node));                  \
1020         } else {                                                        \
1021             RExC_offsets[2*(node)] = (len);                             \
1022         }                                                               \
1023 } STMT_END
1024
1025 #define Set_Node_Length(node,len) \
1026     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1027 #define Set_Node_Cur_Length(node, start)                \
1028     Set_Node_Length(node, RExC_parse - start)
1029
1030 /* Get offsets and lengths */
1031 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1032 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1033
1034 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
1035     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));       \
1036     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));  \
1037 } STMT_END
1038
1039 #define Track_Code(code) STMT_START { code } STMT_END
1040 #endif
1041
1042 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1043 #define EXPERIMENTAL_INPLACESCAN
1044 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1045
1046 #ifdef DEBUGGING
1047 int
1048 Perl_re_printf(pTHX_ const char *fmt, ...)
1049 {
1050     va_list ap;
1051     int result;
1052     PerlIO *f= Perl_debug_log;
1053     PERL_ARGS_ASSERT_RE_PRINTF;
1054     va_start(ap, fmt);
1055     result = PerlIO_vprintf(f, fmt, ap);
1056     va_end(ap);
1057     return result;
1058 }
1059
1060 int
1061 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1062 {
1063     va_list ap;
1064     int result;
1065     PerlIO *f= Perl_debug_log;
1066     PERL_ARGS_ASSERT_RE_INDENTF;
1067     va_start(ap, depth);
1068     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1069     result = PerlIO_vprintf(f, fmt, ap);
1070     va_end(ap);
1071     return result;
1072 }
1073 #endif /* DEBUGGING */
1074
1075 #define DEBUG_RExC_seen()                                                   \
1076         DEBUG_OPTIMISE_MORE_r({                                             \
1077             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
1078                                                                             \
1079             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
1080                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
1081                                                                             \
1082             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
1083                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
1084                                                                             \
1085             if (RExC_seen & REG_GPOS_SEEN)                                  \
1086                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
1087                                                                             \
1088             if (RExC_seen & REG_RECURSE_SEEN)                               \
1089                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
1090                                                                             \
1091             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
1092                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
1093                                                                             \
1094             if (RExC_seen & REG_VERBARG_SEEN)                               \
1095                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
1096                                                                             \
1097             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
1098                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
1099                                                                             \
1100             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
1101                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
1102                                                                             \
1103             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
1104                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
1105                                                                             \
1106             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
1107                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
1108                                                                             \
1109             Perl_re_printf( aTHX_ "\n");                                    \
1110         });
1111
1112 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1113   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
1114
1115
1116 #ifdef DEBUGGING
1117 static void
1118 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1119                                     const char *close_str)
1120 {
1121     if (!flags)
1122         return;
1123
1124     Perl_re_printf( aTHX_  "%s", open_str);
1125     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1126     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1127     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1128     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1129     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1130     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1131     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1132     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1133     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1134     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1135     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1136     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1137     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1138     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1139     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1140     Perl_re_printf( aTHX_  "%s", close_str);
1141 }
1142
1143
1144 static void
1145 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1146                     U32 depth, int is_inf)
1147 {
1148     GET_RE_DEBUG_FLAGS_DECL;
1149
1150     DEBUG_OPTIMISE_MORE_r({
1151         if (!data)
1152             return;
1153         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1154             depth,
1155             where,
1156             (IV)data->pos_min,
1157             (IV)data->pos_delta,
1158             (UV)data->flags
1159         );
1160
1161         S_debug_show_study_flags(aTHX_ data->flags," [","]");
1162
1163         Perl_re_printf( aTHX_
1164             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1165             (IV)data->whilem_c,
1166             (IV)(data->last_closep ? *((data)->last_closep) : -1),
1167             is_inf ? "INF " : ""
1168         );
1169
1170         if (data->last_found) {
1171             int i;
1172             Perl_re_printf(aTHX_
1173                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1174                     SvPVX_const(data->last_found),
1175                     (IV)data->last_end,
1176                     (IV)data->last_start_min,
1177                     (IV)data->last_start_max
1178             );
1179
1180             for (i = 0; i < 2; i++) {
1181                 Perl_re_printf(aTHX_
1182                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
1183                     data->cur_is_floating == i ? "*" : "",
1184                     i ? "Float" : "Fixed",
1185                     SvPVX_const(data->substrs[i].str),
1186                     (IV)data->substrs[i].min_offset,
1187                     (IV)data->substrs[i].max_offset
1188                 );
1189                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1190             }
1191         }
1192
1193         Perl_re_printf( aTHX_ "\n");
1194     });
1195 }
1196
1197
1198 static void
1199 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1200                 regnode *scan, U32 depth, U32 flags)
1201 {
1202     GET_RE_DEBUG_FLAGS_DECL;
1203
1204     DEBUG_OPTIMISE_r({
1205         regnode *Next;
1206
1207         if (!scan)
1208             return;
1209         Next = regnext(scan);
1210         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1211         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
1212             depth,
1213             str,
1214             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1215             Next ? (REG_NODE_NUM(Next)) : 0 );
1216         S_debug_show_study_flags(aTHX_ flags," [ ","]");
1217         Perl_re_printf( aTHX_  "\n");
1218    });
1219 }
1220
1221
1222 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1223                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
1224
1225 #  define DEBUG_PEEP(str, scan, depth, flags)   \
1226                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1227
1228 #else
1229 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1230 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
1231 #endif
1232
1233
1234 /* =========================================================
1235  * BEGIN edit_distance stuff.
1236  *
1237  * This calculates how many single character changes of any type are needed to
1238  * transform a string into another one.  It is taken from version 3.1 of
1239  *
1240  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1241  */
1242
1243 /* Our unsorted dictionary linked list.   */
1244 /* Note we use UVs, not chars. */
1245
1246 struct dictionary{
1247   UV key;
1248   UV value;
1249   struct dictionary* next;
1250 };
1251 typedef struct dictionary item;
1252
1253
1254 PERL_STATIC_INLINE item*
1255 push(UV key, item* curr)
1256 {
1257     item* head;
1258     Newx(head, 1, item);
1259     head->key = key;
1260     head->value = 0;
1261     head->next = curr;
1262     return head;
1263 }
1264
1265
1266 PERL_STATIC_INLINE item*
1267 find(item* head, UV key)
1268 {
1269     item* iterator = head;
1270     while (iterator){
1271         if (iterator->key == key){
1272             return iterator;
1273         }
1274         iterator = iterator->next;
1275     }
1276
1277     return NULL;
1278 }
1279
1280 PERL_STATIC_INLINE item*
1281 uniquePush(item* head, UV key)
1282 {
1283     item* iterator = head;
1284
1285     while (iterator){
1286         if (iterator->key == key) {
1287             return head;
1288         }
1289         iterator = iterator->next;
1290     }
1291
1292     return push(key, head);
1293 }
1294
1295 PERL_STATIC_INLINE void
1296 dict_free(item* head)
1297 {
1298     item* iterator = head;
1299
1300     while (iterator) {
1301         item* temp = iterator;
1302         iterator = iterator->next;
1303         Safefree(temp);
1304     }
1305
1306     head = NULL;
1307 }
1308
1309 /* End of Dictionary Stuff */
1310
1311 /* All calculations/work are done here */
1312 STATIC int
1313 S_edit_distance(const UV* src,
1314                 const UV* tgt,
1315                 const STRLEN x,             /* length of src[] */
1316                 const STRLEN y,             /* length of tgt[] */
1317                 const SSize_t maxDistance
1318 )
1319 {
1320     item *head = NULL;
1321     UV swapCount, swapScore, targetCharCount, i, j;
1322     UV *scores;
1323     UV score_ceil = x + y;
1324
1325     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1326
1327     /* intialize matrix start values */
1328     Newx(scores, ( (x + 2) * (y + 2)), UV);
1329     scores[0] = score_ceil;
1330     scores[1 * (y + 2) + 0] = score_ceil;
1331     scores[0 * (y + 2) + 1] = score_ceil;
1332     scores[1 * (y + 2) + 1] = 0;
1333     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1334
1335     /* work loops    */
1336     /* i = src index */
1337     /* j = tgt index */
1338     for (i=1;i<=x;i++) {
1339         if (i < x)
1340             head = uniquePush(head, src[i]);
1341         scores[(i+1) * (y + 2) + 1] = i;
1342         scores[(i+1) * (y + 2) + 0] = score_ceil;
1343         swapCount = 0;
1344
1345         for (j=1;j<=y;j++) {
1346             if (i == 1) {
1347                 if(j < y)
1348                 head = uniquePush(head, tgt[j]);
1349                 scores[1 * (y + 2) + (j + 1)] = j;
1350                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1351             }
1352
1353             targetCharCount = find(head, tgt[j-1])->value;
1354             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1355
1356             if (src[i-1] != tgt[j-1]){
1357                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1358             }
1359             else {
1360                 swapCount = j;
1361                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1362             }
1363         }
1364
1365         find(head, src[i-1])->value = i;
1366     }
1367
1368     {
1369         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1370         dict_free(head);
1371         Safefree(scores);
1372         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1373     }
1374 }
1375
1376 /* END of edit_distance() stuff
1377  * ========================================================= */
1378
1379 /* is c a control character for which we have a mnemonic? */
1380 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1381
1382 STATIC const char *
1383 S_cntrl_to_mnemonic(const U8 c)
1384 {
1385     /* Returns the mnemonic string that represents character 'c', if one
1386      * exists; NULL otherwise.  The only ones that exist for the purposes of
1387      * this routine are a few control characters */
1388
1389     switch (c) {
1390         case '\a':       return "\\a";
1391         case '\b':       return "\\b";
1392         case ESC_NATIVE: return "\\e";
1393         case '\f':       return "\\f";
1394         case '\n':       return "\\n";
1395         case '\r':       return "\\r";
1396         case '\t':       return "\\t";
1397     }
1398
1399     return NULL;
1400 }
1401
1402 /* Mark that we cannot extend a found fixed substring at this point.
1403    Update the longest found anchored substring or the longest found
1404    floating substrings if needed. */
1405
1406 STATIC void
1407 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1408                     SSize_t *minlenp, int is_inf)
1409 {
1410     const STRLEN l = CHR_SVLEN(data->last_found);
1411     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1412     const STRLEN old_l = CHR_SVLEN(longest_sv);
1413     GET_RE_DEBUG_FLAGS_DECL;
1414
1415     PERL_ARGS_ASSERT_SCAN_COMMIT;
1416
1417     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1418         const U8 i = data->cur_is_floating;
1419         SvSetMagicSV(longest_sv, data->last_found);
1420         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1421
1422         if (!i) /* fixed */
1423             data->substrs[0].max_offset = data->substrs[0].min_offset;
1424         else { /* float */
1425             data->substrs[1].max_offset = (l
1426                           ? data->last_start_max
1427                           : (data->pos_delta > SSize_t_MAX - data->pos_min
1428                                          ? SSize_t_MAX
1429                                          : data->pos_min + data->pos_delta));
1430             if (is_inf
1431                  || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1432                 data->substrs[1].max_offset = SSize_t_MAX;
1433         }
1434
1435         if (data->flags & SF_BEFORE_EOL)
1436             data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1437         else
1438             data->substrs[i].flags &= ~SF_BEFORE_EOL;
1439         data->substrs[i].minlenp = minlenp;
1440         data->substrs[i].lookbehind = 0;
1441     }
1442
1443     SvCUR_set(data->last_found, 0);
1444     {
1445         SV * const sv = data->last_found;
1446         if (SvUTF8(sv) && SvMAGICAL(sv)) {
1447             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1448             if (mg)
1449                 mg->mg_len = 0;
1450         }
1451     }
1452     data->last_end = -1;
1453     data->flags &= ~SF_BEFORE_EOL;
1454     DEBUG_STUDYDATA("commit", data, 0, is_inf);
1455 }
1456
1457 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1458  * list that describes which code points it matches */
1459
1460 STATIC void
1461 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1462 {
1463     /* Set the SSC 'ssc' to match an empty string or any code point */
1464
1465     PERL_ARGS_ASSERT_SSC_ANYTHING;
1466
1467     assert(is_ANYOF_SYNTHETIC(ssc));
1468
1469     /* mortalize so won't leak */
1470     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1471     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1472 }
1473
1474 STATIC int
1475 S_ssc_is_anything(const regnode_ssc *ssc)
1476 {
1477     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1478      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1479      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1480      * in any way, so there's no point in using it */
1481
1482     UV start, end;
1483     bool ret;
1484
1485     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1486
1487     assert(is_ANYOF_SYNTHETIC(ssc));
1488
1489     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1490         return FALSE;
1491     }
1492
1493     /* See if the list consists solely of the range 0 - Infinity */
1494     invlist_iterinit(ssc->invlist);
1495     ret = invlist_iternext(ssc->invlist, &start, &end)
1496           && start == 0
1497           && end == UV_MAX;
1498
1499     invlist_iterfinish(ssc->invlist);
1500
1501     if (ret) {
1502         return TRUE;
1503     }
1504
1505     /* If e.g., both \w and \W are set, matches everything */
1506     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1507         int i;
1508         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1509             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1510                 return TRUE;
1511             }
1512         }
1513     }
1514
1515     return FALSE;
1516 }
1517
1518 STATIC void
1519 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1520 {
1521     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1522      * string, any code point, or any posix class under locale */
1523
1524     PERL_ARGS_ASSERT_SSC_INIT;
1525
1526     Zero(ssc, 1, regnode_ssc);
1527     set_ANYOF_SYNTHETIC(ssc);
1528     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1529     ssc_anything(ssc);
1530
1531     /* If any portion of the regex is to operate under locale rules that aren't
1532      * fully known at compile time, initialization includes it.  The reason
1533      * this isn't done for all regexes is that the optimizer was written under
1534      * the assumption that locale was all-or-nothing.  Given the complexity and
1535      * lack of documentation in the optimizer, and that there are inadequate
1536      * test cases for locale, many parts of it may not work properly, it is
1537      * safest to avoid locale unless necessary. */
1538     if (RExC_contains_locale) {
1539         ANYOF_POSIXL_SETALL(ssc);
1540     }
1541     else {
1542         ANYOF_POSIXL_ZERO(ssc);
1543     }
1544 }
1545
1546 STATIC int
1547 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1548                         const regnode_ssc *ssc)
1549 {
1550     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1551      * to the list of code points matched, and locale posix classes; hence does
1552      * not check its flags) */
1553
1554     UV start, end;
1555     bool ret;
1556
1557     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1558
1559     assert(is_ANYOF_SYNTHETIC(ssc));
1560
1561     invlist_iterinit(ssc->invlist);
1562     ret = invlist_iternext(ssc->invlist, &start, &end)
1563           && start == 0
1564           && end == UV_MAX;
1565
1566     invlist_iterfinish(ssc->invlist);
1567
1568     if (! ret) {
1569         return FALSE;
1570     }
1571
1572     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1573         return FALSE;
1574     }
1575
1576     return TRUE;
1577 }
1578
1579 #define INVLIST_INDEX 0
1580 #define ONLY_LOCALE_MATCHES_INDEX 1
1581 #define DEFERRED_USER_DEFINED_INDEX 2
1582
1583 STATIC SV*
1584 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1585                                const regnode_charclass* const node)
1586 {
1587     /* Returns a mortal inversion list defining which code points are matched
1588      * by 'node', which is of type ANYOF.  Handles complementing the result if
1589      * appropriate.  If some code points aren't knowable at this time, the
1590      * returned list must, and will, contain every code point that is a
1591      * possibility. */
1592
1593     dVAR;
1594     SV* invlist = NULL;
1595     SV* only_utf8_locale_invlist = NULL;
1596     unsigned int i;
1597     const U32 n = ARG(node);
1598     bool new_node_has_latin1 = FALSE;
1599     const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1600                       ? 0
1601                       : ANYOF_FLAGS(node);
1602
1603     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1604
1605     /* Look at the data structure created by S_set_ANYOF_arg() */
1606     if (n != ANYOF_ONLY_HAS_BITMAP) {
1607         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1608         AV * const av = MUTABLE_AV(SvRV(rv));
1609         SV **const ary = AvARRAY(av);
1610         assert(RExC_rxi->data->what[n] == 's');
1611
1612         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1613
1614             /* Here there are things that won't be known until runtime -- we
1615              * have to assume it could be anything */
1616             invlist = sv_2mortal(_new_invlist(1));
1617             return _add_range_to_invlist(invlist, 0, UV_MAX);
1618         }
1619         else if (ary[INVLIST_INDEX]) {
1620
1621             /* Use the node's inversion list */
1622             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1623         }
1624
1625         /* Get the code points valid only under UTF-8 locales */
1626         if (   (flags & ANYOFL_FOLD)
1627             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1628         {
1629             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1630         }
1631     }
1632
1633     if (! invlist) {
1634         invlist = sv_2mortal(_new_invlist(0));
1635     }
1636
1637     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1638      * code points, and an inversion list for the others, but if there are code
1639      * points that should match only conditionally on the target string being
1640      * UTF-8, those are placed in the inversion list, and not the bitmap.
1641      * Since there are circumstances under which they could match, they are
1642      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1643      * to exclude them here, so that when we invert below, the end result
1644      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1645      * have to do this here before we add the unconditionally matched code
1646      * points */
1647     if (flags & ANYOF_INVERT) {
1648         _invlist_intersection_complement_2nd(invlist,
1649                                              PL_UpperLatin1,
1650                                              &invlist);
1651     }
1652
1653     /* Add in the points from the bit map */
1654     if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1655         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1656             if (ANYOF_BITMAP_TEST(node, i)) {
1657                 unsigned int start = i++;
1658
1659                 for (;    i < NUM_ANYOF_CODE_POINTS
1660                        && ANYOF_BITMAP_TEST(node, i); ++i)
1661                 {
1662                     /* empty */
1663                 }
1664                 invlist = _add_range_to_invlist(invlist, start, i-1);
1665                 new_node_has_latin1 = TRUE;
1666             }
1667         }
1668     }
1669
1670     /* If this can match all upper Latin1 code points, have to add them
1671      * as well.  But don't add them if inverting, as when that gets done below,
1672      * it would exclude all these characters, including the ones it shouldn't
1673      * that were added just above */
1674     if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1675         && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1676     {
1677         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1678     }
1679
1680     /* Similarly for these */
1681     if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1682         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1683     }
1684
1685     if (flags & ANYOF_INVERT) {
1686         _invlist_invert(invlist);
1687     }
1688     else if (flags & ANYOFL_FOLD) {
1689         if (new_node_has_latin1) {
1690
1691             /* Under /li, any 0-255 could fold to any other 0-255, depending on
1692              * the locale.  We can skip this if there are no 0-255 at all. */
1693             _invlist_union(invlist, PL_Latin1, &invlist);
1694
1695             invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1696             invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1697         }
1698         else {
1699             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1700                 invlist = add_cp_to_invlist(invlist, 'I');
1701             }
1702             if (_invlist_contains_cp(invlist,
1703                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1704             {
1705                 invlist = add_cp_to_invlist(invlist, 'i');
1706             }
1707         }
1708     }
1709
1710     /* Similarly add the UTF-8 locale possible matches.  These have to be
1711      * deferred until after the non-UTF-8 locale ones are taken care of just
1712      * above, or it leads to wrong results under ANYOF_INVERT */
1713     if (only_utf8_locale_invlist) {
1714         _invlist_union_maybe_complement_2nd(invlist,
1715                                             only_utf8_locale_invlist,
1716                                             flags & ANYOF_INVERT,
1717                                             &invlist);
1718     }
1719
1720     return invlist;
1721 }
1722
1723 /* These two functions currently do the exact same thing */
1724 #define ssc_init_zero           ssc_init
1725
1726 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1727 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1728
1729 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1730  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1731  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1732
1733 STATIC void
1734 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1735                 const regnode_charclass *and_with)
1736 {
1737     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1738      * another SSC or a regular ANYOF class.  Can create false positives. */
1739
1740     SV* anded_cp_list;
1741     U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1742                           ? 0
1743                           : ANYOF_FLAGS(and_with);
1744     U8  anded_flags;
1745
1746     PERL_ARGS_ASSERT_SSC_AND;
1747
1748     assert(is_ANYOF_SYNTHETIC(ssc));
1749
1750     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1751      * the code point inversion list and just the relevant flags */
1752     if (is_ANYOF_SYNTHETIC(and_with)) {
1753         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1754         anded_flags = and_with_flags;
1755
1756         /* XXX This is a kludge around what appears to be deficiencies in the
1757          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1758          * there are paths through the optimizer where it doesn't get weeded
1759          * out when it should.  And if we don't make some extra provision for
1760          * it like the code just below, it doesn't get added when it should.
1761          * This solution is to add it only when AND'ing, which is here, and
1762          * only when what is being AND'ed is the pristine, original node
1763          * matching anything.  Thus it is like adding it to ssc_anything() but
1764          * only when the result is to be AND'ed.  Probably the same solution
1765          * could be adopted for the same problem we have with /l matching,
1766          * which is solved differently in S_ssc_init(), and that would lead to
1767          * fewer false positives than that solution has.  But if this solution
1768          * creates bugs, the consequences are only that a warning isn't raised
1769          * that should be; while the consequences for having /l bugs is
1770          * incorrect matches */
1771         if (ssc_is_anything((regnode_ssc *)and_with)) {
1772             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1773         }
1774     }
1775     else {
1776         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1777         if (OP(and_with) == ANYOFD) {
1778             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1779         }
1780         else {
1781             anded_flags = and_with_flags
1782             &( ANYOF_COMMON_FLAGS
1783               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1784               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1785             if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1786                 anded_flags &=
1787                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1788             }
1789         }
1790     }
1791
1792     ANYOF_FLAGS(ssc) &= anded_flags;
1793
1794     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1795      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1796      * 'and_with' may be inverted.  When not inverted, we have the situation of
1797      * computing:
1798      *  (C1 | P1) & (C2 | P2)
1799      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1800      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1801      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1802      *                    <=  ((C1 & C2) | P1 | P2)
1803      * Alternatively, the last few steps could be:
1804      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1805      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1806      *                    <=  (C1 | C2 | (P1 & P2))
1807      * We favor the second approach if either P1 or P2 is non-empty.  This is
1808      * because these components are a barrier to doing optimizations, as what
1809      * they match cannot be known until the moment of matching as they are
1810      * dependent on the current locale, 'AND"ing them likely will reduce or
1811      * eliminate them.
1812      * But we can do better if we know that C1,P1 are in their initial state (a
1813      * frequent occurrence), each matching everything:
1814      *  (<everything>) & (C2 | P2) =  C2 | P2
1815      * Similarly, if C2,P2 are in their initial state (again a frequent
1816      * occurrence), the result is a no-op
1817      *  (C1 | P1) & (<everything>) =  C1 | P1
1818      *
1819      * Inverted, we have
1820      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1821      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1822      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1823      * */
1824
1825     if ((and_with_flags & ANYOF_INVERT)
1826         && ! is_ANYOF_SYNTHETIC(and_with))
1827     {
1828         unsigned int i;
1829
1830         ssc_intersection(ssc,
1831                          anded_cp_list,
1832                          FALSE /* Has already been inverted */
1833                          );
1834
1835         /* If either P1 or P2 is empty, the intersection will be also; can skip
1836          * the loop */
1837         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1838             ANYOF_POSIXL_ZERO(ssc);
1839         }
1840         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1841
1842             /* Note that the Posix class component P from 'and_with' actually
1843              * looks like:
1844              *      P = Pa | Pb | ... | Pn
1845              * where each component is one posix class, such as in [\w\s].
1846              * Thus
1847              *      ~P = ~(Pa | Pb | ... | Pn)
1848              *         = ~Pa & ~Pb & ... & ~Pn
1849              *        <= ~Pa | ~Pb | ... | ~Pn
1850              * The last is something we can easily calculate, but unfortunately
1851              * is likely to have many false positives.  We could do better
1852              * in some (but certainly not all) instances if two classes in
1853              * P have known relationships.  For example
1854              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1855              * So
1856              *      :lower: & :print: = :lower:
1857              * And similarly for classes that must be disjoint.  For example,
1858              * since \s and \w can have no elements in common based on rules in
1859              * the POSIX standard,
1860              *      \w & ^\S = nothing
1861              * Unfortunately, some vendor locales do not meet the Posix
1862              * standard, in particular almost everything by Microsoft.
1863              * The loop below just changes e.g., \w into \W and vice versa */
1864
1865             regnode_charclass_posixl temp;
1866             int add = 1;    /* To calculate the index of the complement */
1867
1868             Zero(&temp, 1, regnode_charclass_posixl);
1869             ANYOF_POSIXL_ZERO(&temp);
1870             for (i = 0; i < ANYOF_MAX; i++) {
1871                 assert(i % 2 != 0
1872                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1873                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1874
1875                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1876                     ANYOF_POSIXL_SET(&temp, i + add);
1877                 }
1878                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1879             }
1880             ANYOF_POSIXL_AND(&temp, ssc);
1881
1882         } /* else ssc already has no posixes */
1883     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1884          in its initial state */
1885     else if (! is_ANYOF_SYNTHETIC(and_with)
1886              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1887     {
1888         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1889          * copy it over 'ssc' */
1890         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1891             if (is_ANYOF_SYNTHETIC(and_with)) {
1892                 StructCopy(and_with, ssc, regnode_ssc);
1893             }
1894             else {
1895                 ssc->invlist = anded_cp_list;
1896                 ANYOF_POSIXL_ZERO(ssc);
1897                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1898                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1899                 }
1900             }
1901         }
1902         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1903                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
1904         {
1905             /* One or the other of P1, P2 is non-empty. */
1906             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1907                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1908             }
1909             ssc_union(ssc, anded_cp_list, FALSE);
1910         }
1911         else { /* P1 = P2 = empty */
1912             ssc_intersection(ssc, anded_cp_list, FALSE);
1913         }
1914     }
1915 }
1916
1917 STATIC void
1918 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1919                const regnode_charclass *or_with)
1920 {
1921     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1922      * another SSC or a regular ANYOF class.  Can create false positives if
1923      * 'or_with' is to be inverted. */
1924
1925     SV* ored_cp_list;
1926     U8 ored_flags;
1927     U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
1928                          ? 0
1929                          : ANYOF_FLAGS(or_with);
1930
1931     PERL_ARGS_ASSERT_SSC_OR;
1932
1933     assert(is_ANYOF_SYNTHETIC(ssc));
1934
1935     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1936      * the code point inversion list and just the relevant flags */
1937     if (is_ANYOF_SYNTHETIC(or_with)) {
1938         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1939         ored_flags = or_with_flags;
1940     }
1941     else {
1942         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1943         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
1944         if (OP(or_with) != ANYOFD) {
1945             ored_flags
1946             |= or_with_flags
1947              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1948                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1949             if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
1950                 ored_flags |=
1951                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1952             }
1953         }
1954     }
1955
1956     ANYOF_FLAGS(ssc) |= ored_flags;
1957
1958     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1959      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1960      * 'or_with' may be inverted.  When not inverted, we have the simple
1961      * situation of computing:
1962      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1963      * If P1|P2 yields a situation with both a class and its complement are
1964      * set, like having both \w and \W, this matches all code points, and we
1965      * can delete these from the P component of the ssc going forward.  XXX We
1966      * might be able to delete all the P components, but I (khw) am not certain
1967      * about this, and it is better to be safe.
1968      *
1969      * Inverted, we have
1970      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1971      *                         <=  (C1 | P1) | ~C2
1972      *                         <=  (C1 | ~C2) | P1
1973      * (which results in actually simpler code than the non-inverted case)
1974      * */
1975
1976     if ((or_with_flags & ANYOF_INVERT)
1977         && ! is_ANYOF_SYNTHETIC(or_with))
1978     {
1979         /* We ignore P2, leaving P1 going forward */
1980     }   /* else  Not inverted */
1981     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
1982         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1983         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1984             unsigned int i;
1985             for (i = 0; i < ANYOF_MAX; i += 2) {
1986                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1987                 {
1988                     ssc_match_all_cp(ssc);
1989                     ANYOF_POSIXL_CLEAR(ssc, i);
1990                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1991                 }
1992             }
1993         }
1994     }
1995
1996     ssc_union(ssc,
1997               ored_cp_list,
1998               FALSE /* Already has been inverted */
1999               );
2000 }
2001
2002 PERL_STATIC_INLINE void
2003 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2004 {
2005     PERL_ARGS_ASSERT_SSC_UNION;
2006
2007     assert(is_ANYOF_SYNTHETIC(ssc));
2008
2009     _invlist_union_maybe_complement_2nd(ssc->invlist,
2010                                         invlist,
2011                                         invert2nd,
2012                                         &ssc->invlist);
2013 }
2014
2015 PERL_STATIC_INLINE void
2016 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2017                          SV* const invlist,
2018                          const bool invert2nd)
2019 {
2020     PERL_ARGS_ASSERT_SSC_INTERSECTION;
2021
2022     assert(is_ANYOF_SYNTHETIC(ssc));
2023
2024     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2025                                                invlist,
2026                                                invert2nd,
2027                                                &ssc->invlist);
2028 }
2029
2030 PERL_STATIC_INLINE void
2031 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2032 {
2033     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2034
2035     assert(is_ANYOF_SYNTHETIC(ssc));
2036
2037     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2038 }
2039
2040 PERL_STATIC_INLINE void
2041 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2042 {
2043     /* AND just the single code point 'cp' into the SSC 'ssc' */
2044
2045     SV* cp_list = _new_invlist(2);
2046
2047     PERL_ARGS_ASSERT_SSC_CP_AND;
2048
2049     assert(is_ANYOF_SYNTHETIC(ssc));
2050
2051     cp_list = add_cp_to_invlist(cp_list, cp);
2052     ssc_intersection(ssc, cp_list,
2053                      FALSE /* Not inverted */
2054                      );
2055     SvREFCNT_dec_NN(cp_list);
2056 }
2057
2058 PERL_STATIC_INLINE void
2059 S_ssc_clear_locale(regnode_ssc *ssc)
2060 {
2061     /* Set the SSC 'ssc' to not match any locale things */
2062     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2063
2064     assert(is_ANYOF_SYNTHETIC(ssc));
2065
2066     ANYOF_POSIXL_ZERO(ssc);
2067     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2068 }
2069
2070 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2071
2072 STATIC bool
2073 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2074 {
2075     /* The synthetic start class is used to hopefully quickly winnow down
2076      * places where a pattern could start a match in the target string.  If it
2077      * doesn't really narrow things down that much, there isn't much point to
2078      * having the overhead of using it.  This function uses some very crude
2079      * heuristics to decide if to use the ssc or not.
2080      *
2081      * It returns TRUE if 'ssc' rules out more than half what it considers to
2082      * be the "likely" possible matches, but of course it doesn't know what the
2083      * actual things being matched are going to be; these are only guesses
2084      *
2085      * For /l matches, it assumes that the only likely matches are going to be
2086      *      in the 0-255 range, uniformly distributed, so half of that is 127
2087      * For /a and /d matches, it assumes that the likely matches will be just
2088      *      the ASCII range, so half of that is 63
2089      * For /u and there isn't anything matching above the Latin1 range, it
2090      *      assumes that that is the only range likely to be matched, and uses
2091      *      half that as the cut-off: 127.  If anything matches above Latin1,
2092      *      it assumes that all of Unicode could match (uniformly), except for
2093      *      non-Unicode code points and things in the General Category "Other"
2094      *      (unassigned, private use, surrogates, controls and formats).  This
2095      *      is a much large number. */
2096
2097     U32 count = 0;      /* Running total of number of code points matched by
2098                            'ssc' */
2099     UV start, end;      /* Start and end points of current range in inversion
2100                            XXX outdated.  UTF-8 locales are common, what about invert? list */
2101     const U32 max_code_points = (LOC)
2102                                 ?  256
2103                                 : ((  ! UNI_SEMANTICS
2104                                     ||  invlist_highest(ssc->invlist) < 256)
2105                                   ? 128
2106                                   : NON_OTHER_COUNT);
2107     const U32 max_match = max_code_points / 2;
2108
2109     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2110
2111     invlist_iterinit(ssc->invlist);
2112     while (invlist_iternext(ssc->invlist, &start, &end)) {
2113         if (start >= max_code_points) {
2114             break;
2115         }
2116         end = MIN(end, max_code_points - 1);
2117         count += end - start + 1;
2118         if (count >= max_match) {
2119             invlist_iterfinish(ssc->invlist);
2120             return FALSE;
2121         }
2122     }
2123
2124     return TRUE;
2125 }
2126
2127
2128 STATIC void
2129 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2130 {
2131     /* The inversion list in the SSC is marked mortal; now we need a more
2132      * permanent copy, which is stored the same way that is done in a regular
2133      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2134      * map */
2135
2136     SV* invlist = invlist_clone(ssc->invlist, NULL);
2137
2138     PERL_ARGS_ASSERT_SSC_FINALIZE;
2139
2140     assert(is_ANYOF_SYNTHETIC(ssc));
2141
2142     /* The code in this file assumes that all but these flags aren't relevant
2143      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2144      * by the time we reach here */
2145     assert(! (ANYOF_FLAGS(ssc)
2146         & ~( ANYOF_COMMON_FLAGS
2147             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2148             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2149
2150     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2151
2152     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2153     SvREFCNT_dec(invlist);
2154
2155     /* Make sure is clone-safe */
2156     ssc->invlist = NULL;
2157
2158     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2159         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2160         OP(ssc) = ANYOFPOSIXL;
2161     }
2162     else if (RExC_contains_locale) {
2163         OP(ssc) = ANYOFL;
2164     }
2165
2166     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2167 }
2168
2169 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2170 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
2171 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2172 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
2173                                ? (TRIE_LIST_CUR( idx ) - 1)           \
2174                                : 0 )
2175
2176
2177 #ifdef DEBUGGING
2178 /*
2179    dump_trie(trie,widecharmap,revcharmap)
2180    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2181    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2182
2183    These routines dump out a trie in a somewhat readable format.
2184    The _interim_ variants are used for debugging the interim
2185    tables that are used to generate the final compressed
2186    representation which is what dump_trie expects.
2187
2188    Part of the reason for their existence is to provide a form
2189    of documentation as to how the different representations function.
2190
2191 */
2192
2193 /*
2194   Dumps the final compressed table form of the trie to Perl_debug_log.
2195   Used for debugging make_trie().
2196 */
2197
2198 STATIC void
2199 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2200             AV *revcharmap, U32 depth)
2201 {
2202     U32 state;
2203     SV *sv=sv_newmortal();
2204     int colwidth= widecharmap ? 6 : 4;
2205     U16 word;
2206     GET_RE_DEBUG_FLAGS_DECL;
2207
2208     PERL_ARGS_ASSERT_DUMP_TRIE;
2209
2210     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
2211         depth+1, "Match","Base","Ofs" );
2212
2213     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2214         SV ** const tmp = av_fetch( revcharmap, state, 0);
2215         if ( tmp ) {
2216             Perl_re_printf( aTHX_  "%*s",
2217                 colwidth,
2218                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2219                             PL_colors[0], PL_colors[1],
2220                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2221                             PERL_PV_ESCAPE_FIRSTCHAR
2222                 )
2223             );
2224         }
2225     }
2226     Perl_re_printf( aTHX_  "\n");
2227     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2228
2229     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2230         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2231     Perl_re_printf( aTHX_  "\n");
2232
2233     for( state = 1 ; state < trie->statecount ; state++ ) {
2234         const U32 base = trie->states[ state ].trans.base;
2235
2236         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
2237
2238         if ( trie->states[ state ].wordnum ) {
2239             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2240         } else {
2241             Perl_re_printf( aTHX_  "%6s", "" );
2242         }
2243
2244         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
2245
2246         if ( base ) {
2247             U32 ofs = 0;
2248
2249             while( ( base + ofs  < trie->uniquecharcount ) ||
2250                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2251                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2252                                                                     != state))
2253                     ofs++;
2254
2255             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
2256
2257             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2258                 if ( ( base + ofs >= trie->uniquecharcount )
2259                         && ( base + ofs - trie->uniquecharcount
2260                                                         < trie->lasttrans )
2261                         && trie->trans[ base + ofs
2262                                     - trie->uniquecharcount ].check == state )
2263                 {
2264                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
2265                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2266                    );
2267                 } else {
2268                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
2269                 }
2270             }
2271
2272             Perl_re_printf( aTHX_  "]");
2273
2274         }
2275         Perl_re_printf( aTHX_  "\n" );
2276     }
2277     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2278                                 depth);
2279     for (word=1; word <= trie->wordcount; word++) {
2280         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2281             (int)word, (int)(trie->wordinfo[word].prev),
2282             (int)(trie->wordinfo[word].len));
2283     }
2284     Perl_re_printf( aTHX_  "\n" );
2285 }
2286 /*
2287   Dumps a fully constructed but uncompressed trie in list form.
2288   List tries normally only are used for construction when the number of
2289   possible chars (trie->uniquecharcount) is very high.
2290   Used for debugging make_trie().
2291 */
2292 STATIC void
2293 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2294                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
2295                          U32 depth)
2296 {
2297     U32 state;
2298     SV *sv=sv_newmortal();
2299     int colwidth= widecharmap ? 6 : 4;
2300     GET_RE_DEBUG_FLAGS_DECL;
2301
2302     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2303
2304     /* print out the table precompression.  */
2305     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2306             depth+1 );
2307     Perl_re_indentf( aTHX_  "%s",
2308             depth+1, "------:-----+-----------------\n" );
2309
2310     for( state=1 ; state < next_alloc ; state ++ ) {
2311         U16 charid;
2312
2313         Perl_re_indentf( aTHX_  " %4" UVXf " :",
2314             depth+1, (UV)state  );
2315         if ( ! trie->states[ state ].wordnum ) {
2316             Perl_re_printf( aTHX_  "%5s| ","");
2317         } else {
2318             Perl_re_printf( aTHX_  "W%4x| ",
2319                 trie->states[ state ].wordnum
2320             );
2321         }
2322         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2323             SV ** const tmp = av_fetch( revcharmap,
2324                                         TRIE_LIST_ITEM(state, charid).forid, 0);
2325             if ( tmp ) {
2326                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
2327                     colwidth,
2328                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2329                               colwidth,
2330                               PL_colors[0], PL_colors[1],
2331                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2332                               | PERL_PV_ESCAPE_FIRSTCHAR
2333                     ) ,
2334                     TRIE_LIST_ITEM(state, charid).forid,
2335                     (UV)TRIE_LIST_ITEM(state, charid).newstate
2336                 );
2337                 if (!(charid % 10))
2338                     Perl_re_printf( aTHX_  "\n%*s| ",
2339                         (int)((depth * 2) + 14), "");
2340             }
2341         }
2342         Perl_re_printf( aTHX_  "\n");
2343     }
2344 }
2345
2346 /*
2347   Dumps a fully constructed but uncompressed trie in table form.
2348   This is the normal DFA style state transition table, with a few
2349   twists to facilitate compression later.
2350   Used for debugging make_trie().
2351 */
2352 STATIC void
2353 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2354                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
2355                           U32 depth)
2356 {
2357     U32 state;
2358     U16 charid;
2359     SV *sv=sv_newmortal();
2360     int colwidth= widecharmap ? 6 : 4;
2361     GET_RE_DEBUG_FLAGS_DECL;
2362
2363     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2364
2365     /*
2366        print out the table precompression so that we can do a visual check
2367        that they are identical.
2368      */
2369
2370     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2371
2372     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2373         SV ** const tmp = av_fetch( revcharmap, charid, 0);
2374         if ( tmp ) {
2375             Perl_re_printf( aTHX_  "%*s",
2376                 colwidth,
2377                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2378                             PL_colors[0], PL_colors[1],
2379                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2380                             PERL_PV_ESCAPE_FIRSTCHAR
2381                 )
2382             );
2383         }
2384     }
2385
2386     Perl_re_printf( aTHX_ "\n");
2387     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2388
2389     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2390         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2391     }
2392
2393     Perl_re_printf( aTHX_  "\n" );
2394
2395     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2396
2397         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
2398             depth+1,
2399             (UV)TRIE_NODENUM( state ) );
2400
2401         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2402             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2403             if (v)
2404                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
2405             else
2406                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2407         }
2408         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2409             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
2410                                             (UV)trie->trans[ state ].check );
2411         } else {
2412             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
2413                                             (UV)trie->trans[ state ].check,
2414             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2415         }
2416     }
2417 }
2418
2419 #endif
2420
2421
2422 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2423   startbranch: the first branch in the whole branch sequence
2424   first      : start branch of sequence of branch-exact nodes.
2425                May be the same as startbranch
2426   last       : Thing following the last branch.
2427                May be the same as tail.
2428   tail       : item following the branch sequence
2429   count      : words in the sequence
2430   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2431   depth      : indent depth
2432
2433 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2434
2435 A trie is an N'ary tree where the branches are determined by digital
2436 decomposition of the key. IE, at the root node you look up the 1st character and
2437 follow that branch repeat until you find the end of the branches. Nodes can be
2438 marked as "accepting" meaning they represent a complete word. Eg:
2439
2440   /he|she|his|hers/
2441
2442 would convert into the following structure. Numbers represent states, letters
2443 following numbers represent valid transitions on the letter from that state, if
2444 the number is in square brackets it represents an accepting state, otherwise it
2445 will be in parenthesis.
2446
2447       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2448       |    |
2449       |   (2)
2450       |    |
2451      (1)   +-i->(6)-+-s->[7]
2452       |
2453       +-s->(3)-+-h->(4)-+-e->[5]
2454
2455       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2456
2457 This shows that when matching against the string 'hers' we will begin at state 1
2458 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2459 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2460 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2461 single traverse. We store a mapping from accepting to state to which word was
2462 matched, and then when we have multiple possibilities we try to complete the
2463 rest of the regex in the order in which they occurred in the alternation.
2464
2465 The only prior NFA like behaviour that would be changed by the TRIE support is
2466 the silent ignoring of duplicate alternations which are of the form:
2467
2468  / (DUPE|DUPE) X? (?{ ... }) Y /x
2469
2470 Thus EVAL blocks following a trie may be called a different number of times with
2471 and without the optimisation. With the optimisations dupes will be silently
2472 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2473 the following demonstrates:
2474
2475  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2476
2477 which prints out 'word' three times, but
2478
2479  'words'=~/(word|word|word)(?{ print $1 })S/
2480
2481 which doesnt print it out at all. This is due to other optimisations kicking in.
2482
2483 Example of what happens on a structural level:
2484
2485 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2486
2487    1: CURLYM[1] {1,32767}(18)
2488    5:   BRANCH(8)
2489    6:     EXACT <ac>(16)
2490    8:   BRANCH(11)
2491    9:     EXACT <ad>(16)
2492   11:   BRANCH(14)
2493   12:     EXACT <ab>(16)
2494   16:   SUCCEED(0)
2495   17:   NOTHING(18)
2496   18: END(0)
2497
2498 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2499 and should turn into:
2500
2501    1: CURLYM[1] {1,32767}(18)
2502    5:   TRIE(16)
2503         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2504           <ac>
2505           <ad>
2506           <ab>
2507   16:   SUCCEED(0)
2508   17:   NOTHING(18)
2509   18: END(0)
2510
2511 Cases where tail != last would be like /(?foo|bar)baz/:
2512
2513    1: BRANCH(4)
2514    2:   EXACT <foo>(8)
2515    4: BRANCH(7)
2516    5:   EXACT <bar>(8)
2517    7: TAIL(8)
2518    8: EXACT <baz>(10)
2519   10: END(0)
2520
2521 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2522 and would end up looking like:
2523
2524     1: TRIE(8)
2525       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2526         <foo>
2527         <bar>
2528    7: TAIL(8)
2529    8: EXACT <baz>(10)
2530   10: END(0)
2531
2532     d = uvchr_to_utf8_flags(d, uv, 0);
2533
2534 is the recommended Unicode-aware way of saying
2535
2536     *(d++) = uv;
2537 */
2538
2539 #define TRIE_STORE_REVCHAR(val)                                            \
2540     STMT_START {                                                           \
2541         if (UTF) {                                                         \
2542             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
2543             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
2544             unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
2545             *kapow = '\0';                                                 \
2546             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
2547             SvPOK_on(zlopp);                                               \
2548             SvUTF8_on(zlopp);                                              \
2549             av_push(revcharmap, zlopp);                                    \
2550         } else {                                                           \
2551             char ooooff = (char)val;                                           \
2552             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
2553         }                                                                  \
2554         } STMT_END
2555
2556 /* This gets the next character from the input, folding it if not already
2557  * folded. */
2558 #define TRIE_READ_CHAR STMT_START {                                           \
2559     wordlen++;                                                                \
2560     if ( UTF ) {                                                              \
2561         /* if it is UTF then it is either already folded, or does not need    \
2562          * folding */                                                         \
2563         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2564     }                                                                         \
2565     else if (folder == PL_fold_latin1) {                                      \
2566         /* This folder implies Unicode rules, which in the range expressible  \
2567          *  by not UTF is the lower case, with the two exceptions, one of     \
2568          *  which should have been taken care of before calling this */       \
2569         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2570         uvc = toLOWER_L1(*uc);                                                \
2571         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2572         len = 1;                                                              \
2573     } else {                                                                  \
2574         /* raw data, will be folded later if needed */                        \
2575         uvc = (U32)*uc;                                                       \
2576         len = 1;                                                              \
2577     }                                                                         \
2578 } STMT_END
2579
2580
2581
2582 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2583     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2584         U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
2585         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2586         TRIE_LIST_LEN( state ) = ging;                          \
2587     }                                                           \
2588     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2589     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2590     TRIE_LIST_CUR( state )++;                                   \
2591 } STMT_END
2592
2593 #define TRIE_LIST_NEW(state) STMT_START {                       \
2594     Newx( trie->states[ state ].trans.list,                     \
2595         4, reg_trie_trans_le );                                 \
2596      TRIE_LIST_CUR( state ) = 1;                                \
2597      TRIE_LIST_LEN( state ) = 4;                                \
2598 } STMT_END
2599
2600 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2601     U16 dupe= trie->states[ state ].wordnum;                    \
2602     regnode * const noper_next = regnext( noper );              \
2603                                                                 \
2604     DEBUG_r({                                                   \
2605         /* store the word for dumping */                        \
2606         SV* tmp;                                                \
2607         if (OP(noper) != NOTHING)                               \
2608             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
2609         else                                                    \
2610             tmp = newSVpvn_utf8( "", 0, UTF );                  \
2611         av_push( trie_words, tmp );                             \
2612     });                                                         \
2613                                                                 \
2614     curword++;                                                  \
2615     trie->wordinfo[curword].prev   = 0;                         \
2616     trie->wordinfo[curword].len    = wordlen;                   \
2617     trie->wordinfo[curword].accept = state;                     \
2618                                                                 \
2619     if ( noper_next < tail ) {                                  \
2620         if (!trie->jump)                                        \
2621             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2622                                                  sizeof(U16) ); \
2623         trie->jump[curword] = (U16)(noper_next - convert);      \
2624         if (!jumper)                                            \
2625             jumper = noper_next;                                \
2626         if (!nextbranch)                                        \
2627             nextbranch= regnext(cur);                           \
2628     }                                                           \
2629                                                                 \
2630     if ( dupe ) {                                               \
2631         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2632         /* chain, so that when the bits of chain are later    */\
2633         /* linked together, the dups appear in the chain      */\
2634         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2635         trie->wordinfo[dupe].prev = curword;                    \
2636     } else {                                                    \
2637         /* we haven't inserted this word yet.                */ \
2638         trie->states[ state ].wordnum = curword;                \
2639     }                                                           \
2640 } STMT_END
2641
2642
2643 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
2644      ( ( base + charid >=  ucharcount                                   \
2645          && base + charid < ubound                                      \
2646          && state == trie->trans[ base - ucharcount + charid ].check    \
2647          && trie->trans[ base - ucharcount + charid ].next )            \
2648            ? trie->trans[ base - ucharcount + charid ].next             \
2649            : ( state==1 ? special : 0 )                                 \
2650       )
2651
2652 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
2653 STMT_START {                                                \
2654     TRIE_BITMAP_SET(trie, uvc);                             \
2655     /* store the folded codepoint */                        \
2656     if ( folder )                                           \
2657         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
2658                                                             \
2659     if ( !UTF ) {                                           \
2660         /* store first byte of utf8 representation of */    \
2661         /* variant codepoints */                            \
2662         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
2663             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
2664         }                                                   \
2665     }                                                       \
2666 } STMT_END
2667 #define MADE_TRIE       1
2668 #define MADE_JUMP_TRIE  2
2669 #define MADE_EXACT_TRIE 4
2670
2671 STATIC I32
2672 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2673                   regnode *first, regnode *last, regnode *tail,
2674                   U32 word_count, U32 flags, U32 depth)
2675 {
2676     /* first pass, loop through and scan words */
2677     reg_trie_data *trie;
2678     HV *widecharmap = NULL;
2679     AV *revcharmap = newAV();
2680     regnode *cur;
2681     STRLEN len = 0;
2682     UV uvc = 0;
2683     U16 curword = 0;
2684     U32 next_alloc = 0;
2685     regnode *jumper = NULL;
2686     regnode *nextbranch = NULL;
2687     regnode *convert = NULL;
2688     U32 *prev_states; /* temp array mapping each state to previous one */
2689     /* we just use folder as a flag in utf8 */
2690     const U8 * folder = NULL;
2691
2692     /* in the below add_data call we are storing either 'tu' or 'tuaa'
2693      * which stands for one trie structure, one hash, optionally followed
2694      * by two arrays */
2695 #ifdef DEBUGGING
2696     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2697     AV *trie_words = NULL;
2698     /* along with revcharmap, this only used during construction but both are
2699      * useful during debugging so we store them in the struct when debugging.
2700      */
2701 #else
2702     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2703     STRLEN trie_charcount=0;
2704 #endif
2705     SV *re_trie_maxbuff;
2706     GET_RE_DEBUG_FLAGS_DECL;
2707
2708     PERL_ARGS_ASSERT_MAKE_TRIE;
2709 #ifndef DEBUGGING
2710     PERL_UNUSED_ARG(depth);
2711 #endif
2712
2713     switch (flags) {
2714         case EXACT: case EXACT_REQ8: case EXACTL: break;
2715         case EXACTFAA:
2716         case EXACTFUP:
2717         case EXACTFU:
2718         case EXACTFLU8: folder = PL_fold_latin1; break;
2719         case EXACTF:  folder = PL_fold; break;
2720         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2721     }
2722
2723     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2724     trie->refcount = 1;
2725     trie->startstate = 1;
2726     trie->wordcount = word_count;
2727     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2728     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2729     if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2730         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2731     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2732                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2733
2734     DEBUG_r({
2735         trie_words = newAV();
2736     });
2737
2738     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2739     assert(re_trie_maxbuff);
2740     if (!SvIOK(re_trie_maxbuff)) {
2741         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2742     }
2743     DEBUG_TRIE_COMPILE_r({
2744         Perl_re_indentf( aTHX_
2745           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2746           depth+1,
2747           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2748           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2749     });
2750
2751    /* Find the node we are going to overwrite */
2752     if ( first == startbranch && OP( last ) != BRANCH ) {
2753         /* whole branch chain */
2754         convert = first;
2755     } else {
2756         /* branch sub-chain */
2757         convert = NEXTOPER( first );
2758     }
2759
2760     /*  -- First loop and Setup --
2761
2762        We first traverse the branches and scan each word to determine if it
2763        contains widechars, and how many unique chars there are, this is
2764        important as we have to build a table with at least as many columns as we
2765        have unique chars.
2766
2767        We use an array of integers to represent the character codes 0..255
2768        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2769        the native representation of the character value as the key and IV's for
2770        the coded index.
2771
2772        *TODO* If we keep track of how many times each character is used we can
2773        remap the columns so that the table compression later on is more
2774        efficient in terms of memory by ensuring the most common value is in the
2775        middle and the least common are on the outside.  IMO this would be better
2776        than a most to least common mapping as theres a decent chance the most
2777        common letter will share a node with the least common, meaning the node
2778        will not be compressible. With a middle is most common approach the worst
2779        case is when we have the least common nodes twice.
2780
2781      */
2782
2783     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2784         regnode *noper = NEXTOPER( cur );
2785         const U8 *uc;
2786         const U8 *e;
2787         int foldlen = 0;
2788         U32 wordlen      = 0;         /* required init */
2789         STRLEN minchars = 0;
2790         STRLEN maxchars = 0;
2791         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2792                                                bitmap?*/
2793
2794         if (OP(noper) == NOTHING) {
2795             /* skip past a NOTHING at the start of an alternation
2796              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2797              */
2798             regnode *noper_next= regnext(noper);
2799             if (noper_next < tail)
2800                 noper= noper_next;
2801         }
2802
2803         if (    noper < tail
2804             && (    OP(noper) == flags
2805                 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2806                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
2807                                          || OP(noper) == EXACTFUP))))
2808         {
2809             uc= (U8*)STRING(noper);
2810             e= uc + STR_LEN(noper);
2811         } else {
2812             trie->minlen= 0;
2813             continue;
2814         }
2815
2816
2817         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2818             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2819                                           regardless of encoding */
2820             if (OP( noper ) == EXACTFUP) {
2821                 /* false positives are ok, so just set this */
2822                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2823             }
2824         }
2825
2826         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2827                                            branch */
2828             TRIE_CHARCOUNT(trie)++;
2829             TRIE_READ_CHAR;
2830
2831             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2832              * is in effect.  Under /i, this character can match itself, or
2833              * anything that folds to it.  If not under /i, it can match just
2834              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2835              * all fold to k, and all are single characters.   But some folds
2836              * expand to more than one character, so for example LATIN SMALL
2837              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2838              * the string beginning at 'uc' is 'ffi', it could be matched by
2839              * three characters, or just by the one ligature character. (It
2840              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2841              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2842              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2843              * match.)  The trie needs to know the minimum and maximum number
2844              * of characters that could match so that it can use size alone to
2845              * quickly reject many match attempts.  The max is simple: it is
2846              * the number of folded characters in this branch (since a fold is
2847              * never shorter than what folds to it. */
2848
2849             maxchars++;
2850
2851             /* And the min is equal to the max if not under /i (indicated by
2852              * 'folder' being NULL), or there are no multi-character folds.  If
2853              * there is a multi-character fold, the min is incremented just
2854              * once, for the character that folds to the sequence.  Each
2855              * character in the sequence needs to be added to the list below of
2856              * characters in the trie, but we count only the first towards the
2857              * min number of characters needed.  This is done through the
2858              * variable 'foldlen', which is returned by the macros that look
2859              * for these sequences as the number of bytes the sequence
2860              * occupies.  Each time through the loop, we decrement 'foldlen' by
2861              * how many bytes the current char occupies.  Only when it reaches
2862              * 0 do we increment 'minchars' or look for another multi-character
2863              * sequence. */
2864             if (folder == NULL) {
2865                 minchars++;
2866             }
2867             else if (foldlen > 0) {
2868                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2869             }
2870             else {
2871                 minchars++;
2872
2873                 /* See if *uc is the beginning of a multi-character fold.  If
2874                  * so, we decrement the length remaining to look at, to account
2875                  * for the current character this iteration.  (We can use 'uc'
2876                  * instead of the fold returned by TRIE_READ_CHAR because for
2877                  * non-UTF, the latin1_safe macro is smart enough to account
2878                  * for all the unfolded characters, and because for UTF, the
2879                  * string will already have been folded earlier in the
2880                  * compilation process */
2881                 if (UTF) {
2882                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2883                         foldlen -= UTF8SKIP(uc);
2884                     }
2885                 }
2886                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2887                     foldlen--;
2888                 }
2889             }
2890
2891             /* The current character (and any potential folds) should be added
2892              * to the possible matching characters for this position in this
2893              * branch */
2894             if ( uvc < 256 ) {
2895                 if ( folder ) {
2896                     U8 folded= folder[ (U8) uvc ];
2897                     if ( !trie->charmap[ folded ] ) {
2898                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2899                         TRIE_STORE_REVCHAR( folded );
2900                     }
2901                 }
2902                 if ( !trie->charmap[ uvc ] ) {
2903                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2904                     TRIE_STORE_REVCHAR( uvc );
2905                 }
2906                 if ( set_bit ) {
2907                     /* store the codepoint in the bitmap, and its folded
2908                      * equivalent. */
2909                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2910                     set_bit = 0; /* We've done our bit :-) */
2911                 }
2912             } else {
2913
2914                 /* XXX We could come up with the list of code points that fold
2915                  * to this using PL_utf8_foldclosures, except not for
2916                  * multi-char folds, as there may be multiple combinations
2917                  * there that could work, which needs to wait until runtime to
2918                  * resolve (The comment about LIGATURE FFI above is such an
2919                  * example */
2920
2921                 SV** svpp;
2922                 if ( !widecharmap )
2923                     widecharmap = newHV();
2924
2925                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2926
2927                 if ( !svpp )
2928                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2929
2930                 if ( !SvTRUE( *svpp ) ) {
2931                     sv_setiv( *svpp, ++trie->uniquecharcount );
2932                     TRIE_STORE_REVCHAR(uvc);
2933                 }
2934             }
2935         } /* end loop through characters in this branch of the trie */
2936
2937         /* We take the min and max for this branch and combine to find the min
2938          * and max for all branches processed so far */
2939         if( cur == first ) {
2940             trie->minlen = minchars;
2941             trie->maxlen = maxchars;
2942         } else if (minchars < trie->minlen) {
2943             trie->minlen = minchars;
2944         } else if (maxchars > trie->maxlen) {
2945             trie->maxlen = maxchars;
2946         }
2947     } /* end first pass */
2948     DEBUG_TRIE_COMPILE_r(
2949         Perl_re_indentf( aTHX_
2950                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2951                 depth+1,
2952                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2953                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2954                 (int)trie->minlen, (int)trie->maxlen )
2955     );
2956
2957     /*
2958         We now know what we are dealing with in terms of unique chars and
2959         string sizes so we can calculate how much memory a naive
2960         representation using a flat table  will take. If it's over a reasonable
2961         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2962         conservative but potentially much slower representation using an array
2963         of lists.
2964
2965         At the end we convert both representations into the same compressed
2966         form that will be used in regexec.c for matching with. The latter
2967         is a form that cannot be used to construct with but has memory
2968         properties similar to the list form and access properties similar
2969         to the table form making it both suitable for fast searches and
2970         small enough that its feasable to store for the duration of a program.
2971
2972         See the comment in the code where the compressed table is produced
2973         inplace from the flat tabe representation for an explanation of how
2974         the compression works.
2975
2976     */
2977
2978
2979     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2980     prev_states[1] = 0;
2981
2982     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2983                                                     > SvIV(re_trie_maxbuff) )
2984     {
2985         /*
2986             Second Pass -- Array Of Lists Representation
2987
2988             Each state will be represented by a list of charid:state records
2989             (reg_trie_trans_le) the first such element holds the CUR and LEN
2990             points of the allocated array. (See defines above).
2991
2992             We build the initial structure using the lists, and then convert
2993             it into the compressed table form which allows faster lookups
2994             (but cant be modified once converted).
2995         */
2996
2997         STRLEN transcount = 1;
2998
2999         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
3000             depth+1));
3001
3002         trie->states = (reg_trie_state *)
3003             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3004                                   sizeof(reg_trie_state) );
3005         TRIE_LIST_NEW(1);
3006         next_alloc = 2;
3007
3008         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3009
3010             regnode *noper   = NEXTOPER( cur );
3011             U32 state        = 1;         /* required init */
3012             U16 charid       = 0;         /* sanity init */
3013             U32 wordlen      = 0;         /* required init */
3014
3015             if (OP(noper) == NOTHING) {
3016                 regnode *noper_next= regnext(noper);
3017                 if (noper_next < tail)
3018                     noper= noper_next;
3019             }
3020
3021             if (    noper < tail
3022                 && (    OP(noper) == flags
3023                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3024                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3025                                              || OP(noper) == EXACTFUP))))
3026             {
3027                 const U8 *uc= (U8*)STRING(noper);
3028                 const U8 *e= uc + STR_LEN(noper);
3029
3030                 for ( ; uc < e ; uc += len ) {
3031
3032                     TRIE_READ_CHAR;
3033
3034                     if ( uvc < 256 ) {
3035                         charid = trie->charmap[ uvc ];
3036                     } else {
3037                         SV** const svpp = hv_fetch( widecharmap,
3038                                                     (char*)&uvc,
3039                                                     sizeof( UV ),
3040                                                     0);
3041                         if ( !svpp ) {
3042                             charid = 0;
3043                         } else {
3044                             charid=(U16)SvIV( *svpp );
3045                         }
3046                     }
3047                     /* charid is now 0 if we dont know the char read, or
3048                      * nonzero if we do */
3049                     if ( charid ) {
3050
3051                         U16 check;
3052                         U32 newstate = 0;
3053
3054                         charid--;
3055                         if ( !trie->states[ state ].trans.list ) {
3056                             TRIE_LIST_NEW( state );
3057                         }
3058                         for ( check = 1;
3059                               check <= TRIE_LIST_USED( state );
3060                               check++ )
3061                         {
3062                             if ( TRIE_LIST_ITEM( state, check ).forid
3063                                                                     == charid )
3064                             {
3065                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3066                                 break;
3067                             }
3068                         }
3069                         if ( ! newstate ) {
3070                             newstate = next_alloc++;
3071                             prev_states[newstate] = state;
3072                             TRIE_LIST_PUSH( state, charid, newstate );
3073                             transcount++;
3074                         }
3075                         state = newstate;
3076                     } else {
3077                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3078                     }
3079                 }
3080             }
3081             TRIE_HANDLE_WORD(state);
3082
3083         } /* end second pass */
3084
3085         /* next alloc is the NEXT state to be allocated */
3086         trie->statecount = next_alloc;
3087         trie->states = (reg_trie_state *)
3088             PerlMemShared_realloc( trie->states,
3089                                    next_alloc
3090                                    * sizeof(reg_trie_state) );
3091
3092         /* and now dump it out before we compress it */
3093         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3094                                                          revcharmap, next_alloc,
3095                                                          depth+1)
3096         );
3097
3098         trie->trans = (reg_trie_trans *)
3099             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3100         {
3101             U32 state;
3102             U32 tp = 0;
3103             U32 zp = 0;
3104
3105
3106             for( state=1 ; state < next_alloc ; state ++ ) {
3107                 U32 base=0;
3108
3109                 /*
3110                 DEBUG_TRIE_COMPILE_MORE_r(
3111                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
3112                 );
3113                 */
3114
3115                 if (trie->states[state].trans.list) {
3116                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3117                     U16 maxid=minid;
3118                     U16 idx;
3119
3120                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3121                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3122                         if ( forid < minid ) {
3123                             minid=forid;
3124                         } else if ( forid > maxid ) {
3125                             maxid=forid;
3126                         }
3127                     }
3128                     if ( transcount < tp + maxid - minid + 1) {
3129                         transcount *= 2;
3130                         trie->trans = (reg_trie_trans *)
3131                             PerlMemShared_realloc( trie->trans,
3132                                                      transcount
3133                                                      * sizeof(reg_trie_trans) );
3134                         Zero( trie->trans + (transcount / 2),
3135                               transcount / 2,
3136                               reg_trie_trans );
3137                     }
3138                     base = trie->uniquecharcount + tp - minid;
3139                     if ( maxid == minid ) {
3140                         U32 set = 0;
3141                         for ( ; zp < tp ; zp++ ) {
3142                             if ( ! trie->trans[ zp ].next ) {
3143                                 base = trie->uniquecharcount + zp - minid;
3144                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3145                                                                    1).newstate;
3146                                 trie->trans[ zp ].check = state;
3147                                 set = 1;
3148                                 break;
3149                             }
3150                         }
3151                         if ( !set ) {
3152                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3153                                                                    1).newstate;
3154                             trie->trans[ tp ].check = state;
3155                             tp++;
3156                             zp = tp;
3157                         }
3158                     } else {
3159                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3160                             const U32 tid = base
3161                                            - trie->uniquecharcount
3162                                            + TRIE_LIST_ITEM( state, idx ).forid;
3163                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3164                                                                 idx ).newstate;
3165                             trie->trans[ tid ].check = state;
3166                         }
3167                         tp += ( maxid - minid + 1 );
3168                     }
3169                     Safefree(trie->states[ state ].trans.list);
3170                 }
3171                 /*
3172                 DEBUG_TRIE_COMPILE_MORE_r(
3173                     Perl_re_printf( aTHX_  " base: %d\n",base);
3174                 );
3175                 */
3176                 trie->states[ state ].trans.base=base;
3177             }
3178             trie->lasttrans = tp + 1;
3179         }
3180     } else {
3181         /*
3182            Second Pass -- Flat Table Representation.
3183
3184            we dont use the 0 slot of either trans[] or states[] so we add 1 to
3185            each.  We know that we will need Charcount+1 trans at most to store
3186            the data (one row per char at worst case) So we preallocate both
3187            structures assuming worst case.
3188
3189            We then construct the trie using only the .next slots of the entry
3190            structs.
3191
3192            We use the .check field of the first entry of the node temporarily
3193            to make compression both faster and easier by keeping track of how
3194            many non zero fields are in the node.
3195
3196            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3197            transition.
3198
3199            There are two terms at use here: state as a TRIE_NODEIDX() which is
3200            a number representing the first entry of the node, and state as a
3201            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3202            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3203            if there are 2 entrys per node. eg:
3204
3205              A B       A B
3206           1. 2 4    1. 3 7
3207           2. 0 3    3. 0 5
3208           3. 0 0    5. 0 0
3209           4. 0 0    7. 0 0
3210
3211            The table is internally in the right hand, idx form. However as we
3212            also have to deal with the states array which is indexed by nodenum
3213            we have to use TRIE_NODENUM() to convert.
3214
3215         */
3216         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
3217             depth+1));
3218
3219         trie->trans = (reg_trie_trans *)
3220             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3221                                   * trie->uniquecharcount + 1,
3222                                   sizeof(reg_trie_trans) );
3223         trie->states = (reg_trie_state *)
3224             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3225                                   sizeof(reg_trie_state) );
3226         next_alloc = trie->uniquecharcount + 1;
3227
3228
3229         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3230
3231             regnode *noper   = NEXTOPER( cur );
3232
3233             U32 state        = 1;         /* required init */
3234
3235             U16 charid       = 0;         /* sanity init */
3236             U32 accept_state = 0;         /* sanity init */
3237
3238             U32 wordlen      = 0;         /* required init */
3239
3240             if (OP(noper) == NOTHING) {
3241                 regnode *noper_next= regnext(noper);
3242                 if (noper_next < tail)
3243                     noper= noper_next;
3244             }
3245
3246             if (    noper < tail
3247                 && (    OP(noper) == flags
3248                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
3249                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
3250                                              || OP(noper) == EXACTFUP))))
3251             {
3252                 const U8 *uc= (U8*)STRING(noper);
3253                 const U8 *e= uc + STR_LEN(noper);
3254
3255                 for ( ; uc < e ; uc += len ) {
3256
3257                     TRIE_READ_CHAR;
3258
3259                     if ( uvc < 256 ) {
3260                         charid = trie->charmap[ uvc ];
3261                     } else {
3262                         SV* const * const svpp = hv_fetch( widecharmap,
3263                                                            (char*)&uvc,
3264                                                            sizeof( UV ),
3265                                                            0);
3266                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3267                     }
3268                     if ( charid ) {
3269                         charid--;
3270                         if ( !trie->trans[ state + charid ].next ) {
3271                             trie->trans[ state + charid ].next = next_alloc;
3272                             trie->trans[ state ].check++;
3273                             prev_states[TRIE_NODENUM(next_alloc)]
3274                                     = TRIE_NODENUM(state);
3275                             next_alloc += trie->uniquecharcount;
3276                         }
3277                         state = trie->trans[ state + charid ].next;
3278                     } else {
3279                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3280                     }
3281                     /* charid is now 0 if we dont know the char read, or
3282                      * nonzero if we do */
3283                 }
3284             }
3285             accept_state = TRIE_NODENUM( state );
3286             TRIE_HANDLE_WORD(accept_state);
3287
3288         } /* end second pass */
3289
3290         /* and now dump it out before we compress it */
3291         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3292                                                           revcharmap,
3293                                                           next_alloc, depth+1));
3294
3295         {
3296         /*
3297            * Inplace compress the table.*
3298
3299            For sparse data sets the table constructed by the trie algorithm will
3300            be mostly 0/FAIL transitions or to put it another way mostly empty.
3301            (Note that leaf nodes will not contain any transitions.)
3302
3303            This algorithm compresses the tables by eliminating most such
3304            transitions, at the cost of a modest bit of extra work during lookup:
3305
3306            - Each states[] entry contains a .base field which indicates the
3307            index in the state[] array wheres its transition data is stored.
3308
3309            - If .base is 0 there are no valid transitions from that node.
3310
3311            - If .base is nonzero then charid is added to it to find an entry in
3312            the trans array.
3313
3314            -If trans[states[state].base+charid].check!=state then the
3315            transition is taken to be a 0/Fail transition. Thus if there are fail
3316            transitions at the front of the node then the .base offset will point
3317            somewhere inside the previous nodes data (or maybe even into a node
3318            even earlier), but the .check field determines if the transition is
3319            valid.
3320
3321            XXX - wrong maybe?
3322            The following process inplace converts the table to the compressed
3323            table: We first do not compress the root node 1,and mark all its
3324            .check pointers as 1 and set its .base pointer as 1 as well. This
3325            allows us to do a DFA construction from the compressed table later,
3326            and ensures that any .base pointers we calculate later are greater
3327            than 0.
3328
3329            - We set 'pos' to indicate the first entry of the second node.
3330
3331            - We then iterate over the columns of the node, finding the first and
3332            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3333            and set the .check pointers accordingly, and advance pos
3334            appropriately and repreat for the next node. Note that when we copy
3335            the next pointers we have to convert them from the original
3336            NODEIDX form to NODENUM form as the former is not valid post
3337            compression.
3338
3339            - If a node has no transitions used we mark its base as 0 and do not
3340            advance the pos pointer.
3341
3342            - If a node only has one transition we use a second pointer into the
3343            structure to fill in allocated fail transitions from other states.
3344            This pointer is independent of the main pointer and scans forward
3345            looking for null transitions that are allocated to a state. When it
3346            finds one it writes the single transition into the "hole".  If the
3347            pointer doesnt find one the single transition is appended as normal.
3348
3349            - Once compressed we can Renew/realloc the structures to release the
3350            excess space.
3351
3352            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3353            specifically Fig 3.47 and the associated pseudocode.
3354
3355            demq
3356         */
3357         const U32 laststate = TRIE_NODENUM( next_alloc );
3358         U32 state, charid;
3359         U32 pos = 0, zp=0;
3360         trie->statecount = laststate;
3361
3362         for ( state = 1 ; state < laststate ; state++ ) {
3363             U8 flag = 0;
3364             const U32 stateidx = TRIE_NODEIDX( state );
3365             const U32 o_used = trie->trans[ stateidx ].check;
3366             U32 used = trie->trans[ stateidx ].check;
3367             trie->trans[ stateidx ].check = 0;
3368
3369             for ( charid = 0;
3370                   used && charid < trie->uniquecharcount;
3371                   charid++ )
3372             {
3373                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3374                     if ( trie->trans[ stateidx + charid ].next ) {
3375                         if (o_used == 1) {
3376                             for ( ; zp < pos ; zp++ ) {
3377                                 if ( ! trie->trans[ zp ].next ) {
3378                                     break;
3379                                 }
3380                             }
3381                             trie->states[ state ].trans.base
3382                                                     = zp
3383                                                       + trie->uniquecharcount
3384                                                       - charid ;
3385                             trie->trans[ zp ].next
3386                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3387                                                              + charid ].next );
3388                             trie->trans[ zp ].check = state;
3389                             if ( ++zp > pos ) pos = zp;
3390                             break;
3391                         }
3392                         used--;
3393                     }
3394                     if ( !flag ) {
3395                         flag = 1;
3396                         trie->states[ state ].trans.base
3397                                        = pos + trie->uniquecharcount - charid ;
3398                     }
3399                     trie->trans[ pos ].next
3400                         = SAFE_TRIE_NODENUM(
3401                                        trie->trans[ stateidx + charid ].next );
3402                     trie->trans[ pos ].check = state;
3403                     pos++;
3404                 }
3405             }
3406         }
3407         trie->lasttrans = pos + 1;
3408         trie->states = (reg_trie_state *)
3409             PerlMemShared_realloc( trie->states, laststate
3410                                    * sizeof(reg_trie_state) );
3411         DEBUG_TRIE_COMPILE_MORE_r(
3412             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3413                 depth+1,
3414                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3415                        + 1 ),
3416                 (IV)next_alloc,
3417                 (IV)pos,
3418                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3419             );
3420
3421         } /* end table compress */
3422     }
3423     DEBUG_TRIE_COMPILE_MORE_r(
3424             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3425                 depth+1,
3426                 (UV)trie->statecount,
3427                 (UV)trie->lasttrans)
3428     );
3429     /* resize the trans array to remove unused space */
3430     trie->trans = (reg_trie_trans *)
3431         PerlMemShared_realloc( trie->trans, trie->lasttrans
3432                                * sizeof(reg_trie_trans) );
3433
3434     {   /* Modify the program and insert the new TRIE node */
3435         U8 nodetype =(U8)(flags & 0xFF);
3436         char *str=NULL;
3437
3438 #ifdef DEBUGGING
3439         regnode *optimize = NULL;
3440 #ifdef RE_TRACK_PATTERN_OFFSETS
3441
3442         U32 mjd_offset = 0;
3443         U32 mjd_nodelen = 0;
3444 #endif /* RE_TRACK_PATTERN_OFFSETS */
3445 #endif /* DEBUGGING */
3446         /*
3447            This means we convert either the first branch or the first Exact,
3448            depending on whether the thing following (in 'last') is a branch
3449            or not and whther first is the startbranch (ie is it a sub part of
3450            the alternation or is it the whole thing.)
3451            Assuming its a sub part we convert the EXACT otherwise we convert
3452            the whole branch sequence, including the first.
3453          */
3454         /* Find the node we are going to overwrite */
3455         if ( first != startbranch || OP( last ) == BRANCH ) {
3456             /* branch sub-chain */
3457             NEXT_OFF( first ) = (U16)(last - first);
3458 #ifdef RE_TRACK_PATTERN_OFFSETS
3459             DEBUG_r({
3460                 mjd_offset= Node_Offset((convert));
3461                 mjd_nodelen= Node_Length((convert));
3462             });
3463 #endif
3464             /* whole branch chain */
3465         }
3466 #ifdef RE_TRACK_PATTERN_OFFSETS
3467         else {
3468             DEBUG_r({
3469                 const  regnode *nop = NEXTOPER( convert );
3470                 mjd_offset= Node_Offset((nop));
3471                 mjd_nodelen= Node_Length((nop));
3472             });
3473         }
3474         DEBUG_OPTIMISE_r(
3475             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3476                 depth+1,
3477                 (UV)mjd_offset, (UV)mjd_nodelen)
3478         );
3479 #endif
3480         /* But first we check to see if there is a common prefix we can
3481            split out as an EXACT and put in front of the TRIE node.  */
3482         trie->startstate= 1;
3483         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3484             /* we want to find the first state that has more than
3485              * one transition, if that state is not the first state
3486              * then we have a common prefix which we can remove.
3487              */
3488             U32 state;
3489             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3490                 U32 ofs = 0;
3491                 I32 first_ofs = -1; /* keeps track of the ofs of the first
3492                                        transition, -1 means none */
3493                 U32 count = 0;
3494                 const U32 base = trie->states[ state ].trans.base;
3495
3496                 /* does this state terminate an alternation? */
3497                 if ( trie->states[state].wordnum )
3498                         count = 1;
3499
3500                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3501                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3502                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3503                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3504                     {
3505                         if ( ++count > 1 ) {
3506                             /* we have more than one transition */
3507                             SV **tmp;
3508                             U8 *ch;
3509                             /* if this is the first state there is no common prefix
3510                              * to extract, so we can exit */
3511                             if ( state == 1 ) break;
3512                             tmp = av_fetch( revcharmap, ofs, 0);
3513                             ch = (U8*)SvPV_nolen_const( *tmp );
3514
3515                             /* if we are on count 2 then we need to initialize the
3516                              * bitmap, and store the previous char if there was one
3517                              * in it*/
3518                             if ( count == 2 ) {
3519                                 /* clear the bitmap */
3520                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3521                                 DEBUG_OPTIMISE_r(
3522                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
3523                                         depth+1,
3524                                         (UV)state));
3525                                 if (first_ofs >= 0) {
3526                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3527                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3528
3529                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3530                                     DEBUG_OPTIMISE_r(
3531                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3532                                     );
3533                                 }
3534                             }
3535                             /* store the current firstchar in the bitmap */
3536                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3537                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3538                         }
3539                         first_ofs = ofs;
3540                     }
3541                 }
3542                 if ( count == 1 ) {
3543                     /* This state has only one transition, its transition is part
3544                      * of a common prefix - we need to concatenate the char it
3545                      * represents to what we have so far. */
3546                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3547                     STRLEN len;
3548                     char *ch = SvPV( *tmp, len );
3549                     DEBUG_OPTIMISE_r({
3550                         SV *sv=sv_newmortal();
3551                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3552                             depth+1,
3553                             (UV)state, (UV)first_ofs,
3554                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3555                                 PL_colors[0], PL_colors[1],
3556                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3557                                 PERL_PV_ESCAPE_FIRSTCHAR
3558                             )
3559                         );
3560                     });
3561                     if ( state==1 ) {
3562                         OP( convert ) = nodetype;
3563                         str=STRING(convert);
3564                         setSTR_LEN(convert, 0);
3565                     }
3566                     setSTR_LEN(convert, STR_LEN(convert) + len);
3567                     while (len--)
3568                         *str++ = *ch++;
3569                 } else {
3570 #ifdef DEBUGGING
3571                     if (state>1)
3572                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3573 #endif
3574                     break;
3575                 }
3576             }
3577             trie->prefixlen = (state-1);
3578             if (str) {
3579                 regnode *n = convert+NODE_SZ_STR(convert);
3580                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3581                 trie->startstate = state;
3582                 trie->minlen -= (state - 1);
3583                 trie->maxlen -= (state - 1);
3584 #ifdef DEBUGGING
3585                /* At least the UNICOS C compiler choked on this
3586                 * being argument to DEBUG_r(), so let's just have
3587                 * it right here. */
3588                if (
3589 #ifdef PERL_EXT_RE_BUILD
3590                    1
3591 #else
3592                    DEBUG_r_TEST
3593 #endif
3594                    ) {
3595                    regnode *fix = convert;
3596                    U32 word = trie->wordcount;
3597 #ifdef RE_TRACK_PATTERN_OFFSETS
3598                    mjd_nodelen++;
3599 #endif
3600                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3601                    while( ++fix < n ) {
3602                        Set_Node_Offset_Length(fix, 0, 0);
3603                    }
3604                    while (word--) {
3605                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3606                        if (tmp) {
3607                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3608                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3609                            else
3610                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3611                        }
3612                    }
3613                }
3614 #endif
3615                 if (trie->maxlen) {
3616                     convert = n;
3617                 } else {
3618                     NEXT_OFF(convert) = (U16)(tail - convert);
3619                     DEBUG_r(optimize= n);
3620                 }
3621             }
3622         }
3623         if (!jumper)
3624             jumper = last;
3625         if ( trie->maxlen ) {
3626             NEXT_OFF( convert ) = (U16)(tail - convert);
3627             ARG_SET( convert, data_slot );
3628             /* Store the offset to the first unabsorbed branch in
3629                jump[0], which is otherwise unused by the jump logic.
3630                We use this when dumping a trie and during optimisation. */
3631             if (trie->jump)
3632                 trie->jump[0] = (U16)(nextbranch - convert);
3633
3634             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3635              *   and there is a bitmap
3636              *   and the first "jump target" node we found leaves enough room
3637              * then convert the TRIE node into a TRIEC node, with the bitmap
3638              * embedded inline in the opcode - this is hypothetically faster.
3639              */
3640             if ( !trie->states[trie->startstate].wordnum
3641                  && trie->bitmap
3642                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3643             {
3644                 OP( convert ) = TRIEC;
3645                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3646                 PerlMemShared_free(trie->bitmap);
3647                 trie->bitmap= NULL;
3648             } else
3649                 OP( convert ) = TRIE;
3650
3651             /* store the type in the flags */
3652             convert->flags = nodetype;
3653             DEBUG_r({
3654             optimize = convert
3655                       + NODE_STEP_REGNODE
3656                       + regarglen[ OP( convert ) ];
3657             });
3658             /* XXX We really should free up the resource in trie now,
3659                    as we won't use them - (which resources?) dmq */
3660         }
3661         /* needed for dumping*/
3662         DEBUG_r(if (optimize) {
3663             regnode *opt = convert;
3664
3665             while ( ++opt < optimize) {
3666                 Set_Node_Offset_Length(opt, 0, 0);
3667             }
3668             /*
3669                 Try to clean up some of the debris left after the
3670                 optimisation.
3671              */
3672             while( optimize < jumper ) {
3673                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3674                 OP( optimize ) = OPTIMIZED;
3675                 Set_Node_Offset_Length(optimize, 0, 0);
3676                 optimize++;
3677             }
3678             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3679         });
3680     } /* end node insert */
3681
3682     /*  Finish populating the prev field of the wordinfo array.  Walk back
3683      *  from each accept state until we find another accept state, and if
3684      *  so, point the first word's .prev field at the second word. If the
3685      *  second already has a .prev field set, stop now. This will be the
3686      *  case either if we've already processed that word's accept state,
3687      *  or that state had multiple words, and the overspill words were
3688      *  already linked up earlier.
3689      */
3690     {
3691         U16 word;
3692         U32 state;
3693         U16 prev;
3694
3695         for (word=1; word <= trie->wordcount; word++) {
3696             prev = 0;
3697             if (trie->wordinfo[word].prev)
3698                 continue;
3699             state = trie->wordinfo[word].accept;
3700             while (state) {
3701                 state = prev_states[state];
3702                 if (!state)
3703                     break;
3704                 prev = trie->states[state].wordnum;
3705                 if (prev)
3706                     break;
3707             }
3708             trie->wordinfo[word].prev = prev;
3709         }
3710         Safefree(prev_states);
3711     }
3712
3713
3714     /* and now dump out the compressed format */
3715     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3716
3717     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3718 #ifdef DEBUGGING
3719     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3720     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3721 #else
3722     SvREFCNT_dec_NN(revcharmap);
3723 #endif
3724     return trie->jump
3725            ? MADE_JUMP_TRIE
3726            : trie->startstate>1
3727              ? MADE_EXACT_TRIE
3728              : MADE_TRIE;
3729 }
3730
3731 STATIC regnode *
3732 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3733 {
3734 /* The Trie is constructed and compressed now so we can build a fail array if
3735  * it's needed
3736
3737    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3738    3.32 in the
3739    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3740    Ullman 1985/88
3741    ISBN 0-201-10088-6
3742
3743    We find the fail state for each state in the trie, this state is the longest
3744    proper suffix of the current state's 'word' that is also a proper prefix of
3745    another word in our trie. State 1 represents the word '' and is thus the
3746    default fail state. This allows the DFA not to have to restart after its
3747    tried and failed a word at a given point, it simply continues as though it
3748    had been matching the other word in the first place.
3749    Consider
3750       'abcdgu'=~/abcdefg|cdgu/
3751    When we get to 'd' we are still matching the first word, we would encounter
3752    'g' which would fail, which would bring us to the state representing 'd' in
3753    the second word where we would try 'g' and succeed, proceeding to match
3754    'cdgu'.
3755  */
3756  /* add a fail transition */
3757     const U32 trie_offset = ARG(source);
3758     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3759     U32 *q;
3760     const U32 ucharcount = trie->uniquecharcount;
3761     const U32 numstates = trie->statecount;
3762     const U32 ubound = trie->lasttrans + ucharcount;
3763     U32 q_read = 0;
3764     U32 q_write = 0;
3765     U32 charid;
3766     U32 base = trie->states[ 1 ].trans.base;
3767     U32 *fail;
3768     reg_ac_data *aho;
3769     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3770     regnode *stclass;
3771     GET_RE_DEBUG_FLAGS_DECL;
3772
3773     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3774     PERL_UNUSED_CONTEXT;
3775 #ifndef DEBUGGING
3776     PERL_UNUSED_ARG(depth);
3777 #endif
3778
3779     if ( OP(source) == TRIE ) {
3780         struct regnode_1 *op = (struct regnode_1 *)
3781             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3782         StructCopy(source, op, struct regnode_1);
3783         stclass = (regnode *)op;
3784     } else {
3785         struct regnode_charclass *op = (struct regnode_charclass *)
3786             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3787         StructCopy(source, op, struct regnode_charclass);
3788         stclass = (regnode *)op;
3789     }
3790     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3791
3792     ARG_SET( stclass, data_slot );
3793     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3794     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3795     aho->trie=trie_offset;
3796     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3797     Copy( trie->states, aho->states, numstates, reg_trie_state );
3798     Newx( q, numstates, U32);
3799     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3800     aho->refcount = 1;
3801     fail = aho->fail;
3802     /* initialize fail[0..1] to be 1 so that we always have
3803        a valid final fail state */
3804     fail[ 0 ] = fail[ 1 ] = 1;
3805
3806     for ( charid = 0; charid < ucharcount ; charid++ ) {
3807         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3808         if ( newstate ) {
3809             q[ q_write ] = newstate;
3810             /* set to point at the root */
3811             fail[ q[ q_write++ ] ]=1;
3812         }
3813     }
3814     while ( q_read < q_write) {
3815         const U32 cur = q[ q_read++ % numstates ];
3816         base = trie->states[ cur ].trans.base;
3817
3818         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3819             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3820             if (ch_state) {
3821                 U32 fail_state = cur;
3822                 U32 fail_base;
3823                 do {
3824                     fail_state = fail[ fail_state ];
3825                     fail_base = aho->states[ fail_state ].trans.base;
3826                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3827
3828                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3829                 fail[ ch_state ] = fail_state;
3830                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3831                 {
3832                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3833                 }
3834                 q[ q_write++ % numstates] = ch_state;
3835             }
3836         }
3837     }
3838     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3839        when we fail in state 1, this allows us to use the
3840        charclass scan to find a valid start char. This is based on the principle
3841        that theres a good chance the string being searched contains lots of stuff
3842        that cant be a start char.
3843      */
3844     fail[ 0 ] = fail[ 1 ] = 0;
3845     DEBUG_TRIE_COMPILE_r({
3846         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
3847                       depth, (UV)numstates
3848         );
3849         for( q_read=1; q_read<numstates; q_read++ ) {
3850             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
3851         }
3852         Perl_re_printf( aTHX_  "\n");
3853     });
3854     Safefree(q);
3855     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3856     return stclass;
3857 }
3858
3859
3860 /* The below joins as many adjacent EXACTish nodes as possible into a single
3861  * one.  The regop may be changed if the node(s) contain certain sequences that
3862  * require special handling.  The joining is only done if:
3863  * 1) there is room in the current conglomerated node to entirely contain the
3864  *    next one.
3865  * 2) they are compatible node types
3866  *
3867  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3868  * these get optimized out
3869  *
3870  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3871  * as possible, even if that means splitting an existing node so that its first
3872  * part is moved to the preceeding node.  This would maximise the efficiency of
3873  * memEQ during matching.
3874  *
3875  * If a node is to match under /i (folded), the number of characters it matches
3876  * can be different than its character length if it contains a multi-character
3877  * fold.  *min_subtract is set to the total delta number of characters of the
3878  * input nodes.
3879  *
3880  * And *unfolded_multi_char is set to indicate whether or not the node contains
3881  * an unfolded multi-char fold.  This happens when it won't be known until
3882  * runtime whether the fold is valid or not; namely
3883  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3884  *      target string being matched against turns out to be UTF-8 is that fold
3885  *      valid; or
3886  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
3887  *      runtime.
3888  * (Multi-char folds whose components are all above the Latin1 range are not
3889  * run-time locale dependent, and have already been folded by the time this
3890  * function is called.)
3891  *
3892  * This is as good a place as any to discuss the design of handling these
3893  * multi-character fold sequences.  It's been wrong in Perl for a very long
3894  * time.  There are three code points in Unicode whose multi-character folds
3895  * were long ago discovered to mess things up.  The previous designs for
3896  * dealing with these involved assigning a special node for them.  This
3897  * approach doesn't always work, as evidenced by this example:
3898  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3899  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3900  * would match just the \xDF, it won't be able to handle the case where a
3901  * successful match would have to cross the node's boundary.  The new approach
3902  * that hopefully generally solves the problem generates an EXACTFUP node
3903  * that is "sss" in this case.
3904  *
3905  * It turns out that there are problems with all multi-character folds, and not
3906  * just these three.  Now the code is general, for all such cases.  The
3907  * approach taken is:
3908  * 1)   This routine examines each EXACTFish node that could contain multi-
3909  *      character folded sequences.  Since a single character can fold into
3910  *      such a sequence, the minimum match length for this node is less than
3911  *      the number of characters in the node.  This routine returns in
3912  *      *min_subtract how many characters to subtract from the the actual
3913  *      length of the string to get a real minimum match length; it is 0 if
3914  *      there are no multi-char foldeds.  This delta is used by the caller to
3915  *      adjust the min length of the match, and the delta between min and max,
3916  *      so that the optimizer doesn't reject these possibilities based on size
3917  *      constraints.
3918  *
3919  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
3920  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
3921  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
3922  *      EXACTFU nodes.  The node type of such nodes is then changed to
3923  *      EXACTFUP, indicating it is problematic, and needs careful handling.
3924  *      (The procedures in step 1) above are sufficient to handle this case in
3925  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
3926  *      the only case where there is a possible fold length change in non-UTF-8
3927  *      patterns.  By reserving a special node type for problematic cases, the
3928  *      far more common regular EXACTFU nodes can be processed faster.
3929  *      regexec.c takes advantage of this.
3930  *
3931  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
3932  *      problematic cases.   These all only occur when the pattern is not
3933  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
3934  *      length change, it handles the situation where the string cannot be
3935  *      entirely folded.  The strings in an EXACTFish node are folded as much
3936  *      as possible during compilation in regcomp.c.  This saves effort in
3937  *      regex matching.  By using an EXACTFUP node when it is not possible to
3938  *      fully fold at compile time, regexec.c can know that everything in an
3939  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
3940  *      case where folding in EXACTFU nodes can't be done at compile time is
3941  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
3942  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
3943  *      handle two very different cases.  Alternatively, there could have been
3944  *      a node type where there are length changes, one for unfolded, and one
3945  *      for both.  If yet another special case needed to be created, the number
3946  *      of required node types would have to go to 7.  khw figures that even
3947  *      though there are plenty of node types to spare, that the maintenance
3948  *      cost wasn't worth the small speedup of doing it that way, especially
3949  *      since he thinks the MICRO SIGN is rarely encountered in practice.
3950  *
3951  *      There are other cases where folding isn't done at compile time, but
3952  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
3953  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
3954  *      changes.  Some folds in EXACTF depend on if the runtime target string
3955  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
3956  *      when no fold in it depends on the UTF-8ness of the target string.)
3957  *
3958  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3959  *      validity of the fold won't be known until runtime, and so must remain
3960  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
3961  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3962  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3963  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3964  *      The reason this is a problem is that the optimizer part of regexec.c
3965  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3966  *      that a character in the pattern corresponds to at most a single
3967  *      character in the target string.  (And I do mean character, and not byte
3968  *      here, unlike other parts of the documentation that have never been
3969  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
3970  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
3971  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
3972  *      EXACTFL nodes, violate the assumption, and they are the only instances
3973  *      where it is violated.  I'm reluctant to try to change the assumption,
3974  *      as the code involved is impenetrable to me (khw), so instead the code
3975  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
3976  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
3977  *      boolean indicating whether or not the node contains such a fold.  When
3978  *      it is true, the caller sets a flag that later causes the optimizer in
3979  *      this file to not set values for the floating and fixed string lengths,
3980  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3981  *      assumption.  Thus, there is no optimization based on string lengths for
3982  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3983  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
3984  *      assumption is wrong only in these cases is that all other non-UTF-8
3985  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3986  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3987  *      EXACTF nodes because we don't know at compile time if it actually
3988  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3989  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3990  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
3991  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3992  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3993  *      string would require the pattern to be forced into UTF-8, the overhead
3994  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3995  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3996  *      locale.)
3997  *
3998  *      Similarly, the code that generates tries doesn't currently handle
3999  *      not-already-folded multi-char folds, and it looks like a pain to change
4000  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
4001  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
4002  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
4003  *      using /iaa matching will be doing so almost entirely with ASCII
4004  *      strings, so this should rarely be encountered in practice */
4005
4006 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags)    \
4007     if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT         \
4008                                       && OP(scan) != LEXACT_REQ8)  \
4009         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
4010
4011 STATIC U32
4012 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4013                    UV *min_subtract, bool *unfolded_multi_char,
4014                    U32 flags, regnode *val, U32 depth)
4015 {
4016     /* Merge several consecutive EXACTish nodes into one. */
4017
4018     regnode *n = regnext(scan);
4019     U32 stringok = 1;
4020     regnode *next = scan + NODE_SZ_STR(scan);
4021     U32 merged = 0;
4022     U32 stopnow = 0;
4023 #ifdef DEBUGGING
4024     regnode *stop = scan;
4025     GET_RE_DEBUG_FLAGS_DECL;
4026 #else
4027     PERL_UNUSED_ARG(depth);
4028 #endif
4029
4030     PERL_ARGS_ASSERT_JOIN_EXACT;
4031 #ifndef EXPERIMENTAL_INPLACESCAN
4032     PERL_UNUSED_ARG(flags);
4033     PERL_UNUSED_ARG(val);
4034 #endif
4035     DEBUG_PEEP("join", scan, depth, 0);
4036
4037     assert(PL_regkind[OP(scan)] == EXACT);
4038
4039     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
4040      * EXACT ones that are mergeable to the current one. */
4041     while (    n
4042            && (    PL_regkind[OP(n)] == NOTHING
4043                || (stringok && PL_regkind[OP(n)] == EXACT))
4044            && NEXT_OFF(n)
4045            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4046     {
4047
4048         if (OP(n) == TAIL || n > next)
4049             stringok = 0;
4050         if (PL_regkind[OP(n)] == NOTHING) {
4051             DEBUG_PEEP("skip:", n, depth, 0);
4052             NEXT_OFF(scan) += NEXT_OFF(n);
4053             next = n + NODE_STEP_REGNODE;
4054 #ifdef DEBUGGING
4055             if (stringok)
4056                 stop = n;
4057 #endif
4058             n = regnext(n);
4059         }
4060         else if (stringok) {
4061             const unsigned int oldl = STR_LEN(scan);
4062             regnode * const nnext = regnext(n);
4063
4064             /* XXX I (khw) kind of doubt that this works on platforms (should
4065              * Perl ever run on one) where U8_MAX is above 255 because of lots
4066              * of other assumptions */
4067             /* Don't join if the sum can't fit into a single node */
4068             if (oldl + STR_LEN(n) > U8_MAX)
4069                 break;
4070
4071             /* Joining something that requires UTF-8 with something that
4072              * doesn't, means the result requires UTF-8. */
4073             if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4074                 OP(scan) = EXACT_REQ8;
4075             }
4076             else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4077                 ;   /* join is compatible, no need to change OP */
4078             }
4079             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4080                 OP(scan) = EXACTFU_REQ8;
4081             }
4082             else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4083                 ;   /* join is compatible, no need to change OP */
4084             }
4085             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4086                 ;   /* join is compatible, no need to change OP */
4087             }
4088             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4089
4090                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4091                   * which can join with EXACTFU ones.  We check for this case
4092                   * here.  These need to be resolved to either EXACTFU or
4093                   * EXACTF at joining time.  They have nothing in them that
4094                   * would forbid them from being the more desirable EXACTFU
4095                   * nodes except that they begin and/or end with a single [Ss].
4096                   * The reason this is problematic is because they could be
4097                   * joined in this loop with an adjacent node that ends and/or
4098                   * begins with [Ss] which would then form the sequence 'ss',
4099                   * which matches differently under /di than /ui, in which case
4100                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
4101                   * formed, the nodes get absorbed into any adjacent EXACTFU
4102                   * node.  And if the only adjacent node is EXACTF, they get
4103                   * absorbed into that, under the theory that a longer node is
4104                   * better than two shorter ones, even if one is EXACTFU.  Note
4105                   * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4106                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
4107
4108                 if (STRING(n)[STR_LEN(n)-1] == 's') {
4109
4110                     /* Here the joined node would end with 's'.  If the node
4111                      * following the combination is an EXACTF one, it's better to
4112                      * join this trailing edge 's' node with that one, leaving the
4113                      * current one in 'scan' be the more desirable EXACTFU */
4114                     if (OP(nnext) == EXACTF) {
4115                         break;
4116                     }
4117
4118                     OP(scan) = EXACTFU_S_EDGE;
4119
4120                 }   /* Otherwise, the beginning 's' of the 2nd node just
4121                        becomes an interior 's' in 'scan' */
4122             }
4123             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4124                 ;   /* join is compatible, no need to change OP */
4125             }
4126             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4127
4128                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4129                  * nodes.  But the latter nodes can be also joined with EXACTFU
4130                  * ones, and that is a better outcome, so if the node following
4131                  * 'n' is EXACTFU, quit now so that those two can be joined
4132                  * later */
4133                 if (OP(nnext) == EXACTFU) {
4134                     break;
4135                 }
4136
4137                 /* The join is compatible, and the combined node will be
4138                  * EXACTF.  (These don't care if they begin or end with 's' */
4139             }
4140             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4141                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
4142                     && STRING(n)[0] == 's')
4143                 {
4144                     /* When combined, we have the sequence 'ss', which means we
4145                      * have to remain /di */
4146                     OP(scan) = EXACTF;
4147                 }
4148             }
4149             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4150                 if (STRING(n)[0] == 's') {
4151                     ;   /* Here the join is compatible and the combined node
4152                            starts with 's', no need to change OP */
4153                 }
4154                 else {  /* Now the trailing 's' is in the interior */
4155                     OP(scan) = EXACTFU;
4156                 }
4157             }
4158             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4159
4160                 /* The join is compatible, and the combined node will be
4161                  * EXACTF.  (These don't care if they begin or end with 's' */
4162                 OP(scan) = EXACTF;
4163             }
4164             else if (OP(scan) != OP(n)) {
4165
4166                 /* The only other compatible joinings are the same node type */
4167                 break;
4168             }
4169
4170             DEBUG_PEEP("merg", n, depth, 0);
4171             merged++;
4172
4173             NEXT_OFF(scan) += NEXT_OFF(n);
4174             setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n));
4175             next = n + NODE_SZ_STR(n);
4176             /* Now we can overwrite *n : */
4177             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4178 #ifdef DEBUGGING
4179             stop = next - 1;
4180 #endif
4181             n = nnext;
4182             if (stopnow) break;
4183         }
4184
4185 #ifdef EXPERIMENTAL_INPLACESCAN
4186         if (flags && !NEXT_OFF(n)) {
4187             DEBUG_PEEP("atch", val, depth, 0);
4188             if (reg_off_by_arg[OP(n)]) {
4189                 ARG_SET(n, val - n);
4190             }
4191             else {
4192                 NEXT_OFF(n) = val - n;
4193             }
4194             stopnow = 1;
4195         }
4196 #endif
4197     }
4198
4199     /* This temporary node can now be turned into EXACTFU, and must, as
4200      * regexec.c doesn't handle it */
4201     if (OP(scan) == EXACTFU_S_EDGE) {
4202         OP(scan) = EXACTFU;
4203     }
4204
4205     *min_subtract = 0;
4206     *unfolded_multi_char = FALSE;
4207
4208     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
4209      * can now analyze for sequences of problematic code points.  (Prior to
4210      * this final joining, sequences could have been split over boundaries, and
4211      * hence missed).  The sequences only happen in folding, hence for any
4212      * non-EXACT EXACTish node */
4213     if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4214         U8* s0 = (U8*) STRING(scan);
4215         U8* s = s0;
4216         U8* s_end = s0 + STR_LEN(scan);
4217
4218         int total_count_delta = 0;  /* Total delta number of characters that
4219                                        multi-char folds expand to */
4220
4221         /* One pass is made over the node's string looking for all the
4222          * possibilities.  To avoid some tests in the loop, there are two main
4223          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4224          * non-UTF-8 */
4225         if (UTF) {
4226             U8* folded = NULL;
4227
4228             if (OP(scan) == EXACTFL) {
4229                 U8 *d;
4230
4231                 /* An EXACTFL node would already have been changed to another
4232                  * node type unless there is at least one character in it that
4233                  * is problematic; likely a character whose fold definition
4234                  * won't be known until runtime, and so has yet to be folded.
4235                  * For all but the UTF-8 locale, folds are 1-1 in length, but
4236                  * to handle the UTF-8 case, we need to create a temporary
4237                  * folded copy using UTF-8 locale rules in order to analyze it.
4238                  * This is because our macros that look to see if a sequence is
4239                  * a multi-char fold assume everything is folded (otherwise the
4240                  * tests in those macros would be too complicated and slow).
4241                  * Note that here, the non-problematic folds will have already
4242                  * been done, so we can just copy such characters.  We actually
4243                  * don't completely fold the EXACTFL string.  We skip the
4244                  * unfolded multi-char folds, as that would just create work
4245                  * below to figure out the size they already are */
4246
4247                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4248                 d = folded;
4249                 while (s < s_end) {
4250                     STRLEN s_len = UTF8SKIP(s);
4251                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4252                         Copy(s, d, s_len, U8);
4253                         d += s_len;
4254                     }
4255                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
4256                         *unfolded_multi_char = TRUE;
4257                         Copy(s, d, s_len, U8);
4258                         d += s_len;
4259                     }
4260                     else if (isASCII(*s)) {
4261                         *(d++) = toFOLD(*s);
4262                     }
4263                     else {
4264                         STRLEN len;
4265                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4266                         d += len;
4267                     }
4268                     s += s_len;
4269                 }
4270
4271                 /* Point the remainder of the routine to look at our temporary
4272                  * folded copy */
4273                 s = folded;
4274                 s_end = d;
4275             } /* End of creating folded copy of EXACTFL string */
4276
4277             /* Examine the string for a multi-character fold sequence.  UTF-8
4278              * patterns have all characters pre-folded by the time this code is
4279              * executed */
4280             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4281                                      length sequence we are looking for is 2 */
4282             {
4283                 int count = 0;  /* How many characters in a multi-char fold */
4284                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4285                 if (! len) {    /* Not a multi-char fold: get next char */
4286                     s += UTF8SKIP(s);
4287                     continue;
4288                 }
4289
4290                 { /* Here is a generic multi-char fold. */
4291                     U8* multi_end  = s + len;
4292
4293                     /* Count how many characters are in it.  In the case of
4294                      * /aa, no folds which contain ASCII code points are
4295                      * allowed, so check for those, and skip if found. */
4296                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4297                         count = utf8_length(s, multi_end);
4298                         s = multi_end;
4299                     }
4300                     else {
4301                         while (s < multi_end) {
4302                             if (isASCII(*s)) {
4303                                 s++;
4304                                 goto next_iteration;
4305                             }
4306                             else {
4307                                 s += UTF8SKIP(s);
4308                             }
4309                             count++;
4310                         }
4311                     }
4312                 }
4313
4314                 /* The delta is how long the sequence is minus 1 (1 is how long
4315                  * the character that folds to the sequence is) */
4316                 total_count_delta += count - 1;
4317               next_iteration: ;
4318             }
4319
4320             /* We created a temporary folded copy of the string in EXACTFL
4321              * nodes.  Therefore we need to be sure it doesn't go below zero,
4322              * as the real string could be shorter */
4323             if (OP(scan) == EXACTFL) {
4324                 int total_chars = utf8_length((U8*) STRING(scan),
4325                                            (U8*) STRING(scan) + STR_LEN(scan));
4326                 if (total_count_delta > total_chars) {
4327                     total_count_delta = total_chars;
4328                 }
4329             }
4330
4331             *min_subtract += total_count_delta;
4332             Safefree(folded);
4333         }
4334         else if (OP(scan) == EXACTFAA) {
4335
4336             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
4337              * fold to the ASCII range (and there are no existing ones in the
4338              * upper latin1 range).  But, as outlined in the comments preceding
4339              * this function, we need to flag any occurrences of the sharp s.
4340              * This character forbids trie formation (because of added
4341              * complexity) */
4342 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
4343    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
4344                                       || UNICODE_DOT_DOT_VERSION > 0)
4345             while (s < s_end) {
4346                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4347                     OP(scan) = EXACTFAA_NO_TRIE;
4348                     *unfolded_multi_char = TRUE;
4349                     break;
4350                 }
4351                 s++;
4352             }
4353         }
4354         else {
4355
4356             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
4357              * folds that are all Latin1.  As explained in the comments
4358              * preceding this function, we look also for the sharp s in EXACTF
4359              * and EXACTFL nodes; it can be in the final position.  Otherwise
4360              * we can stop looking 1 byte earlier because have to find at least
4361              * two characters for a multi-fold */
4362             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4363                               ? s_end
4364                               : s_end -1;
4365
4366             while (s < upper) {
4367                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4368                 if (! len) {    /* Not a multi-char fold. */
4369                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4370                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4371                     {
4372                         *unfolded_multi_char = TRUE;
4373                     }
4374                     s++;
4375                     continue;
4376                 }
4377
4378                 if (len == 2
4379                     && isALPHA_FOLD_EQ(*s, 's')
4380                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4381                 {
4382
4383                     /* EXACTF nodes need to know that the minimum length
4384                      * changed so that a sharp s in the string can match this
4385                      * ss in the pattern, but they remain EXACTF nodes, as they
4386                      * won't match this unless the target string is is UTF-8,
4387                      * which we don't know until runtime.  EXACTFL nodes can't
4388                      * transform into EXACTFU nodes */
4389                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4390                         OP(scan) = EXACTFUP;
4391                     }
4392                 }
4393
4394                 *min_subtract += len - 1;
4395                 s += len;
4396             }
4397 #endif
4398         }
4399
4400         if (     STR_LEN(scan) == 1
4401             &&   isALPHA_A(* STRING(scan))
4402             &&  (         OP(scan) == EXACTFAA
4403                  || (     OP(scan) == EXACTFU
4404                      && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
4405         {
4406             U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
4407
4408             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
4409              * with the mask set to the complement of the bit that differs
4410              * between upper and lower case, and the lowest code point of the
4411              * pair (which the '&' forces) */
4412             OP(scan) = ANYOFM;
4413             ARG_SET(scan, *STRING(scan) & mask);
4414             FLAGS(scan) = mask;
4415         }
4416     }
4417
4418 #ifdef DEBUGGING
4419     /* Allow dumping but overwriting the collection of skipped
4420      * ops and/or strings with fake optimized ops */
4421     n = scan + NODE_SZ_STR(scan);
4422     while (n <= stop) {
4423         OP(n) = OPTIMIZED;
4424         FLAGS(n) = 0;
4425         NEXT_OFF(n) = 0;
4426         n++;
4427     }
4428 #endif
4429     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4430     return stopnow;
4431 }
4432
4433 /* REx optimizer.  Converts nodes into quicker variants "in place".
4434    Finds fixed substrings.  */
4435
4436 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4437    to the position after last scanned or to NULL. */
4438
4439 #define INIT_AND_WITHP \
4440     assert(!and_withp); \
4441     Newx(and_withp, 1, regnode_ssc); \
4442     SAVEFREEPV(and_withp)
4443
4444
4445 static void
4446 S_unwind_scan_frames(pTHX_ const void *p)
4447 {
4448     scan_frame *f= (scan_frame *)p;
4449     do {
4450         scan_frame *n= f->next_frame;
4451         Safefree(f);
4452         f= n;
4453     } while (f);
4454 }
4455
4456 /* the return from this sub is the minimum length that could possibly match */
4457 STATIC SSize_t
4458 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4459                         SSize_t *minlenp, SSize_t *deltap,
4460                         regnode *last,
4461                         scan_data_t *data,
4462                         I32 stopparen,
4463                         U32 recursed_depth,
4464                         regnode_ssc *and_withp,
4465                         U32 flags, U32 depth)
4466                         /* scanp: Start here (read-write). */
4467                         /* deltap: Write maxlen-minlen here. */
4468                         /* last: Stop before this one. */
4469                         /* data: string data about the pattern */
4470                         /* stopparen: treat close N as END */
4471                         /* recursed: which subroutines have we recursed into */
4472                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4473 {
4474     dVAR;
4475     /* There must be at least this number of characters to match */
4476     SSize_t min = 0;
4477     I32 pars = 0, code;
4478     regnode *scan = *scanp, *next;
4479     SSize_t delta = 0;
4480     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4481     int is_inf_internal = 0;            /* The studied chunk is infinite */
4482     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4483     scan_data_t data_fake;
4484     SV *re_trie_maxbuff = NULL;
4485     regnode *first_non_open = scan;
4486     SSize_t stopmin = SSize_t_MAX;
4487     scan_frame *frame = NULL;
4488     GET_RE_DEBUG_FLAGS_DECL;
4489
4490     PERL_ARGS_ASSERT_STUDY_CHUNK;
4491     RExC_study_started= 1;
4492
4493     Zero(&data_fake, 1, scan_data_t);
4494
4495     if ( depth == 0 ) {
4496         while (first_non_open && OP(first_non_open) == OPEN)
4497             first_non_open=regnext(first_non_open);
4498     }
4499
4500
4501   fake_study_recurse:
4502     DEBUG_r(
4503         RExC_study_chunk_recursed_count++;
4504     );
4505     DEBUG_OPTIMISE_MORE_r(
4506     {
4507         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4508             depth, (long)stopparen,
4509             (unsigned long)RExC_study_chunk_recursed_count,
4510             (unsigned long)depth, (unsigned long)recursed_depth,
4511             scan,
4512             last);
4513         if (recursed_depth) {
4514             U32 i;
4515             U32 j;
4516             for ( j = 0 ; j < recursed_depth ; j++ ) {
4517                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4518                     if (
4519                         PAREN_TEST(RExC_study_chunk_recursed +
4520                                    ( j * RExC_study_chunk_recursed_bytes), i )
4521                         && (
4522                             !j ||
4523                             !PAREN_TEST(RExC_study_chunk_recursed +
4524                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4525                         )
4526                     ) {
4527                         Perl_re_printf( aTHX_ " %d",(int)i);
4528                         break;
4529                     }
4530                 }
4531                 if ( j + 1 < recursed_depth ) {
4532                     Perl_re_printf( aTHX_  ",");
4533                 }
4534             }
4535         }
4536         Perl_re_printf( aTHX_ "\n");
4537     }
4538     );
4539     while ( scan && OP(scan) != END && scan < last ){
4540         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4541                                    node length to get a real minimum (because
4542                                    the folded version may be shorter) */
4543         bool unfolded_multi_char = FALSE;
4544         /* Peephole optimizer: */
4545         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4546         DEBUG_PEEP("Peep", scan, depth, flags);
4547
4548
4549         /* The reason we do this here is that we need to deal with things like
4550          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4551          * parsing code, as each (?:..) is handled by a different invocation of
4552          * reg() -- Yves
4553          */
4554         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4555
4556         /* Follow the next-chain of the current node and optimize
4557            away all the NOTHINGs from it.  */
4558         if (OP(scan) != CURLYX) {
4559             const int max = (reg_off_by_arg[OP(scan)]
4560                             ? I32_MAX
4561                               /* I32 may be smaller than U16 on CRAYs! */
4562                             : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4563             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4564             int noff;
4565             regnode *n = scan;
4566
4567             /* Skip NOTHING and LONGJMP. */
4568             while (   (n = regnext(n))
4569                    && (   (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4570                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4571                    && off + noff < max)
4572                 off += noff;
4573             if (reg_off_by_arg[OP(scan)])
4574                 ARG(scan) = off;
4575             else
4576                 NEXT_OFF(scan) = off;
4577         }
4578
4579         /* The principal pseudo-switch.  Cannot be a switch, since we look into
4580          * several different things.  */
4581         if ( OP(scan) == DEFINEP ) {
4582             SSize_t minlen = 0;
4583             SSize_t deltanext = 0;
4584             SSize_t fake_last_close = 0;
4585             I32 f = SCF_IN_DEFINE;
4586
4587             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4588             scan = regnext(scan);
4589             assert( OP(scan) == IFTHEN );
4590             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4591
4592             data_fake.last_closep= &fake_last_close;
4593             minlen = *minlenp;
4594             next = regnext(scan);
4595             scan = NEXTOPER(NEXTOPER(scan));
4596             DEBUG_PEEP("scan", scan, depth, flags);
4597             DEBUG_PEEP("next", next, depth, flags);
4598
4599             /* we suppose the run is continuous, last=next...
4600              * NOTE we dont use the return here! */
4601             /* DEFINEP study_chunk() recursion */
4602             (void)study_chunk(pRExC_state, &scan, &minlen,
4603                               &deltanext, next, &data_fake, stopparen,
4604                               recursed_depth, NULL, f, depth+1);
4605
4606             scan = next;
4607         } else
4608         if (
4609             OP(scan) == BRANCH  ||
4610             OP(scan) == BRANCHJ ||
4611             OP(scan) == IFTHEN
4612         ) {
4613             next = regnext(scan);
4614             code = OP(scan);
4615
4616             /* The op(next)==code check below is to see if we
4617              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4618              * IFTHEN is special as it might not appear in pairs.
4619              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4620              * we dont handle it cleanly. */
4621             if (OP(next) == code || code == IFTHEN) {
4622                 /* NOTE - There is similar code to this block below for
4623                  * handling TRIE nodes on a re-study.  If you change stuff here
4624                  * check there too. */
4625                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4626                 regnode_ssc accum;
4627                 regnode * const startbranch=scan;
4628
4629                 if (flags & SCF_DO_SUBSTR) {
4630                     /* Cannot merge strings after this. */
4631                     scan_commit(pRExC_state, data, minlenp, is_inf);
4632                 }
4633
4634                 if (flags & SCF_DO_STCLASS)
4635                     ssc_init_zero(pRExC_state, &accum);
4636
4637                 while (OP(scan) == code) {
4638                     SSize_t deltanext, minnext, fake;
4639                     I32 f = 0;
4640                     regnode_ssc this_class;
4641
4642                     DEBUG_PEEP("Branch", scan, depth, flags);
4643
4644                     num++;
4645                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4646                     if (data) {
4647                         data_fake.whilem_c = data->whilem_c;
4648                         data_fake.last_closep = data->last_closep;
4649                     }
4650                     else
4651                         data_fake.last_closep = &fake;
4652
4653                     data_fake.pos_delta = delta;
4654                     next = regnext(scan);
4655
4656                     scan = NEXTOPER(scan); /* everything */
4657                     if (code != BRANCH)    /* everything but BRANCH */
4658                         scan = NEXTOPER(scan);
4659
4660                     if (flags & SCF_DO_STCLASS) {
4661                         ssc_init(pRExC_state, &this_class);
4662                         data_fake.start_class = &this_class;
4663                         f = SCF_DO_STCLASS_AND;
4664                     }
4665                     if (flags & SCF_WHILEM_VISITED_POS)
4666                         f |= SCF_WHILEM_VISITED_POS;
4667
4668                     /* we suppose the run is continuous, last=next...*/
4669                     /* recurse study_chunk() for each BRANCH in an alternation */
4670                     minnext = study_chunk(pRExC_state, &scan, minlenp,
4671                                       &deltanext, next, &data_fake, stopparen,
4672                                       recursed_depth, NULL, f, depth+1);
4673
4674                     if (min1 > minnext)
4675                         min1 = minnext;
4676                     if (deltanext == SSize_t_MAX) {
4677                         is_inf = is_inf_internal = 1;
4678                         max1 = SSize_t_MAX;
4679                     } else if (max1 < minnext + deltanext)
4680                         max1 = minnext + deltanext;
4681                     scan = next;
4682                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4683                         pars++;
4684                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4685                         if ( stopmin > minnext)
4686                             stopmin = min + min1;
4687                         flags &= ~SCF_DO_SUBSTR;
4688                         if (data)
4689                             data->flags |= SCF_SEEN_ACCEPT;
4690                     }
4691                     if (data) {
4692                         if (data_fake.flags & SF_HAS_EVAL)
4693                             data->flags |= SF_HAS_EVAL;
4694                         data->whilem_c = data_fake.whilem_c;
4695                     }
4696                     if (flags & SCF_DO_STCLASS)
4697                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4698                 }
4699                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4700                     min1 = 0;
4701                 if (flags & SCF_DO_SUBSTR) {
4702                     data->pos_min += min1;
4703                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4704                         data->pos_delta = SSize_t_MAX;
4705                     else
4706                         data->pos_delta += max1 - min1;
4707                     if (max1 != min1 || is_inf)
4708                         data->cur_is_floating = 1;
4709                 }
4710                 min += min1;
4711                 if (delta == SSize_t_MAX
4712                  || SSize_t_MAX - delta - (max1 - min1) < 0)
4713                     delta = SSize_t_MAX;
4714                 else
4715                     delta += max1 - min1;
4716                 if (flags & SCF_DO_STCLASS_OR) {
4717                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4718                     if (min1) {
4719                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4720                         flags &= ~SCF_DO_STCLASS;
4721                     }
4722                 }
4723                 else if (flags & SCF_DO_STCLASS_AND) {
4724                     if (min1) {
4725                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4726                         flags &= ~SCF_DO_STCLASS;
4727                     }
4728                     else {
4729                         /* Switch to OR mode: cache the old value of
4730                          * data->start_class */
4731                         INIT_AND_WITHP;
4732                         StructCopy(data->start_class, and_withp, regnode_ssc);
4733                         flags &= ~SCF_DO_STCLASS_AND;
4734                         StructCopy(&accum, data->start_class, regnode_ssc);
4735                         flags |= SCF_DO_STCLASS_OR;
4736                     }
4737                 }
4738
4739                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4740                         OP( startbranch ) == BRANCH )
4741                 {
4742                 /* demq.
4743
4744                    Assuming this was/is a branch we are dealing with: 'scan'
4745                    now points at the item that follows the branch sequence,
4746                    whatever it is. We now start at the beginning of the
4747                    sequence and look for subsequences of
4748
4749                    BRANCH->EXACT=>x1
4750                    BRANCH->EXACT=>x2
4751                    tail
4752
4753                    which would be constructed from a pattern like
4754                    /A|LIST|OF|WORDS/
4755
4756                    If we can find such a subsequence we need to turn the first
4757                    element into a trie and then add the subsequent branch exact
4758                    strings to the trie.
4759
4760                    We have two cases
4761
4762                      1. patterns where the whole set of branches can be
4763                         converted.
4764
4765                      2. patterns where only a subset can be converted.
4766
4767                    In case 1 we can replace the whole set with a single regop
4768                    for the trie. In case 2 we need to keep the start and end
4769                    branches so
4770
4771                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4772                      becomes BRANCH TRIE; BRANCH X;
4773
4774                   There is an additional case, that being where there is a
4775                   common prefix, which gets split out into an EXACT like node
4776                   preceding the TRIE node.
4777
4778                   If x(1..n)==tail then we can do a simple trie, if not we make
4779                   a "jump" trie, such that when we match the appropriate word
4780                   we "jump" to the appropriate tail node. Essentially we turn
4781                   a nested if into a case structure of sorts.
4782
4783                 */
4784
4785                     int made=0;
4786                     if (!re_trie_maxbuff) {
4787                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4788                         if (!SvIOK(re_trie_maxbuff))
4789                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4790                     }
4791                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4792                         regnode *cur;
4793                         regnode *first = (regnode *)NULL;
4794                         regnode *prev = (regnode *)NULL;
4795                         regnode *tail = scan;
4796                         U8 trietype = 0;
4797                         U32 count=0;
4798
4799                         /* var tail is used because there may be a TAIL
4800                            regop in the way. Ie, the exacts will point to the
4801                            thing following the TAIL, but the last branch will
4802                            point at the TAIL. So we advance tail. If we
4803                            have nested (?:) we may have to move through several
4804                            tails.
4805                          */
4806
4807                         while ( OP( tail ) == TAIL ) {
4808                             /* this is the TAIL generated by (?:) */
4809                             tail = regnext( tail );
4810                         }
4811
4812
4813                         DEBUG_TRIE_COMPILE_r({
4814                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4815                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
4816                               depth+1,
4817                               "Looking for TRIE'able sequences. Tail node is ",
4818                               (UV) REGNODE_OFFSET(tail),
4819                               SvPV_nolen_const( RExC_mysv )
4820                             );
4821                         });
4822
4823                         /*
4824
4825                             Step through the branches
4826                                 cur represents each branch,
4827                                 noper is the first thing to be matched as part
4828                                       of that branch
4829                                 noper_next is the regnext() of that node.
4830
4831                             We normally handle a case like this
4832                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4833                             support building with NOJUMPTRIE, which restricts
4834                             the trie logic to structures like /FOO|BAR/.
4835
4836                             If noper is a trieable nodetype then the branch is
4837                             a possible optimization target. If we are building
4838                             under NOJUMPTRIE then we require that noper_next is
4839                             the same as scan (our current position in the regex
4840                             program).
4841
4842                             Once we have two or more consecutive such branches
4843                             we can create a trie of the EXACT's contents and
4844                             stitch it in place into the program.
4845
4846                             If the sequence represents all of the branches in
4847                             the alternation we replace the entire thing with a
4848                             single TRIE node.
4849
4850                             Otherwise when it is a subsequence we need to
4851                             stitch it in place and replace only the relevant
4852                             branches. This means the first branch has to remain
4853                             as it is used by the alternation logic, and its
4854                             next pointer, and needs to be repointed at the item
4855                             on the branch chain following the last branch we
4856                             have optimized away.
4857
4858                             This could be either a BRANCH, in which case the
4859                             subsequence is internal, or it could be the item
4860                             following the branch sequence in which case the
4861                             subsequence is at the end (which does not
4862                             necessarily mean the first node is the start of the
4863                             alternation).
4864
4865                             TRIE_TYPE(X) is a define which maps the optype to a
4866                             trietype.
4867
4868                                 optype          |  trietype
4869                                 ----------------+-----------
4870                                 NOTHING         | NOTHING
4871                                 EXACT           | EXACT
4872                                 EXACT_REQ8     | EXACT
4873                                 EXACTFU         | EXACTFU
4874                                 EXACTFU_REQ8   | EXACTFU
4875                                 EXACTFUP        | EXACTFU
4876                                 EXACTFAA        | EXACTFAA
4877                                 EXACTL          | EXACTL
4878                                 EXACTFLU8       | EXACTFLU8
4879
4880
4881                         */
4882 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4883                        ? NOTHING                                            \
4884                        : ( EXACT == (X) || EXACT_REQ8 == (X) )             \
4885                          ? EXACT                                            \
4886                          : (     EXACTFU == (X)                             \
4887                               || EXACTFU_REQ8 == (X)                       \
4888                               || EXACTFUP == (X) )                          \
4889                            ? EXACTFU                                        \
4890                            : ( EXACTFAA == (X) )                            \
4891                              ? EXACTFAA                                     \
4892                              : ( EXACTL == (X) )                            \
4893                                ? EXACTL                                     \
4894                                : ( EXACTFLU8 == (X) )                       \
4895                                  ? EXACTFLU8                                \
4896                                  : 0 )
4897
4898                         /* dont use tail as the end marker for this traverse */
4899                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4900                             regnode * const noper = NEXTOPER( cur );
4901                             U8 noper_type = OP( noper );
4902                             U8 noper_trietype = TRIE_TYPE( noper_type );
4903 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4904                             regnode * const noper_next = regnext( noper );
4905                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4906                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4907 #endif
4908
4909                             DEBUG_TRIE_COMPILE_r({
4910                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4911                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
4912                                    depth+1,
4913                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4914
4915                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4916                                 Perl_re_printf( aTHX_  " -> %d:%s",
4917                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4918
4919                                 if ( noper_next ) {
4920                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4921                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4922                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4923                                 }
4924                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4925                                    REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
4926                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4927                                 );
4928                             });
4929
4930                             /* Is noper a trieable nodetype that can be merged
4931                              * with the current trie (if there is one)? */
4932                             if ( noper_trietype
4933                                   &&
4934                                   (
4935                                         ( noper_trietype == NOTHING )
4936                                         || ( trietype == NOTHING )
4937                                         || ( trietype == noper_trietype )
4938                                   )
4939 #ifdef NOJUMPTRIE
4940                                   && noper_next >= tail
4941 #endif
4942                                   && count < U16_MAX)
4943                             {
4944                                 /* Handle mergable triable node Either we are
4945                                  * the first node in a new trieable sequence,
4946                                  * in which case we do some bookkeeping,
4947                                  * otherwise we update the end pointer. */
4948                                 if ( !first ) {
4949                                     first = cur;
4950                                     if ( noper_trietype == NOTHING ) {
4951 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4952                                         regnode * const noper_next = regnext( noper );
4953                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4954                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4955 #endif
4956
4957                                         if ( noper_next_trietype ) {
4958                                             trietype = noper_next_trietype;
4959                                         } else if (noper_next_type)  {
4960                                             /* a NOTHING regop is 1 regop wide.
4961                                              * We need at least two for a trie
4962                                              * so we can't merge this in */
4963                                             first = NULL;
4964                                         }
4965                                     } else {
4966                                         trietype = noper_trietype;
4967                                     }
4968                                 } else {
4969                                     if ( trietype == NOTHING )
4970                                         trietype = noper_trietype;
4971                                     prev = cur;
4972                                 }
4973                                 if (first)
4974                                     count++;
4975                             } /* end handle mergable triable node */
4976                             else {
4977                                 /* handle unmergable node -
4978                                  * noper may either be a triable node which can
4979                                  * not be tried together with the current trie,
4980                                  * or a non triable node */
4981                                 if ( prev ) {
4982                                     /* If last is set and trietype is not
4983                                      * NOTHING then we have found at least two
4984                                      * triable branch sequences in a row of a
4985                                      * similar trietype so we can turn them
4986                                      * into a trie. If/when we allow NOTHING to
4987                                      * start a trie sequence this condition
4988                                      * will be required, and it isn't expensive
4989                                      * so we leave it in for now. */
4990                                     if ( trietype && trietype != NOTHING )
4991                                         make_trie( pRExC_state,
4992                                                 startbranch, first, cur, tail,
4993                                                 count, trietype, depth+1 );
4994                                     prev = NULL; /* note: we clear/update
4995                                                     first, trietype etc below,
4996                                                     so we dont do it here */
4997                                 }
4998                                 if ( noper_trietype
4999 #ifdef NOJUMPTRIE
5000                                      && noper_next >= tail
5001 #endif
5002                                 ){
5003                                     /* noper is triable, so we can start a new
5004                                      * trie sequence */
5005                                     count = 1;
5006                                     first = cur;
5007                                     trietype = noper_trietype;
5008                                 } else if (first) {
5009                                     /* if we already saw a first but the
5010                                      * current node is not triable then we have
5011                                      * to reset the first information. */
5012                                     count = 0;
5013                                     first = NULL;
5014                                     trietype = 0;
5015                                 }
5016                             } /* end handle unmergable node */
5017                         } /* loop over branches */
5018                         DEBUG_TRIE_COMPILE_r({
5019                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5020                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
5021                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5022                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5023                                REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5024                                PL_reg_name[trietype]
5025                             );
5026
5027                         });
5028                         if ( prev && trietype ) {
5029                             if ( trietype != NOTHING ) {
5030                                 /* the last branch of the sequence was part of
5031                                  * a trie, so we have to construct it here
5032                                  * outside of the loop */
5033                                 made= make_trie( pRExC_state, startbranch,
5034                                                  first, scan, tail, count,
5035                                                  trietype, depth+1 );
5036 #ifdef TRIE_STUDY_OPT
5037                                 if ( ((made == MADE_EXACT_TRIE &&
5038                                      startbranch == first)
5039                                      || ( first_non_open == first )) &&
5040                                      depth==0 ) {
5041                                     flags |= SCF_TRIE_RESTUDY;
5042                                     if ( startbranch == first
5043                                          && scan >= tail )
5044                                     {
5045                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5046                                     }
5047                                 }
5048 #endif
5049                             } else {
5050                                 /* at this point we know whatever we have is a
5051                                  * NOTHING sequence/branch AND if 'startbranch'
5052                                  * is 'first' then we can turn the whole thing
5053                                  * into a NOTHING
5054                                  */
5055                                 if ( startbranch == first ) {
5056                                     regnode *opt;
5057                                     /* the entire thing is a NOTHING sequence,
5058                                      * something like this: (?:|) So we can
5059                                      * turn it into a plain NOTHING op. */
5060                                     DEBUG_TRIE_COMPILE_r({
5061                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5062                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5063                                           depth+1,
5064                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5065
5066                                     });
5067                                     OP(startbranch)= NOTHING;
5068                                     NEXT_OFF(startbranch)= tail - startbranch;
5069                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
5070                                         OP(opt)= OPTIMIZED;
5071                                 }
5072                             }
5073                         } /* end if ( prev) */
5074                     } /* TRIE_MAXBUF is non zero */
5075                 } /* do trie */
5076
5077             }
5078             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
5079                 scan = NEXTOPER(NEXTOPER(scan));
5080             } else                      /* single branch is optimized. */
5081                 scan = NEXTOPER(scan);
5082             continue;
5083         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5084             I32 paren = 0;
5085             regnode *start = NULL;
5086             regnode *end = NULL;
5087             U32 my_recursed_depth= recursed_depth;
5088
5089             if (OP(scan) != SUSPEND) { /* GOSUB */
5090                 /* Do setup, note this code has side effects beyond
5091                  * the rest of this block. Specifically setting
5092                  * RExC_recurse[] must happen at least once during
5093                  * study_chunk(). */
5094                 paren = ARG(scan);
5095                 RExC_recurse[ARG2L(scan)] = scan;
5096                 start = REGNODE_p(RExC_open_parens[paren]);
5097                 end   = REGNODE_p(RExC_close_parens[paren]);
5098
5099                 /* NOTE we MUST always execute the above code, even
5100                  * if we do nothing with a GOSUB */
5101                 if (
5102                     ( flags & SCF_IN_DEFINE )
5103                     ||
5104                     (
5105                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5106                         &&
5107                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5108                     )
5109                 ) {
5110                     /* no need to do anything here if we are in a define. */
5111                     /* or we are after some kind of infinite construct
5112                      * so we can skip recursing into this item.
5113                      * Since it is infinite we will not change the maxlen
5114                      * or delta, and if we miss something that might raise
5115                      * the minlen it will merely pessimise a little.
5116                      *
5117                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5118                      * might result in a minlen of 1 and not of 4,
5119                      * but this doesn't make us mismatch, just try a bit
5120                      * harder than we should.
5121                      * */
5122                     scan= regnext(scan);
5123                     continue;
5124                 }
5125
5126                 if (
5127                     !recursed_depth
5128                     ||
5129                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
5130                 ) {
5131                     /* it is quite possible that there are more efficient ways
5132                      * to do this. We maintain a bitmap per level of recursion
5133                      * of which patterns we have entered so we can detect if a
5134                      * pattern creates a possible infinite loop. When we
5135                      * recurse down a level we copy the previous levels bitmap
5136                      * down. When we are at recursion level 0 we zero the top
5137                      * level bitmap. It would be nice to implement a different
5138                      * more efficient way of doing this. In particular the top
5139                      * level bitmap may be unnecessary.
5140                      */
5141                     if (!recursed_depth) {
5142                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5143                     } else {
5144                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
5145                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
5146                              RExC_study_chunk_recursed_bytes, U8);
5147                     }
5148                     /* we havent recursed into this paren yet, so recurse into it */
5149                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5150                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
5151                     my_recursed_depth= recursed_depth + 1;
5152                 } else {
5153                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5154                     /* some form of infinite recursion, assume infinite length
5155                      * */
5156                     if (flags & SCF_DO_SUBSTR) {
5157                         scan_commit(pRExC_state, data, minlenp, is_inf);
5158                         data->cur_is_floating = 1;
5159                     }
5160                     is_inf = is_inf_internal = 1;
5161                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5162                         ssc_anything(data->start_class);
5163                     flags &= ~SCF_DO_STCLASS;
5164
5165                     start= NULL; /* reset start so we dont recurse later on. */
5166                 }
5167             } else {
5168                 paren = stopparen;
5169                 start = scan + 2;
5170                 end = regnext(scan);
5171             }
5172             if (start) {
5173                 scan_frame *newframe;
5174                 assert(end);
5175                 if (!RExC_frame_last) {
5176                     Newxz(newframe, 1, scan_frame);
5177                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5178                     RExC_frame_head= newframe;
5179                     RExC_frame_count++;
5180                 } else if (!RExC_frame_last->next_frame) {
5181                     Newxz(newframe, 1, scan_frame);
5182                     RExC_frame_last->next_frame= newframe;
5183                     newframe->prev_frame= RExC_frame_last;
5184                     RExC_frame_count++;
5185                 } else {
5186                     newframe= RExC_frame_last->next_frame;
5187                 }
5188                 RExC_frame_last= newframe;
5189
5190                 newframe->next_regnode = regnext(scan);
5191                 newframe->last_regnode = last;
5192                 newframe->stopparen = stopparen;
5193                 newframe->prev_recursed_depth = recursed_depth;
5194                 newframe->this_prev_frame= frame;
5195
5196                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5197                 DEBUG_PEEP("fnew", scan, depth, flags);
5198
5199                 frame = newframe;
5200                 scan =  start;
5201                 stopparen = paren;
5202                 last = end;
5203                 depth = depth + 1;
5204                 recursed_depth= my_recursed_depth;
5205
5206                 continue;
5207             }
5208         }
5209         else if (   OP(scan) == EXACT
5210                  || OP(scan) == LEXACT
5211                  || OP(scan) == EXACT_REQ8
5212                  || OP(scan) == LEXACT_REQ8
5213                  || OP(scan) == EXACTL)
5214         {
5215             SSize_t l = STR_LEN(scan);
5216             UV uc;
5217             assert(l);
5218             if (UTF) {
5219                 const U8 * const s = (U8*)STRING(scan);
5220                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
5221                 l = utf8_length(s, s + l);
5222             } else {
5223                 uc = *((U8*)STRING(scan));
5224             }
5225             min += l;
5226             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5227                 /* The code below prefers earlier match for fixed
5228                    offset, later match for variable offset.  */
5229                 if (data->last_end == -1) { /* Update the start info. */
5230                     data->last_start_min = data->pos_min;
5231                     data->last_start_max = is_inf
5232                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
5233                 }
5234                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
5235                 if (UTF)
5236                     SvUTF8_on(data->last_found);
5237                 {
5238                     SV * const sv = data->last_found;
5239                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5240                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5241                     if (mg && mg->mg_len >= 0)
5242                         mg->mg_len += utf8_length((U8*)STRING(scan),
5243                                               (U8*)STRING(scan)+STR_LEN(scan));
5244                 }
5245                 data->last_end = data->pos_min + l;
5246                 data->pos_min += l; /* As in the first entry. */
5247                 data->flags &= ~SF_BEFORE_EOL;
5248             }
5249
5250             /* ANDing the code point leaves at most it, and not in locale, and
5251              * can't match null string */
5252             if (flags & SCF_DO_STCLASS_AND) {
5253                 ssc_cp_and(data->start_class, uc);
5254                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5255                 ssc_clear_locale(data->start_class);
5256             }
5257             else if (flags & SCF_DO_STCLASS_OR) {
5258                 ssc_add_cp(data->start_class, uc);
5259                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5260
5261                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5262                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5263             }
5264             flags &= ~SCF_DO_STCLASS;
5265         }
5266         else if (PL_regkind[OP(scan)] == EXACT) {
5267             /* But OP != EXACT!, so is EXACTFish */
5268             SSize_t l = STR_LEN(scan);
5269             const U8 * s = (U8*)STRING(scan);
5270
5271             /* Search for fixed substrings supports EXACT only. */
5272             if (flags & SCF_DO_SUBSTR) {
5273                 assert(data);
5274                 scan_commit(pRExC_state, data, minlenp, is_inf);
5275             }
5276             if (UTF) {
5277                 l = utf8_length(s, s + l);
5278             }
5279             if (unfolded_multi_char) {
5280                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5281             }
5282             min += l - min_subtract;
5283             assert (min >= 0);
5284             delta += min_subtract;
5285             if (flags & SCF_DO_SUBSTR) {
5286                 data->pos_min += l - min_subtract;
5287                 if (data->pos_min < 0) {
5288                     data->pos_min = 0;
5289                 }
5290                 data->pos_delta += min_subtract;
5291                 if (min_subtract) {
5292                     data->cur_is_floating = 1; /* float */
5293                 }
5294             }
5295
5296             if (flags & SCF_DO_STCLASS) {
5297                 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5298
5299                 assert(EXACTF_invlist);
5300                 if (flags & SCF_DO_STCLASS_AND) {
5301                     if (OP(scan) != EXACTFL)
5302                         ssc_clear_locale(data->start_class);
5303                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5304                     ANYOF_POSIXL_ZERO(data->start_class);
5305                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5306                 }
5307                 else {  /* SCF_DO_STCLASS_OR */
5308                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
5309                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5310
5311                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5312                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5313                 }
5314                 flags &= ~SCF_DO_STCLASS;
5315                 SvREFCNT_dec(EXACTF_invlist);
5316             }
5317         }
5318         else if (REGNODE_VARIES(OP(scan))) {
5319             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5320             I32 fl = 0, f = flags;
5321             regnode * const oscan = scan;
5322             regnode_ssc this_class;
5323             regnode_ssc *oclass = NULL;
5324             I32 next_is_eval = 0;
5325
5326             switch (PL_regkind[OP(scan)]) {
5327             case WHILEM:                /* End of (?:...)* . */
5328                 scan = NEXTOPER(scan);
5329                 goto finish;
5330             case PLUS:
5331                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5332                     next = NEXTOPER(scan);
5333                     if (   OP(next) == EXACT
5334                         || OP(next) == LEXACT
5335                         || OP(next) == EXACT_REQ8
5336                         || OP(next) == LEXACT_REQ8
5337                         || OP(next) == EXACTL
5338                         || (flags & SCF_DO_STCLASS))
5339                     {
5340                         mincount = 1;
5341                         maxcount = REG_INFTY;
5342                         next = regnext(scan);
5343                         scan = NEXTOPER(scan);
5344                         goto do_curly;
5345                     }
5346                 }
5347                 if (flags & SCF_DO_SUBSTR)
5348                     data->pos_min++;
5349                 min++;
5350                 /* FALLTHROUGH */
5351             case STAR:
5352                 next = NEXTOPER(scan);
5353
5354                 /* This temporary node can now be turned into EXACTFU, and
5355                  * must, as regexec.c doesn't handle it */
5356                 if (OP(next) == EXACTFU_S_EDGE) {
5357                     OP(next) = EXACTFU;
5358                 }
5359
5360                 if (     STR_LEN(next) == 1
5361                     &&   isALPHA_A(* STRING(next))
5362                     && (         OP(next) == EXACTFAA
5363                         || (     OP(next) == EXACTFU
5364                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
5365                 {
5366                     /* These differ in just one bit */
5367                     U8 mask = ~ ('A' ^ 'a');
5368
5369                     assert(isALPHA_A(* STRING(next)));
5370
5371                     /* Then replace it by an ANYOFM node, with
5372                     * the mask set to the complement of the
5373                     * bit that differs between upper and lower
5374                     * case, and the lowest code point of the
5375                     * pair (which the '&' forces) */
5376                     OP(next) = ANYOFM;
5377                     ARG_SET(next, *STRING(next) & mask);
5378                     FLAGS(next) = mask;
5379                 }
5380
5381                 if (flags & SCF_DO_STCLASS) {
5382                     mincount = 0;
5383                     maxcount = REG_INFTY;
5384                     next = regnext(scan);
5385                     scan = NEXTOPER(scan);
5386                     goto do_curly;
5387                 }
5388                 if (flags & SCF_DO_SUBSTR) {
5389                     scan_commit(pRExC_state, data, minlenp, is_inf);
5390                     /* Cannot extend fixed substrings */
5391                     data->cur_is_floating = 1; /* float */
5392                 }
5393                 is_inf = is_inf_internal = 1;
5394                 scan = regnext(scan);
5395                 goto optimize_curly_tail;
5396             case CURLY:
5397                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5398                     && (scan->flags == stopparen))
5399                 {
5400                     mincount = 1;
5401                     maxcount = 1;
5402                 } else {
5403                     mincount = ARG1(scan);
5404                     maxcount = ARG2(scan);
5405                 }
5406                 next = regnext(scan);
5407                 if (OP(scan) == CURLYX) {
5408                     I32 lp = (data ? *(data->last_closep) : 0);
5409                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5410                 }
5411                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5412                 next_is_eval = (OP(scan) == EVAL);
5413               do_curly:
5414                 if (flags & SCF_DO_SUBSTR) {
5415                     if (mincount == 0)
5416                         scan_commit(pRExC_state, data, minlenp, is_inf);
5417                     /* Cannot extend fixed substrings */
5418                     pos_before = data->pos_min;
5419                 }
5420                 if (data) {
5421                     fl = data->flags;
5422                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5423                     if (is_inf)
5424                         data->flags |= SF_IS_INF;
5425                 }
5426                 if (flags & SCF_DO_STCLASS) {
5427                     ssc_init(pRExC_state, &this_class);
5428                     oclass = data->start_class;
5429                     data->start_class = &this_class;
5430                     f |= SCF_DO_STCLASS_AND;
5431                     f &= ~SCF_DO_STCLASS_OR;
5432                 }
5433                 /* Exclude from super-linear cache processing any {n,m}
5434                    regops for which the combination of input pos and regex
5435                    pos is not enough information to determine if a match
5436                    will be possible.
5437
5438                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
5439                    regex pos at the \s*, the prospects for a match depend not
5440                    only on the input position but also on how many (bar\s*)
5441                    repeats into the {4,8} we are. */
5442                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5443                     f &= ~SCF_WHILEM_VISITED_POS;
5444
5445                 /* This will finish on WHILEM, setting scan, or on NULL: */
5446                 /* recurse study_chunk() on loop bodies */
5447                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5448                                   last, data, stopparen, recursed_depth, NULL,
5449                                   (mincount == 0
5450                                    ? (f & ~SCF_DO_SUBSTR)
5451                                    : f)
5452                                   ,depth+1);
5453
5454                 if (flags & SCF_DO_STCLASS)
5455                     data->start_class = oclass;
5456                 if (mincount == 0 || minnext == 0) {
5457                     if (flags & SCF_DO_STCLASS_OR) {
5458                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5459                     }
5460                     else if (flags & SCF_DO_STCLASS_AND) {
5461                         /* Switch to OR mode: cache the old value of
5462                          * data->start_class */
5463                         INIT_AND_WITHP;
5464                         StructCopy(data->start_class, and_withp, regnode_ssc);
5465                         flags &= ~SCF_DO_STCLASS_AND;
5466                         StructCopy(&this_class, data->start_class, regnode_ssc);
5467                         flags |= SCF_DO_STCLASS_OR;
5468                         ANYOF_FLAGS(data->start_class)
5469                                                 |= SSC_MATCHES_EMPTY_STRING;
5470                     }
5471                 } else {                /* Non-zero len */
5472                     if (flags & SCF_DO_STCLASS_OR) {
5473                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5474                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5475                     }
5476                     else if (flags & SCF_DO_STCLASS_AND)
5477                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5478                     flags &= ~SCF_DO_STCLASS;
5479                 }
5480                 if (!scan)              /* It was not CURLYX, but CURLY. */
5481                     scan = next;
5482                 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5483                     /* ? quantifier ok, except for (?{ ... }) */
5484                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
5485                     && (minnext == 0) && (deltanext == 0)
5486                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5487                     && maxcount <= REG_INFTY/3) /* Complement check for big
5488                                                    count */
5489                 {
5490                     _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5491                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5492                             "Quantifier unexpected on zero-length expression "
5493                             "in regex m/%" UTF8f "/",
5494                              UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5495                                   RExC_precomp)));
5496                 }
5497
5498                 min += minnext * mincount;
5499                 is_inf_internal |= deltanext == SSize_t_MAX
5500                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5501                 is_inf |= is_inf_internal;
5502                 if (is_inf) {
5503                     delta = SSize_t_MAX;
5504                 } else {
5505                     delta += (minnext + deltanext) * maxcount
5506                              - minnext * mincount;
5507                 }
5508                 /* Try powerful optimization CURLYX => CURLYN. */
5509                 if (  OP(oscan) == CURLYX && data
5510                       && data->flags & SF_IN_PAR
5511                       && !(data->flags & SF_HAS_EVAL)
5512                       && !deltanext && minnext == 1 ) {
5513                     /* Try to optimize to CURLYN.  */
5514                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5515                     regnode * const nxt1 = nxt;
5516 #ifdef DEBUGGING
5517                     regnode *nxt2;
5518 #endif
5519
5520                     /* Skip open. */
5521                     nxt = regnext(nxt);
5522                     if (!REGNODE_SIMPLE(OP(nxt))
5523                         && !(PL_regkind[OP(nxt)] == EXACT
5524                              && STR_LEN(nxt) == 1))
5525                         goto nogo;
5526 #ifdef DEBUGGING
5527                     nxt2 = nxt;
5528 #endif
5529                     nxt = regnext(nxt);
5530                     if (OP(nxt) != CLOSE)
5531                         goto nogo;
5532                     if (RExC_open_parens) {
5533
5534                         /*open->CURLYM*/
5535                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5536
5537                         /*close->while*/
5538                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5539                     }
5540                     /* Now we know that nxt2 is the only contents: */
5541                     oscan->flags = (U8)ARG(nxt);
5542                     OP(oscan) = CURLYN;
5543                     OP(nxt1) = NOTHING; /* was OPEN. */
5544
5545 #ifdef DEBUGGING
5546                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5547                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5548                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5549                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
5550                     OP(nxt + 1) = OPTIMIZED; /* was count. */
5551                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5552 #endif
5553                 }
5554               nogo:
5555
5556                 /* Try optimization CURLYX => CURLYM. */
5557                 if (  OP(oscan) == CURLYX && data
5558                       && !(data->flags & SF_HAS_PAR)
5559                       && !(data->flags & SF_HAS_EVAL)
5560                       && !deltanext     /* atom is fixed width */
5561                       && minnext != 0   /* CURLYM can't handle zero width */
5562
5563                          /* Nor characters whose fold at run-time may be
5564                           * multi-character */
5565                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5566                 ) {
5567                     /* XXXX How to optimize if data == 0? */
5568                     /* Optimize to a simpler form.  */
5569                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5570                     regnode *nxt2;
5571
5572                     OP(oscan) = CURLYM;
5573                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5574                             && (OP(nxt2) != WHILEM))
5575                         nxt = nxt2;
5576                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5577                     /* Need to optimize away parenths. */
5578                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5579                         /* Set the parenth number.  */
5580                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5581
5582                         oscan->flags = (U8)ARG(nxt);
5583                         if (RExC_open_parens) {
5584                              /*open->CURLYM*/
5585                             RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5586
5587                             /*close->NOTHING*/
5588                             RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5589                                                          + 1;
5590                         }
5591                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
5592                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
5593
5594 #ifdef DEBUGGING
5595                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5596                         OP(nxt + 1) = OPTIMIZED; /* was count. */
5597                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5598                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5599 #endif
5600 #if 0
5601                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
5602                             regnode *nnxt = regnext(nxt1);
5603                             if (nnxt == nxt) {
5604                                 if (reg_off_by_arg[OP(nxt1)])
5605                                     ARG_SET(nxt1, nxt2 - nxt1);
5606                                 else if (nxt2 - nxt1 < U16_MAX)
5607                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
5608                                 else
5609                                     OP(nxt) = NOTHING;  /* Cannot beautify */
5610                             }
5611                             nxt1 = nnxt;
5612                         }
5613 #endif
5614                         /* Optimize again: */
5615                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
5616                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5617                                     NULL, stopparen, recursed_depth, NULL, 0,
5618                                     depth+1);
5619                     }
5620                     else
5621                         oscan->flags = 0;
5622                 }
5623                 else if ((OP(oscan) == CURLYX)
5624                          && (flags & SCF_WHILEM_VISITED_POS)
5625                          /* See the comment on a similar expression above.
5626                             However, this time it's not a subexpression
5627                             we care about, but the expression itself. */
5628                          && (maxcount == REG_INFTY)
5629                          && data) {
5630                     /* This stays as CURLYX, we can put the count/of pair. */
5631                     /* Find WHILEM (as in regexec.c) */
5632                     regnode *nxt = oscan + NEXT_OFF(oscan);
5633
5634                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5635                         nxt += ARG(nxt);
5636                     nxt = PREVOPER(nxt);
5637                     if (nxt->flags & 0xf) {
5638                         /* we've already set whilem count on this node */
5639                     } else if (++data->whilem_c < 16) {
5640                         assert(data->whilem_c <= RExC_whilem_seen);
5641                         nxt->flags = (U8)(data->whilem_c
5642                             | (RExC_whilem_seen << 4)); /* On WHILEM */
5643                     }
5644                 }
5645                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5646                     pars++;
5647                 if (flags & SCF_DO_SUBSTR) {
5648                     SV *last_str = NULL;
5649                     STRLEN last_chrs = 0;
5650                     int counted = mincount != 0;
5651
5652                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5653                                                                   string. */
5654                         SSize_t b = pos_before >= data->last_start_min
5655                             ? pos_before : data->last_start_min;
5656                         STRLEN l;
5657                         const char * const s = SvPV_const(data->last_found, l);
5658                         SSize_t old = b - data->last_start_min;
5659                         assert(old >= 0);
5660
5661                         if (UTF)
5662                             old = utf8_hop_forward((U8*)s, old,
5663                                                (U8 *) SvEND(data->last_found))
5664                                 - (U8*)s;
5665                         l -= old;
5666                         /* Get the added string: */
5667                         last_str = newSVpvn_utf8(s  + old, l, UTF);
5668                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5669                                             (U8*)(s + old + l)) : l;
5670                         if (deltanext == 0 && pos_before == b) {
5671                             /* What was added is a constant string */
5672                             if (mincount > 1) {
5673
5674                                 SvGROW(last_str, (mincount * l) + 1);
5675                                 repeatcpy(SvPVX(last_str) + l,
5676                                           SvPVX_const(last_str), l,
5677                                           mincount - 1);
5678                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5679                                 /* Add additional parts. */
5680                                 SvCUR_set(data->last_found,
5681                                           SvCUR(data->last_found) - l);
5682                                 sv_catsv(data->last_found, last_str);
5683                                 {
5684                                     SV * sv = data->last_found;
5685                                     MAGIC *mg =
5686                                         SvUTF8(sv) && SvMAGICAL(sv) ?
5687                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
5688                                     if (mg && mg->mg_len >= 0)
5689                                         mg->mg_len += last_chrs * (mincount-1);
5690                                 }
5691                                 last_chrs *= mincount;
5692                                 data->last_end += l * (mincount - 1);
5693                             }
5694                         } else {
5695                             /* start offset must point into the last copy */
5696                             data->last_start_min += minnext * (mincount - 1);
5697                             data->last_start_max =
5698                               is_inf
5699                                ? SSize_t_MAX
5700                                : data->last_start_max +
5701                                  (maxcount - 1) * (minnext + data->pos_delta);
5702                         }
5703                     }
5704                     /* It is counted once already... */
5705                     data->pos_min += minnext * (mincount - counted);
5706 #if 0
5707 Perl_re_printf( aTHX_  "counted=%" UVuf " deltanext=%" UVuf
5708                               " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5709                               " maxcount=%" UVuf " mincount=%" UVuf "\n",
5710     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5711     (UV)mincount);
5712 if (deltanext != SSize_t_MAX)
5713 Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
5714     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5715           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5716 #endif
5717                     if (deltanext == SSize_t_MAX
5718                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5719                         data->pos_delta = SSize_t_MAX;
5720                     else
5721                         data->pos_delta += - counted * deltanext +
5722                         (minnext + deltanext) * maxcount - minnext * mincount;
5723                     if (mincount != maxcount) {
5724                          /* Cannot extend fixed substrings found inside
5725                             the group.  */
5726                         scan_commit(pRExC_state, data, minlenp, is_inf);
5727                         if (mincount && last_str) {
5728                             SV * const sv = data->last_found;
5729                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5730                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5731
5732                             if (mg)
5733                                 mg->mg_len = -1;
5734                             sv_setsv(sv, last_str);
5735                             data->last_end = data->pos_min;
5736                             data->last_start_min = data->pos_min - last_chrs;
5737                             data->last_start_max = is_inf
5738                                 ? SSize_t_MAX
5739                                 : data->pos_min + data->pos_delta - last_chrs;
5740                         }
5741                         data->cur_is_floating = 1; /* float */
5742                     }
5743                     SvREFCNT_dec(last_str);
5744                 }
5745                 if (data && (fl & SF_HAS_EVAL))
5746                     data->flags |= SF_HAS_EVAL;
5747               optimize_curly_tail:
5748                 if (OP(oscan) != CURLYX) {
5749                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5750                            && NEXT_OFF(next))
5751                         NEXT_OFF(oscan) += NEXT_OFF(next);
5752                 }
5753                 continue;
5754
5755             default:
5756 #ifdef DEBUGGING
5757                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5758                                                                     OP(scan));
5759 #endif
5760             case REF:
5761             case CLUMP:
5762                 if (flags & SCF_DO_SUBSTR) {
5763                     /* Cannot expect anything... */
5764                     scan_commit(pRExC_state, data, minlenp, is_inf);
5765                     data->cur_is_floating = 1; /* float */
5766                 }
5767                 is_inf = is_inf_internal = 1;
5768                 if (flags & SCF_DO_STCLASS_OR) {
5769                     if (OP(scan) == CLUMP) {
5770                         /* Actually is any start char, but very few code points
5771                          * aren't start characters */
5772                         ssc_match_all_cp(data->start_class);
5773                     }
5774                     else {
5775                         ssc_anything(data->start_class);
5776                     }
5777                 }
5778                 flags &= ~SCF_DO_STCLASS;
5779                 break;
5780             }
5781         }
5782         else if (OP(scan) == LNBREAK) {
5783             if (flags & SCF_DO_STCLASS) {
5784                 if (flags & SCF_DO_STCLASS_AND) {
5785                     ssc_intersection(data->start_class,
5786                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5787                     ssc_clear_locale(data->start_class);
5788                     ANYOF_FLAGS(data->start_class)
5789                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5790                 }
5791                 else if (flags & SCF_DO_STCLASS_OR) {
5792                     ssc_union(data->start_class,
5793                               PL_XPosix_ptrs[_CC_VERTSPACE],
5794                               FALSE);
5795                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5796
5797                     /* See commit msg for
5798                      * 749e076fceedeb708a624933726e7989f2302f6a */
5799                     ANYOF_FLAGS(data->start_class)
5800                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5801                 }
5802                 flags &= ~SCF_DO_STCLASS;
5803             }
5804             min++;
5805             if (delta != SSize_t_MAX)
5806                 delta++;    /* Because of the 2 char string cr-lf */
5807             if (flags & SCF_DO_SUBSTR) {
5808                 /* Cannot expect anything... */
5809                 scan_commit(pRExC_state, data, minlenp, is_inf);
5810                 data->pos_min += 1;
5811                 if (data->pos_delta != SSize_t_MAX) {
5812                     data->pos_delta += 1;
5813                 }
5814                 data->cur_is_floating = 1; /* float */
5815             }
5816         }
5817         else if (REGNODE_SIMPLE(OP(scan))) {
5818
5819             if (flags & SCF_DO_SUBSTR) {
5820                 scan_commit(pRExC_state, data, minlenp, is_inf);
5821                 data->pos_min++;
5822             }
5823             min++;
5824             if (flags & SCF_DO_STCLASS) {
5825                 bool invert = 0;
5826                 SV* my_invlist = NULL;
5827                 U8 namedclass;
5828
5829                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5830                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5831
5832                 /* Some of the logic below assumes that switching
5833                    locale on will only add false positives. */
5834                 switch (OP(scan)) {
5835
5836                 default:
5837 #ifdef DEBUGGING
5838                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5839                                                                      OP(scan));
5840 #endif
5841                 case SANY:
5842                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5843                         ssc_match_all_cp(data->start_class);
5844                     break;
5845
5846                 case REG_ANY:
5847                     {
5848                         SV* REG_ANY_invlist = _new_invlist(2);
5849                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5850                                                             '\n');
5851                         if (flags & SCF_DO_STCLASS_OR) {
5852                             ssc_union(data->start_class,
5853                                       REG_ANY_invlist,
5854                                       TRUE /* TRUE => invert, hence all but \n
5855                                             */
5856                                       );
5857                         }
5858                         else if (flags & SCF_DO_STCLASS_AND) {
5859                             ssc_intersection(data->start_class,
5860                                              REG_ANY_invlist,
5861                                              TRUE  /* TRUE => invert */
5862                                              );
5863                             ssc_clear_locale(data->start_class);
5864                         }
5865                         SvREFCNT_dec_NN(REG_ANY_invlist);
5866                     }
5867                     break;
5868
5869                 case ANYOFD:
5870                 case ANYOFL:
5871                 case ANYOFPOSIXL:
5872                 case ANYOFH:
5873                 case ANYOFHb:
5874                 case ANYOFHr:
5875                 case ANYOFHs:
5876                 case ANYOF:
5877                     if (flags & SCF_DO_STCLASS_AND)
5878                         ssc_and(pRExC_state, data->start_class,
5879                                 (regnode_charclass *) scan);
5880                     else
5881                         ssc_or(pRExC_state, data->start_class,
5882                                                           (regnode_charclass *) scan);
5883                     break;
5884
5885                 case NANYOFM:
5886                 case ANYOFM:
5887                   {
5888                     SV* cp_list = get_ANYOFM_contents(scan);
5889
5890                     if (flags & SCF_DO_STCLASS_OR) {
5891                         ssc_union(data->start_class, cp_list, invert);
5892                     }
5893                     else if (flags & SCF_DO_STCLASS_AND) {
5894                         ssc_intersection(data->start_class, cp_list, invert);
5895                     }
5896
5897                     SvREFCNT_dec_NN(cp_list);
5898                     break;
5899                   }
5900
5901                 case ANYOFR:
5902                 case ANYOFRb:
5903                   {
5904                     SV* cp_list = NULL;
5905
5906                     cp_list = _add_range_to_invlist(cp_list,
5907                                         ANYOFRbase(scan),
5908                                         ANYOFRbase(scan) + ANYOFRdelta(scan));
5909
5910                     if (flags & SCF_DO_STCLASS_OR) {
5911                         ssc_union(data->start_class, cp_list, invert);
5912                     }
5913                     else if (flags & SCF_DO_STCLASS_AND) {
5914                         ssc_intersection(data->start_class, cp_list, invert);
5915                     }
5916
5917                     SvREFCNT_dec_NN(cp_list);
5918                     break;
5919                   }
5920
5921                 case NPOSIXL:
5922                     invert = 1;
5923                     /* FALLTHROUGH */
5924
5925                 case POSIXL:
5926                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5927                     if (flags & SCF_DO_STCLASS_AND) {
5928                         bool was_there = cBOOL(
5929                                           ANYOF_POSIXL_TEST(data->start_class,
5930                                                                  namedclass));
5931                         ANYOF_POSIXL_ZERO(data->start_class);
5932                         if (was_there) {    /* Do an AND */
5933                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5934                         }
5935                         /* No individual code points can now match */
5936                         data->start_class->invlist
5937                                                 = sv_2mortal(_new_invlist(0));
5938                     }
5939                     else {
5940                         int complement = namedclass + ((invert) ? -1 : 1);
5941
5942                         assert(flags & SCF_DO_STCLASS_OR);
5943
5944                         /* If the complement of this class was already there,
5945                          * the result is that they match all code points,
5946                          * (\d + \D == everything).  Remove the classes from
5947                          * future consideration.  Locale is not relevant in
5948                          * this case */
5949                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5950                             ssc_match_all_cp(data->start_class);
5951                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5952                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5953                         }
5954                         else {  /* The usual case; just add this class to the
5955                                    existing set */
5956                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5957                         }
5958                     }
5959                     break;
5960
5961                 case NPOSIXA:   /* For these, we always know the exact set of
5962                                    what's matched */
5963                     invert = 1;
5964                     /* FALLTHROUGH */
5965                 case POSIXA:
5966                     my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
5967                     goto join_posix_and_ascii;
5968
5969                 case NPOSIXD:
5970                 case NPOSIXU:
5971                     invert = 1;
5972                     /* FALLTHROUGH */
5973                 case POSIXD:
5974                 case POSIXU:
5975                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
5976
5977                     /* NPOSIXD matches all upper Latin1 code points unless the
5978                      * target string being matched is UTF-8, which is
5979                      * unknowable until match time.  Since we are going to
5980                      * invert, we want to get rid of all of them so that the
5981                      * inversion will match all */
5982                     if (OP(scan) == NPOSIXD) {
5983                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5984                                           &my_invlist);
5985                     }
5986
5987                   join_posix_and_ascii:
5988
5989                     if (flags & SCF_DO_STCLASS_AND) {
5990                         ssc_intersection(data->start_class, my_invlist, invert);
5991                         ssc_clear_locale(data->start_class);
5992                     }
5993                     else {
5994                         assert(flags & SCF_DO_STCLASS_OR);
5995                         ssc_union(data->start_class, my_invlist, invert);
5996                     }
5997                     SvREFCNT_dec(my_invlist);
5998                 }
5999                 if (flags & SCF_DO_STCLASS_OR)
6000                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6001                 flags &= ~SCF_DO_STCLASS;
6002             }
6003         }
6004         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6005             data->flags |= (OP(scan) == MEOL
6006                             ? SF_BEFORE_MEOL
6007                             : SF_BEFORE_SEOL);
6008             scan_commit(pRExC_state, data, minlenp, is_inf);
6009
6010         }
6011         else if (  PL_regkind[OP(scan)] == BRANCHJ
6012                  /* Lookbehind, or need to calculate parens/evals/stclass: */
6013                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
6014                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6015         {
6016             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6017                 || OP(scan) == UNLESSM )
6018             {
6019                 /* Negative Lookahead/lookbehind
6020                    In this case we can't do fixed string optimisation.
6021                 */
6022
6023                 SSize_t deltanext, minnext, fake = 0;
6024                 regnode *nscan;
6025                 regnode_ssc intrnl;
6026                 int f = 0;
6027
6028                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6029                 if (data) {
6030                     data_fake.whilem_c = data->whilem_c;
6031                     data_fake.last_closep = data->last_closep;
6032                 }
6033                 else
6034                     data_fake.last_closep = &fake;
6035                 data_fake.pos_delta = delta;
6036                 if ( flags & SCF_DO_STCLASS && !scan->flags
6037                      && OP(scan) == IFMATCH ) { /* Lookahead */
6038                     ssc_init(pRExC_state, &intrnl);
6039                     data_fake.start_class = &intrnl;
6040                     f |= SCF_DO_STCLASS_AND;
6041                 }
6042                 if (flags & SCF_WHILEM_VISITED_POS)
6043                     f |= SCF_WHILEM_VISITED_POS;
6044                 next = regnext(scan);
6045                 nscan = NEXTOPER(NEXTOPER(scan));
6046
6047                 /* recurse study_chunk() for lookahead body */
6048                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6049                                       last, &data_fake, stopparen,
6050                                       recursed_depth, NULL, f, depth+1);
6051                 if (scan->flags) {
6052                     if (   deltanext < 0
6053                         || deltanext > (I32) U8_MAX
6054                         || minnext > (I32)U8_MAX
6055                         || minnext + deltanext > (I32)U8_MAX)
6056                     {
6057                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6058                               (UV)U8_MAX);
6059                     }
6060
6061                     /* The 'next_off' field has been repurposed to count the
6062                      * additional starting positions to try beyond the initial
6063                      * one.  (This leaves it at 0 for non-variable length
6064                      * matches to avoid breakage for those not using this
6065                      * extension) */
6066                     if (deltanext) {
6067                         scan->next_off = deltanext;
6068                         ckWARNexperimental(RExC_parse,
6069                             WARN_EXPERIMENTAL__VLB,
6070                             "Variable length lookbehind is experimental");
6071                     }
6072                     scan->flags = (U8)minnext + deltanext;
6073                 }
6074                 if (data) {
6075                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6076                         pars++;
6077                     if (data_fake.flags & SF_HAS_EVAL)
6078                         data->flags |= SF_HAS_EVAL;
6079                     data->whilem_c = data_fake.whilem_c;
6080                 }
6081                 if (f & SCF_DO_STCLASS_AND) {
6082                     if (flags & SCF_DO_STCLASS_OR) {
6083                         /* OR before, AND after: ideally we would recurse with
6084                          * data_fake to get the AND applied by study of the
6085                          * remainder of the pattern, and then derecurse;
6086                          * *** HACK *** for now just treat as "no information".
6087                          * See [perl #56690].
6088                          */
6089                         ssc_init(pRExC_state, data->start_class);
6090                     }  else {
6091                         /* AND before and after: combine and continue.  These
6092                          * assertions are zero-length, so can match an EMPTY
6093                          * string */
6094                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6095                         ANYOF_FLAGS(data->start_class)
6096                                                    |= SSC_MATCHES_EMPTY_STRING;
6097                     }
6098                 }
6099             }
6100 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6101             else {
6102                 /* Positive Lookahead/lookbehind
6103                    In this case we can do fixed string optimisation,
6104                    but we must be careful about it. Note in the case of
6105                    lookbehind the positions will be offset by the minimum
6106                    length of the pattern, something we won't know about
6107                    until after the recurse.
6108                 */
6109                 SSize_t deltanext, fake = 0;
6110                 regnode *nscan;
6111                 regnode_ssc intrnl;
6112                 int f = 0;
6113                 /* We use SAVEFREEPV so that when the full compile
6114                     is finished perl will clean up the allocated
6115                     minlens when it's all done. This way we don't
6116                     have to worry about freeing them when we know
6117                     they wont be used, which would be a pain.
6118                  */
6119                 SSize_t *minnextp;
6120                 Newx( minnextp, 1, SSize_t );
6121                 SAVEFREEPV(minnextp);
6122
6123                 if (data) {
6124                     StructCopy(data, &data_fake, scan_data_t);
6125                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6126                         f |= SCF_DO_SUBSTR;
6127                         if (scan->flags)
6128                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6129                         data_fake.last_found=newSVsv(data->last_found);
6130                     }
6131                 }
6132                 else
6133                     data_fake.last_closep = &fake;
6134                 data_fake.flags = 0;
6135                 data_fake.substrs[0].flags = 0;
6136                 data_fake.substrs[1].flags = 0;
6137                 data_fake.pos_delta = delta;
6138                 if (is_inf)
6139                     data_fake.flags |= SF_IS_INF;
6140                 if ( flags & SCF_DO_STCLASS && !scan->flags
6141                      && OP(scan) == IFMATCH ) { /* Lookahead */
6142                     ssc_init(pRExC_state, &intrnl);
6143                     data_fake.start_class = &intrnl;
6144                     f |= SCF_DO_STCLASS_AND;
6145                 }
6146                 if (flags & SCF_WHILEM_VISITED_POS)
6147                     f |= SCF_WHILEM_VISITED_POS;
6148                 next = regnext(scan);
6149                 nscan = NEXTOPER(NEXTOPER(scan));
6150
6151                 /* positive lookahead study_chunk() recursion */
6152                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6153                                         &deltanext, last, &data_fake,
6154                                         stopparen, recursed_depth, NULL,
6155                                         f, depth+1);
6156                 if (scan->flags) {
6157                     assert(0);  /* This code has never been tested since this
6158                                    is normally not compiled */
6159                     if (   deltanext < 0
6160                         || deltanext > (I32) U8_MAX
6161                         || *minnextp > (I32)U8_MAX
6162                         || *minnextp + deltanext > (I32)U8_MAX)
6163                     {
6164                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
6165                               (UV)U8_MAX);
6166                     }
6167
6168                     if (deltanext) {
6169                         scan->next_off = deltanext;
6170                     }
6171                     scan->flags = (U8)*minnextp + deltanext;
6172                 }
6173
6174                 *minnextp += min;
6175
6176                 if (f & SCF_DO_STCLASS_AND) {
6177                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6178                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6179                 }
6180                 if (data) {
6181                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6182                         pars++;
6183                     if (data_fake.flags & SF_HAS_EVAL)
6184                         data->flags |= SF_HAS_EVAL;
6185                     data->whilem_c = data_fake.whilem_c;
6186                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6187                         int i;
6188                         if (RExC_rx->minlen<*minnextp)
6189                             RExC_rx->minlen=*minnextp;
6190                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6191                         SvREFCNT_dec_NN(data_fake.last_found);
6192
6193                         for (i = 0; i < 2; i++) {
6194                             if (data_fake.substrs[i].minlenp != minlenp) {
6195                                 data->substrs[i].min_offset =
6196                                             data_fake.substrs[i].min_offset;
6197                                 data->substrs[i].max_offset =
6198                                             data_fake.substrs[i].max_offset;
6199                                 data->substrs[i].minlenp =
6200                                             data_fake.substrs[i].minlenp;
6201                                 data->substrs[i].lookbehind += scan->flags;
6202                             }
6203                         }
6204                     }
6205                 }
6206             }
6207 #endif
6208         }
6209         else if (OP(scan) == OPEN) {
6210             if (stopparen != (I32)ARG(scan))
6211                 pars++;
6212         }
6213         else if (OP(scan) == CLOSE) {
6214             if (stopparen == (I32)ARG(scan)) {
6215                 break;
6216             }
6217             if ((I32)ARG(scan) == is_par) {
6218                 next = regnext(scan);
6219
6220                 if ( next && (OP(next) != WHILEM) && next < last)
6221                     is_par = 0;         /* Disable optimization */
6222             }
6223             if (data)
6224                 *(data->last_closep) = ARG(scan);
6225         }
6226         else if (OP(scan) == EVAL) {
6227                 if (data)
6228                     data->flags |= SF_HAS_EVAL;
6229         }
6230         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6231             if (flags & SCF_DO_SUBSTR) {
6232                 scan_commit(pRExC_state, data, minlenp, is_inf);
6233                 flags &= ~SCF_DO_SUBSTR;
6234             }
6235             if (data && OP(scan)==ACCEPT) {
6236                 data->flags |= SCF_SEEN_ACCEPT;
6237                 if (stopmin > min)
6238                     stopmin = min;
6239             }
6240         }
6241         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6242         {
6243                 if (flags & SCF_DO_SUBSTR) {
6244                     scan_commit(pRExC_state, data, minlenp, is_inf);
6245                     data->cur_is_floating = 1; /* float */
6246                 }
6247                 is_inf = is_inf_internal = 1;
6248                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6249                     ssc_anything(data->start_class);
6250                 flags &= ~SCF_DO_STCLASS;
6251         }
6252         else if (OP(scan) == GPOS) {
6253             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6254                 !(delta || is_inf || (data && data->pos_delta)))
6255             {
6256                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6257                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
6258                 if (RExC_rx->gofs < (STRLEN)min)
6259                     RExC_rx->gofs = min;
6260             } else {
6261                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6262                 RExC_rx->gofs = 0;
6263             }
6264         }
6265 #ifdef TRIE_STUDY_OPT
6266 #ifdef FULL_TRIE_STUDY
6267         else if (PL_regkind[OP(scan)] == TRIE) {
6268             /* NOTE - There is similar code to this block above for handling
6269                BRANCH nodes on the initial study.  If you change stuff here
6270                check there too. */
6271             regnode *trie_node= scan;
6272             regnode *tail= regnext(scan);
6273             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6274             SSize_t max1 = 0, min1 = SSize_t_MAX;
6275             regnode_ssc accum;
6276
6277             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6278                 /* Cannot merge strings after this. */
6279                 scan_commit(pRExC_state, data, minlenp, is_inf);
6280             }
6281             if (flags & SCF_DO_STCLASS)
6282                 ssc_init_zero(pRExC_state, &accum);
6283
6284             if (!trie->jump) {
6285                 min1= trie->minlen;
6286                 max1= trie->maxlen;
6287             } else {
6288                 const regnode *nextbranch= NULL;
6289                 U32 word;
6290
6291                 for ( word=1 ; word <= trie->wordcount ; word++)
6292                 {
6293                     SSize_t deltanext=0, minnext=0, f = 0, fake;
6294                     regnode_ssc this_class;
6295
6296                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6297                     if (data) {
6298                         data_fake.whilem_c = data->whilem_c;
6299                         data_fake.last_closep = data->last_closep;
6300                     }
6301                     else
6302                         data_fake.last_closep = &fake;
6303                     data_fake.pos_delta = delta;
6304                     if (flags & SCF_DO_STCLASS) {
6305                         ssc_init(pRExC_state, &this_class);
6306                         data_fake.start_class = &this_class;
6307                         f = SCF_DO_STCLASS_AND;
6308                     }
6309                     if (flags & SCF_WHILEM_VISITED_POS)
6310                         f |= SCF_WHILEM_VISITED_POS;
6311
6312                     if (trie->jump[word]) {
6313                         if (!nextbranch)
6314                             nextbranch = trie_node + trie->jump[0];
6315                         scan= trie_node + trie->jump[word];
6316                         /* We go from the jump point to the branch that follows
6317                            it. Note this means we need the vestigal unused
6318                            branches even though they arent otherwise used. */
6319                         /* optimise study_chunk() for TRIE */
6320                         minnext = study_chunk(pRExC_state, &scan, minlenp,
6321                             &deltanext, (regnode *)nextbranch, &data_fake,
6322                             stopparen, recursed_depth, NULL, f, depth+1);
6323                     }
6324                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6325                         nextbranch= regnext((regnode*)nextbranch);
6326
6327                     if (min1 > (SSize_t)(minnext + trie->minlen))
6328                         min1 = minnext + trie->minlen;
6329                     if (deltanext == SSize_t_MAX) {
6330                         is_inf = is_inf_internal = 1;
6331                         max1 = SSize_t_MAX;
6332                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6333                         max1 = minnext + deltanext + trie->maxlen;
6334
6335                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6336                         pars++;
6337                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
6338                         if ( stopmin > min + min1)
6339                             stopmin = min + min1;
6340                         flags &= ~SCF_DO_SUBSTR;
6341                         if (data)
6342                             data->flags |= SCF_SEEN_ACCEPT;
6343                     }
6344                     if (data) {
6345                         if (data_fake.flags & SF_HAS_EVAL)
6346                             data->flags |= SF_HAS_EVAL;
6347                         data->whilem_c = data_fake.whilem_c;
6348                     }
6349                     if (flags & SCF_DO_STCLASS)
6350                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6351                 }
6352             }
6353             if (flags & SCF_DO_SUBSTR) {
6354                 data->pos_min += min1;
6355                 data->pos_delta += max1 - min1;
6356                 if (max1 != min1 || is_inf)
6357                     data->cur_is_floating = 1; /* float */
6358             }
6359             min += min1;
6360             if (delta != SSize_t_MAX) {
6361                 if (SSize_t_MAX - (max1 - min1) >= delta)
6362                     delta += max1 - min1;
6363                 else
6364                     delta = SSize_t_MAX;
6365             }
6366             if (flags & SCF_DO_STCLASS_OR) {
6367                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6368                 if (min1) {
6369                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6370                     flags &= ~SCF_DO_STCLASS;
6371                 }
6372             }
6373             else if (flags & SCF_DO_STCLASS_AND) {
6374                 if (min1) {
6375                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6376                     flags &= ~SCF_DO_STCLASS;
6377                 }
6378                 else {
6379                     /* Switch to OR mode: cache the old value of
6380                      * data->start_class */
6381                     INIT_AND_WITHP;
6382                     StructCopy(data->start_class, and_withp, regnode_ssc);
6383                     flags &= ~SCF_DO_STCLASS_AND;
6384                     StructCopy(&accum, data->start_class, regnode_ssc);
6385                     flags |= SCF_DO_STCLASS_OR;
6386                 }
6387             }
6388             scan= tail;
6389             continue;
6390         }
6391 #else
6392         else if (PL_regkind[OP(scan)] == TRIE) {
6393             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6394             U8*bang=NULL;
6395
6396             min += trie->minlen;
6397             delta += (trie->maxlen - trie->minlen);
6398             flags &= ~SCF_DO_STCLASS; /* xxx */
6399             if (flags & SCF_DO_SUBSTR) {
6400                 /* Cannot expect anything... */
6401                 scan_commit(pRExC_state, data, minlenp, is_inf);
6402                 data->pos_min += trie->minlen;
6403                 data->pos_delta += (trie->maxlen - trie->minlen);
6404                 if (trie->maxlen != trie->minlen)
6405                     data->cur_is_floating = 1; /* float */
6406             }
6407             if (trie->jump) /* no more substrings -- for now /grr*/
6408                flags &= ~SCF_DO_SUBSTR;
6409         }
6410 #endif /* old or new */
6411 #endif /* TRIE_STUDY_OPT */
6412
6413         /* Else: zero-length, ignore. */
6414         scan = regnext(scan);
6415     }
6416
6417   finish:
6418     if (frame) {
6419         /* we need to unwind recursion. */
6420         depth = depth - 1;
6421
6422         DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6423         DEBUG_PEEP("fend", scan, depth, flags);
6424
6425         /* restore previous context */
6426         last = frame->last_regnode;
6427         scan = frame->next_regnode;
6428         stopparen = frame->stopparen;
6429         recursed_depth = frame->prev_recursed_depth;
6430
6431         RExC_frame_last = frame->prev_frame;
6432         frame = frame->this_prev_frame;
6433         goto fake_study_recurse;
6434     }
6435
6436     assert(!frame);
6437     DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6438
6439     *scanp = scan;
6440     *deltap = is_inf_internal ? SSize_t_MAX : delta;
6441
6442     if (flags & SCF_DO_SUBSTR && is_inf)
6443         data->pos_delta = SSize_t_MAX - data->pos_min;
6444     if (is_par > (I32)U8_MAX)
6445         is_par = 0;
6446     if (is_par && pars==1 && data) {
6447         data->flags |= SF_IN_PAR;
6448         data->flags &= ~SF_HAS_PAR;
6449     }
6450     else if (pars && data) {
6451         data->flags |= SF_HAS_PAR;
6452         data->flags &= ~SF_IN_PAR;
6453     }
6454     if (flags & SCF_DO_STCLASS_OR)
6455         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6456     if (flags & SCF_TRIE_RESTUDY)
6457         data->flags |=  SCF_TRIE_RESTUDY;
6458
6459     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6460
6461     {
6462         SSize_t final_minlen= min < stopmin ? min : stopmin;
6463
6464         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6465             if (final_minlen > SSize_t_MAX - delta)
6466                 RExC_maxlen = SSize_t_MAX;
6467             else if (RExC_maxlen < final_minlen + delta)
6468                 RExC_maxlen = final_minlen + delta;
6469         }
6470         return final_minlen;
6471     }
6472     NOT_REACHED; /* NOTREACHED */
6473 }
6474
6475 STATIC U32
6476 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6477 {
6478     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6479
6480     PERL_ARGS_ASSERT_ADD_DATA;
6481
6482     Renewc(RExC_rxi->data,
6483            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6484            char, struct reg_data);
6485     if(count)
6486         Renew(RExC_rxi->data->what, count + n, U8);
6487     else
6488         Newx(RExC_rxi->data->what, n, U8);
6489     RExC_rxi->data->count = count + n;
6490     Copy(s, RExC_rxi->data->what + count, n, U8);
6491     return count;
6492 }
6493
6494 /*XXX: todo make this not included in a non debugging perl, but appears to be
6495  * used anyway there, in 'use re' */
6496 #ifndef PERL_IN_XSUB_RE
6497 void
6498 Perl_reginitcolors(pTHX)
6499 {
6500     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6501     if (s) {
6502         char *t = savepv(s);
6503         int i = 0;
6504         PL_colors[0] = t;
6505         while (++i < 6) {
6506             t = strchr(t, '\t');
6507             if (t) {
6508                 *t = '\0';
6509                 PL_colors[i] = ++t;
6510             }
6511             else
6512                 PL_colors[i] = t = (char *)"";
6513         }
6514     } else {
6515         int i = 0;
6516         while (i < 6)
6517             PL_colors[i++] = (char *)"";
6518     }
6519     PL_colorset = 1;
6520 }
6521 #endif
6522
6523
6524 #ifdef TRIE_STUDY_OPT
6525 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6526     STMT_START {                                            \
6527         if (                                                \
6528               (data.flags & SCF_TRIE_RESTUDY)               \
6529               && ! restudied++                              \
6530         ) {                                                 \
6531             dOsomething;                                    \
6532             goto reStudy;                                   \
6533         }                                                   \
6534     } STMT_END
6535 #else
6536 #define CHECK_RESTUDY_GOTO_butfirst
6537 #endif
6538
6539 /*
6540  * pregcomp - compile a regular expression into internal code
6541  *
6542  * Decides which engine's compiler to call based on the hint currently in
6543  * scope
6544  */
6545
6546 #ifndef PERL_IN_XSUB_RE
6547
6548 /* return the currently in-scope regex engine (or the default if none)  */
6549
6550 regexp_engine const *
6551 Perl_current_re_engine(pTHX)
6552 {
6553     if (IN_PERL_COMPILETIME) {
6554         HV * const table = GvHV(PL_hintgv);
6555         SV **ptr;
6556
6557         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6558             return &PL_core_reg_engine;
6559         ptr = hv_fetchs(table, "regcomp", FALSE);
6560         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6561             return &PL_core_reg_engine;
6562         return INT2PTR(regexp_engine*, SvIV(*ptr));
6563     }
6564     else {
6565         SV *ptr;
6566         if (!PL_curcop->cop_hints_hash)
6567             return &PL_core_reg_engine;
6568         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6569         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6570             return &PL_core_reg_engine;
6571         return INT2PTR(regexp_engine*, SvIV(ptr));
6572     }
6573 }
6574
6575
6576 REGEXP *
6577 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6578 {
6579     regexp_engine const *eng = current_re_engine();
6580     GET_RE_DEBUG_FLAGS_DECL;
6581
6582     PERL_ARGS_ASSERT_PREGCOMP;
6583
6584     /* Dispatch a request to compile a regexp to correct regexp engine. */
6585     DEBUG_COMPILE_r({
6586         Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
6587                         PTR2UV(eng));
6588     });
6589     return CALLREGCOMP_ENG(eng, pattern, flags);
6590 }
6591 #endif
6592
6593 /* public(ish) entry point for the perl core's own regex compiling code.
6594  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6595  * pattern rather than a list of OPs, and uses the internal engine rather
6596  * than the current one */
6597
6598 REGEXP *
6599 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6600 {
6601     SV *pat = pattern; /* defeat constness! */
6602     PERL_ARGS_ASSERT_RE_COMPILE;
6603     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6604 #ifdef PERL_IN_XSUB_RE
6605                                 &my_reg_engine,
6606 #else
6607                                 &PL_core_reg_engine,
6608 #endif
6609                                 NULL, NULL, rx_flags, 0);
6610 }
6611
6612
6613 static void
6614 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6615 {
6616     int n;
6617
6618     if (--cbs->refcnt > 0)
6619         return;
6620     for (n = 0; n < cbs->count; n++) {
6621         REGEXP *rx = cbs->cb[n].src_regex;
6622         if (rx) {
6623             cbs->cb[n].src_regex = NULL;
6624             SvREFCNT_dec_NN(rx);
6625         }
6626     }
6627     Safefree(cbs->cb);
6628     Safefree(cbs);
6629 }
6630
6631
6632 static struct reg_code_blocks *
6633 S_alloc_code_blocks(pTHX_  int ncode)
6634 {
6635      struct reg_code_blocks *cbs;
6636     Newx(cbs, 1, struct reg_code_blocks);
6637     cbs->count = ncode;
6638     cbs->refcnt = 1;
6639     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6640     if (ncode)
6641         Newx(cbs->cb, ncode, struct reg_code_block);
6642     else
6643         cbs->cb = NULL;
6644     return cbs;
6645 }
6646
6647
6648 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6649  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6650  * point to the realloced string and length.
6651  *
6652  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6653  * stuff added */
6654
6655 static void
6656 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6657                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
6658 {
6659     U8 *const src = (U8*)*pat_p;
6660     U8 *dst, *d;
6661     int n=0;
6662     STRLEN s = 0;
6663     bool do_end = 0;
6664     GET_RE_DEBUG_FLAGS_DECL;
6665
6666     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6667         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6668
6669     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6670     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6671     d = dst;
6672
6673     while (s < *plen_p) {
6674         append_utf8_from_native_byte(src[s], &d);
6675
6676         if (n < num_code_blocks) {
6677             assert(pRExC_state->code_blocks);
6678             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6679                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6680                 assert(*(d - 1) == '(');
6681                 do_end = 1;
6682             }
6683             else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6684                 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6685                 assert(*(d - 1) == ')');
6686                 do_end = 0;
6687                 n++;
6688             }
6689         }
6690         s++;
6691     }
6692     *d = '\0';
6693     *plen_p = d - dst;
6694     *pat_p = (char*) dst;
6695     SAVEFREEPV(*pat_p);
6696     RExC_orig_utf8 = RExC_utf8 = 1;
6697 }
6698
6699
6700
6701 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6702  * while recording any code block indices, and handling overloading,
6703  * nested qr// objects etc.  If pat is null, it will allocate a new
6704  * string, or just return the first arg, if there's only one.
6705  *
6706  * Returns the malloced/updated pat.
6707  * patternp and pat_count is the array of SVs to be concatted;
6708  * oplist is the optional list of ops that generated the SVs;
6709  * recompile_p is a pointer to a boolean that will be set if
6710  *   the regex will need to be recompiled.
6711  * delim, if non-null is an SV that will be inserted between each element
6712  */
6713
6714 static SV*
6715 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6716                 SV *pat, SV ** const patternp, int pat_count,
6717                 OP *oplist, bool *recompile_p, SV *delim)
6718 {
6719     SV **svp;
6720     int n = 0;
6721     bool use_delim = FALSE;
6722     bool alloced = FALSE;
6723
6724     /* if we know we have at least two args, create an empty string,
6725      * then concatenate args to that. For no args, return an empty string */
6726     if (!pat && pat_count != 1) {
6727         pat = newSVpvs("");
6728         SAVEFREESV(pat);
6729         alloced = TRUE;
6730     }
6731
6732     for (svp = patternp; svp < patternp + pat_count; svp++) {
6733         SV *sv;
6734         SV *rx  = NULL;
6735         STRLEN orig_patlen = 0;
6736         bool code = 0;
6737         SV *msv = use_delim ? delim : *svp;
6738         if (!msv) msv = &PL_sv_undef;
6739
6740         /* if we've got a delimiter, we go round the loop twice for each
6741          * svp slot (except the last), using the delimiter the second
6742          * time round */
6743         if (use_delim) {
6744             svp--;
6745             use_delim = FALSE;
6746         }
6747         else if (delim)
6748             use_delim = TRUE;
6749
6750         if (SvTYPE(msv) == SVt_PVAV) {
6751             /* we've encountered an interpolated array within
6752              * the pattern, e.g. /...@a..../. Expand the list of elements,
6753              * then recursively append elements.
6754              * The code in this block is based on S_pushav() */
6755
6756             AV *const av = (AV*)msv;
6757             const SSize_t maxarg = AvFILL(av) + 1;
6758             SV **array;
6759
6760             if (oplist) {
6761                 assert(oplist->op_type == OP_PADAV
6762                     || oplist->op_type == OP_RV2AV);
6763                 oplist = OpSIBLING(oplist);
6764             }
6765
6766             if (SvRMAGICAL(av)) {
6767                 SSize_t i;
6768
6769                 Newx(array, maxarg, SV*);
6770                 SAVEFREEPV(array);
6771                 for (i=0; i < maxarg; i++) {
6772                     SV ** const svp = av_fetch(av, i, FALSE);
6773                     array[i] = svp ? *svp : &PL_sv_undef;
6774                 }
6775             }
6776             else
6777                 array = AvARRAY(av);
6778
6779             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6780                                 array, maxarg, NULL, recompile_p,
6781                                 /* $" */
6782                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6783
6784             continue;
6785         }
6786
6787
6788         /* we make the assumption here that each op in the list of
6789          * op_siblings maps to one SV pushed onto the stack,
6790          * except for code blocks, with have both an OP_NULL and
6791          * and OP_CONST.
6792          * This allows us to match up the list of SVs against the
6793          * list of OPs to find the next code block.
6794          *
6795          * Note that       PUSHMARK PADSV PADSV ..
6796          * is optimised to
6797          *                 PADRANGE PADSV  PADSV  ..
6798          * so the alignment still works. */
6799
6800         if (oplist) {
6801             if (oplist->op_type == OP_NULL
6802                 && (oplist->op_flags & OPf_SPECIAL))
6803             {
6804                 assert(n < pRExC_state->code_blocks->count);
6805                 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6806                 pRExC_state->code_blocks->cb[n].block = oplist;
6807                 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6808                 n++;
6809                 code = 1;
6810                 oplist = OpSIBLING(oplist); /* skip CONST */
6811                 assert(oplist);
6812             }
6813             oplist = OpSIBLING(oplist);;
6814         }
6815
6816         /* apply magic and QR overloading to arg */
6817
6818         SvGETMAGIC(msv);
6819         if (SvROK(msv) && SvAMAGIC(msv)) {
6820             SV *sv = AMG_CALLunary(msv, regexp_amg);
6821             if (sv) {
6822                 if (SvROK(sv))
6823                     sv = SvRV(sv);
6824                 if (SvTYPE(sv) != SVt_REGEXP)
6825                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6826                 msv = sv;
6827             }
6828         }
6829
6830         /* try concatenation overload ... */
6831         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6832                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6833         {
6834             sv_setsv(pat, sv);
6835             /* overloading involved: all bets are off over literal
6836              * code. Pretend we haven't seen it */
6837             if (n)
6838                 pRExC_state->code_blocks->count -= n;
6839             n = 0;
6840         }
6841         else  {
6842             /* ... or failing that, try "" overload */
6843             while (SvAMAGIC(msv)
6844                     && (sv = AMG_CALLunary(msv, string_amg))
6845                     && sv != msv
6846                     &&  !(   SvROK(msv)
6847                           && SvROK(sv)
6848                           && SvRV(msv) == SvRV(sv))
6849             ) {
6850                 msv = sv;
6851                 SvGETMAGIC(msv);
6852             }
6853             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6854                 msv = SvRV(msv);
6855
6856             if (pat) {
6857                 /* this is a partially unrolled
6858                  *     sv_catsv_nomg(pat, msv);
6859                  * that allows us to adjust code block indices if
6860                  * needed */
6861                 STRLEN dlen;
6862                 char *dst = SvPV_force_nomg(pat, dlen);
6863                 orig_patlen = dlen;
6864                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6865                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6866                     sv_setpvn(pat, dst, dlen);
6867                     SvUTF8_on(pat);
6868                 }
6869                 sv_catsv_nomg(pat, msv);
6870                 rx = msv;
6871             }
6872             else {
6873                 /* We have only one SV to process, but we need to verify
6874                  * it is properly null terminated or we will fail asserts
6875                  * later. In theory we probably shouldn't get such SV's,
6876                  * but if we do we should handle it gracefully. */
6877                 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6878                     /* not a string, or a string with a trailing null */
6879                     pat = msv;
6880                 } else {
6881                     /* a string with no trailing null, we need to copy it
6882                      * so it has a trailing null */
6883                     pat = sv_2mortal(newSVsv(msv));
6884                 }
6885             }
6886
6887             if (code)
6888                 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6889         }
6890
6891         /* extract any code blocks within any embedded qr//'s */
6892         if (rx && SvTYPE(rx) == SVt_REGEXP
6893             && RX_ENGINE((REGEXP*)rx)->op_comp)
6894         {
6895
6896             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6897             if (ri->code_blocks && ri->code_blocks->count) {
6898                 int i;
6899                 /* the presence of an embedded qr// with code means
6900                  * we should always recompile: the text of the
6901                  * qr// may not have changed, but it may be a
6902                  * different closure than last time */
6903                 *recompile_p = 1;
6904                 if (pRExC_state->code_blocks) {
6905                     int new_count = pRExC_state->code_blocks->count
6906                             + ri->code_blocks->count;
6907                     Renew(pRExC_state->code_blocks->cb,
6908                             new_count, struct reg_code_block);
6909                     pRExC_state->code_blocks->count = new_count;
6910                 }
6911                 else
6912                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6913                                                     ri->code_blocks->count);
6914
6915                 for (i=0; i < ri->code_blocks->count; i++) {
6916                     struct reg_code_block *src, *dst;
6917                     STRLEN offset =  orig_patlen
6918                         + ReANY((REGEXP *)rx)->pre_prefix;
6919                     assert(n < pRExC_state->code_blocks->count);
6920                     src = &ri->code_blocks->cb[i];
6921                     dst = &pRExC_state->code_blocks->cb[n];
6922                     dst->start      = src->start + offset;
6923                     dst->end        = src->end   + offset;
6924                     dst->block      = src->block;
6925                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6926                                             src->src_regex
6927                                                 ? src->src_regex
6928                                                 : (REGEXP*)rx);
6929                     n++;
6930                 }
6931             }
6932         }
6933     }
6934     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6935     if (alloced)
6936         SvSETMAGIC(pat);
6937
6938     return pat;
6939 }
6940
6941
6942
6943 /* see if there are any run-time code blocks in the pattern.
6944  * False positives are allowed */
6945
6946 static bool
6947 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6948                     char *pat, STRLEN plen)
6949 {
6950     int n = 0;
6951     STRLEN s;
6952
6953     PERL_UNUSED_CONTEXT;
6954
6955     for (s = 0; s < plen; s++) {
6956         if (   pRExC_state->code_blocks
6957             && n < pRExC_state->code_blocks->count
6958             && s == pRExC_state->code_blocks->cb[n].start)
6959         {
6960             s = pRExC_state->code_blocks->cb[n].end;
6961             n++;
6962             continue;
6963         }
6964         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6965          * positives here */
6966         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6967             (pat[s+2] == '{'
6968                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6969         )
6970             return 1;
6971     }
6972     return 0;
6973 }
6974
6975 /* Handle run-time code blocks. We will already have compiled any direct
6976  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6977  * copy of it, but with any literal code blocks blanked out and
6978  * appropriate chars escaped; then feed it into
6979  *
6980  *    eval "qr'modified_pattern'"
6981  *
6982  * For example,
6983  *
6984  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6985  *
6986  * becomes
6987  *
6988  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6989  *
6990  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6991  * and merge them with any code blocks of the original regexp.
6992  *
6993  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6994  * instead, just save the qr and return FALSE; this tells our caller that
6995  * the original pattern needs upgrading to utf8.
6996  */
6997
6998 static bool
6999 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7000     char *pat, STRLEN plen)
7001 {
7002     SV *qr;
7003
7004     GET_RE_DEBUG_FLAGS_DECL;
7005
7006     if (pRExC_state->runtime_code_qr) {
7007         /* this is the second time we've been called; this should
7008          * only happen if the main pattern got upgraded to utf8
7009          * during compilation; re-use the qr we compiled first time
7010          * round (which should be utf8 too)
7011          */
7012         qr = pRExC_state->runtime_code_qr;
7013         pRExC_state->runtime_code_qr = NULL;
7014         assert(RExC_utf8 && SvUTF8(qr));
7015     }
7016     else {
7017         int n = 0;
7018         STRLEN s;
7019         char *p, *newpat;
7020         int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7021         SV *sv, *qr_ref;
7022         dSP;
7023
7024         /* determine how many extra chars we need for ' and \ escaping */
7025         for (s = 0; s < plen; s++) {
7026             if (pat[s] == '\'' || pat[s] == '\\')
7027                 newlen++;
7028         }
7029
7030         Newx(newpat, newlen, char);
7031         p = newpat;
7032         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7033
7034         for (s = 0; s < plen; s++) {
7035             if (   pRExC_state->code_blocks
7036                 && n < pRExC_state->code_blocks->count
7037                 && s == pRExC_state->code_blocks->cb[n].start)
7038             {
7039                 /* blank out literal code block so that they aren't
7040                  * recompiled: eg change from/to:
7041                  *     /(?{xyz})/
7042                  *     /(?=====)/
7043                  * and
7044                  *     /(??{xyz})/
7045                  *     /(?======)/
7046                  * and
7047                  *     /(?(?{xyz}))/
7048                  *     /(?(?=====))/
7049                 */
7050                 assert(pat[s]   == '(');
7051                 assert(pat[s+1] == '?');
7052                 *p++ = '(';
7053                 *p++ = '?';
7054                 s += 2;
7055                 while (s < pRExC_state->code_blocks->cb[n].end) {
7056                     *p++ = '=';
7057                     s++;
7058                 }
7059                 *p++ = ')';
7060                 n++;
7061                 continue;
7062             }
7063             if (pat[s] == '\'' || pat[s] == '\\')
7064                 *p++ = '\\';
7065             *p++ = pat[s];
7066         }
7067         *p++ = '\'';
7068         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7069             *p++ = 'x';
7070             if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7071                 *p++ = 'x';
7072             }
7073         }
7074         *p++ = '\0';
7075         DEBUG_COMPILE_r({
7076             Perl_re_printf( aTHX_
7077                 "%sre-parsing pattern for runtime code:%s %s\n",
7078                 PL_colors[4], PL_colors[5], newpat);
7079         });
7080
7081         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7082         Safefree(newpat);
7083
7084         ENTER;
7085         SAVETMPS;
7086         save_re_context();
7087         PUSHSTACKi(PERLSI_REQUIRE);
7088         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7089          * parsing qr''; normally only q'' does this. It also alters
7090          * hints handling */
7091         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7092         SvREFCNT_dec_NN(sv);
7093         SPAGAIN;
7094         qr_ref = POPs;
7095         PUTBACK;
7096         {
7097             SV * const errsv = ERRSV;
7098             if (SvTRUE_NN(errsv))
7099                 /* use croak_sv ? */
7100                 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7101         }
7102         assert(SvROK(qr_ref));
7103         qr = SvRV(qr_ref);
7104         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7105         /* the leaving below frees the tmp qr_ref.
7106          * Give qr a life of its own */
7107         SvREFCNT_inc(qr);
7108         POPSTACK;
7109         FREETMPS;
7110         LEAVE;
7111
7112     }
7113
7114     if (!RExC_utf8 && SvUTF8(qr)) {
7115         /* first time through; the pattern got upgraded; save the
7116          * qr for the next time through */
7117         assert(!pRExC_state->runtime_code_qr);
7118         pRExC_state->runtime_code_qr = qr;
7119         return 0;
7120     }
7121
7122
7123     /* extract any code blocks within the returned qr//  */
7124
7125
7126     /* merge the main (r1) and run-time (r2) code blocks into one */
7127     {
7128         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7129         struct reg_code_block *new_block, *dst;
7130         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7131         int i1 = 0, i2 = 0;
7132         int r1c, r2c;
7133
7134         if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7135         {
7136             SvREFCNT_dec_NN(qr);
7137             return 1;
7138         }
7139
7140         if (!r1->code_blocks)
7141             r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7142
7143         r1c = r1->code_blocks->count;
7144         r2c = r2->code_blocks->count;
7145
7146         Newx(new_block, r1c + r2c, struct reg_code_block);
7147
7148         dst = new_block;
7149
7150         while (i1 < r1c || i2 < r2c) {
7151             struct reg_code_block *src;
7152             bool is_qr = 0;
7153
7154             if (i1 == r1c) {
7155                 src = &r2->code_blocks->cb[i2++];
7156                 is_qr = 1;
7157             }
7158             else if (i2 == r2c)
7159                 src = &r1->code_blocks->cb[i1++];
7160             else if (  r1->code_blocks->cb[i1].start
7161                      < r2->code_blocks->cb[i2].start)
7162             {
7163                 src = &r1->code_blocks->cb[i1++];
7164                 assert(src->end < r2->code_blocks->cb[i2].start);
7165             }
7166             else {
7167                 assert(  r1->code_blocks->cb[i1].start
7168                        > r2->code_blocks->cb[i2].start);
7169                 src = &r2->code_blocks->cb[i2++];
7170                 is_qr = 1;
7171                 assert(src->end < r1->code_blocks->cb[i1].start);
7172             }
7173
7174             assert(pat[src->start] == '(');
7175             assert(pat[src->end]   == ')');
7176             dst->start      = src->start;
7177             dst->end        = src->end;
7178             dst->block      = src->block;
7179             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7180                                     : src->src_regex;
7181             dst++;
7182         }
7183         r1->code_blocks->count += r2c;
7184         Safefree(r1->code_blocks->cb);
7185         r1->code_blocks->cb = new_block;
7186     }
7187
7188     SvREFCNT_dec_NN(qr);
7189     return 1;
7190 }
7191
7192
7193 STATIC bool
7194 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7195                       struct reg_substr_datum  *rsd,
7196                       struct scan_data_substrs *sub,
7197                       STRLEN longest_length)
7198 {
7199     /* This is the common code for setting up the floating and fixed length
7200      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
7201      * as to whether succeeded or not */
7202
7203     I32 t;
7204     SSize_t ml;
7205     bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
7206     bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7207
7208     if (! (longest_length
7209            || (eol /* Can't have SEOL and MULTI */
7210                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7211           )
7212             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7213         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7214     {
7215         return FALSE;
7216     }
7217
7218     /* copy the information about the longest from the reg_scan_data
7219         over to the program. */
7220     if (SvUTF8(sub->str)) {
7221         rsd->substr      = NULL;
7222         rsd->utf8_substr = sub->str;
7223     } else {
7224         rsd->substr      = sub->str;
7225         rsd->utf8_substr = NULL;
7226     }
7227     /* end_shift is how many chars that must be matched that
7228         follow this item. We calculate it ahead of time as once the
7229         lookbehind offset is added in we lose the ability to correctly
7230         calculate it.*/
7231     ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7232     rsd->end_shift = ml - sub->min_offset
7233         - longest_length
7234             /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7235              * intead? - DAPM
7236             + (SvTAIL(sub->str) != 0)
7237             */
7238         + sub->lookbehind;
7239
7240     t = (eol/* Can't have SEOL and MULTI */
7241          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7242     fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7243
7244     return TRUE;
7245 }
7246
7247 STATIC void
7248 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7249 {
7250     /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7251      * properly wrapped with the right modifiers */
7252
7253     bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7254     bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7255                                                 != REGEX_DEPENDS_CHARSET);
7256
7257     /* The caret is output if there are any defaults: if not all the STD
7258         * flags are set, or if no character set specifier is needed */
7259     bool has_default =
7260                 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7261                 || ! has_charset);
7262     bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7263                                                 == REG_RUN_ON_COMMENT_SEEN);
7264     U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7265                         >> RXf_PMf_STD_PMMOD_SHIFT);
7266     const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
7267     char *p;
7268     STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7269
7270     /* We output all the necessary flags; we never output a minus, as all
7271         * those are defaults, so are
7272         * covered by the caret */
7273     const STRLEN wraplen = pat_len + has_p + has_runon
7274         + has_default       /* If needs a caret */
7275         + PL_bitcount[reganch] /* 1 char for each set standard flag */
7276
7277             /* If needs a character set specifier */
7278         + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7279         + (sizeof("(?:)") - 1);
7280
7281     PERL_ARGS_ASSERT_SET_REGEX_PV;
7282
7283     /* make sure PL_bitcount bounds not exceeded */
7284     assert(sizeof(STD_PAT_MODS) <= 8);
7285
7286     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7287     SvPOK_on(Rx);
7288     if (RExC_utf8)
7289         SvFLAGS(Rx) |= SVf_UTF8;
7290     *p++='('; *p++='?';
7291
7292     /* If a default, cover it using the caret */
7293     if (has_default) {
7294         *p++= DEFAULT_PAT_MOD;
7295     }
7296     if (has_charset) {
7297         STRLEN len;
7298         const char* name;
7299
7300         name = get_regex_charset_name(RExC_rx->extflags, &len);
7301         if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
7302             assert(RExC_utf8);
7303             name = UNICODE_PAT_MODS;
7304             len = sizeof(UNICODE_PAT_MODS) - 1;
7305         }
7306         Copy(name, p, len, char);
7307         p += len;
7308     }
7309     if (has_p)
7310         *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7311     {
7312         char ch;
7313         while((ch = *fptr++)) {
7314             if(reganch & 1)
7315                 *p++ = ch;
7316             reganch >>= 1;
7317         }
7318     }
7319
7320     *p++ = ':';
7321     Copy(RExC_precomp, p, pat_len, char);
7322     assert ((RX_WRAPPED(Rx) - p) < 16);
7323     RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7324     p += pat_len;
7325
7326     /* Adding a trailing \n causes this to compile properly:
7327             my $R = qr / A B C # D E/x; /($R)/
7328         Otherwise the parens are considered part of the comment */
7329     if (has_runon)
7330         *p++ = '\n';
7331     *p++ = ')';
7332     *p = 0;
7333     SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7334 }
7335
7336 /*
7337  * Perl_re_op_compile - the perl internal RE engine's function to compile a
7338  * regular expression into internal code.
7339  * The pattern may be passed either as:
7340  *    a list of SVs (patternp plus pat_count)
7341  *    a list of OPs (expr)
7342  * If both are passed, the SV list is used, but the OP list indicates
7343  * which SVs are actually pre-compiled code blocks
7344  *
7345  * The SVs in the list have magic and qr overloading applied to them (and
7346  * the list may be modified in-place with replacement SVs in the latter
7347  * case).
7348  *
7349  * If the pattern hasn't changed from old_re, then old_re will be
7350  * returned.
7351  *
7352  * eng is the current engine. If that engine has an op_comp method, then
7353  * handle directly (i.e. we assume that op_comp was us); otherwise, just
7354  * do the initial concatenation of arguments and pass on to the external
7355  * engine.
7356  *
7357  * If is_bare_re is not null, set it to a boolean indicating whether the
7358  * arg list reduced (after overloading) to a single bare regex which has
7359  * been returned (i.e. /$qr/).
7360  *
7361  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7362  *
7363  * pm_flags contains the PMf_* flags, typically based on those from the
7364  * pm_flags field of the related PMOP. Currently we're only interested in
7365  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
7366  *
7367  * For many years this code had an initial sizing pass that calculated
7368  * (sometimes incorrectly, leading to security holes) the size needed for the
7369  * compiled pattern.  That was changed by commit
7370  * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7371  * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
7372  * references to this sizing pass.
7373  *
7374  * Now, an initial crude guess as to the size needed is made, based on the
7375  * length of the pattern.  Patches welcome to improve that guess.  That amount
7376  * of space is malloc'd and then immediately freed, and then clawed back node
7377  * by node.  This design is to minimze, to the extent possible, memory churn
7378  * when doing the the reallocs.
7379  *
7380  * A separate parentheses counting pass may be needed in some cases.
7381  * (Previously the sizing pass did this.)  Patches welcome to reduce the number
7382  * of these cases.
7383  *
7384  * The existence of a sizing pass necessitated design decisions that are no
7385  * longer needed.  There are potential areas of simplification.
7386  *
7387  * Beware that the optimization-preparation code in here knows about some
7388  * of the structure of the compiled regexp.  [I'll say.]
7389  */
7390
7391 REGEXP *
7392 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7393                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
7394                      bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7395 {
7396     dVAR;
7397     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
7398     STRLEN plen;
7399     char *exp;
7400     regnode *scan;
7401     I32 flags;
7402     SSize_t minlen = 0;
7403     U32 rx_flags;
7404     SV *pat;
7405     SV** new_patternp = patternp;
7406
7407     /* these are all flags - maybe they should be turned
7408      * into a single int with different bit masks */
7409     I32 sawlookahead = 0;
7410     I32 sawplus = 0;
7411     I32 sawopen = 0;
7412     I32 sawminmod = 0;
7413
7414     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7415     bool recompile = 0;
7416     bool runtime_code = 0;
7417     scan_data_t data;
7418     RExC_state_t RExC_state;
7419     RExC_state_t * const pRExC_state = &RExC_state;
7420 #ifdef TRIE_STUDY_OPT
7421     int restudied = 0;
7422     RExC_state_t copyRExC_state;
7423 #endif
7424     GET_RE_DEBUG_FLAGS_DECL;
7425
7426     PERL_ARGS_ASSERT_RE_OP_COMPILE;
7427
7428     DEBUG_r(if (!PL_colorset) reginitcolors());
7429
7430
7431     pRExC_state->warn_text = NULL;
7432     pRExC_state->unlexed_names = NULL;
7433     pRExC_state->code_blocks = NULL;
7434
7435     if (is_bare_re)
7436         *is_bare_re = FALSE;
7437
7438     if (expr && (expr->op_type == OP_LIST ||
7439                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7440         /* allocate code_blocks if needed */
7441         OP *o;
7442         int ncode = 0;
7443
7444         for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7445             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7446                 ncode++; /* count of DO blocks */
7447
7448         if (ncode)
7449             pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7450     }
7451
7452     if (!pat_count) {
7453         /* compile-time pattern with just OP_CONSTs and DO blocks */
7454
7455         int n;
7456         OP *o;
7457
7458         /* find how many CONSTs there are */
7459         assert(expr);
7460         n = 0;
7461         if (expr->op_type == OP_CONST)
7462             n = 1;
7463         else
7464             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7465                 if (o->op_type == OP_CONST)
7466                     n++;
7467             }
7468
7469         /* fake up an SV array */
7470
7471         assert(!new_patternp);
7472         Newx(new_patternp, n, SV*);
7473         SAVEFREEPV(new_patternp);
7474         pat_count = n;
7475
7476         n = 0;
7477         if (expr->op_type == OP_CONST)
7478             new_patternp[n] = cSVOPx_sv(expr);
7479         else
7480             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7481                 if (o->op_type == OP_CONST)
7482                     new_patternp[n++] = cSVOPo_sv;
7483             }
7484
7485     }
7486
7487     DEBUG_PARSE_r(Perl_re_printf( aTHX_
7488         "Assembling pattern from %d elements%s\n", pat_count,
7489             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7490
7491     /* set expr to the first arg op */
7492
7493     if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7494          && expr->op_type != OP_CONST)
7495     {
7496             expr = cLISTOPx(expr)->op_first;
7497             assert(   expr->op_type == OP_PUSHMARK
7498                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7499                    || expr->op_type == OP_PADRANGE);
7500             expr = OpSIBLING(expr);
7501     }
7502
7503     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7504                         expr, &recompile, NULL);
7505
7506     /* handle bare (possibly after overloading) regex: foo =~ $re */
7507     {
7508         SV *re = pat;
7509         if (SvROK(re))
7510             re = SvRV(re);
7511         if (SvTYPE(re) == SVt_REGEXP) {
7512             if (is_bare_re)
7513                 *is_bare_re = TRUE;
7514             SvREFCNT_inc(re);
7515             DEBUG_PARSE_r(Perl_re_printf( aTHX_
7516                 "Precompiled pattern%s\n",
7517                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7518
7519             return (REGEXP*)re;
7520         }
7521     }
7522
7523     exp = SvPV_nomg(pat, plen);
7524
7525     if (!eng->op_comp) {
7526         if ((SvUTF8(pat) && IN_BYTES)
7527                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7528         {
7529             /* make a temporary copy; either to convert to bytes,
7530              * or to avoid repeating get-magic / overloaded stringify */
7531             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7532                                         (IN_BYTES ? 0 : SvUTF8(pat)));
7533         }
7534         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7535     }
7536
7537     /* ignore the utf8ness if the pattern is 0 length */
7538     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7539     RExC_uni_semantics = 0;
7540     RExC_contains_locale = 0;
7541     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7542     RExC_in_script_run = 0;
7543     RExC_study_started = 0;
7544     pRExC_state->runtime_code_qr = NULL;
7545     RExC_frame_head= NULL;
7546     RExC_frame_last= NULL;
7547     RExC_frame_count= 0;
7548     RExC_latest_warn_offset = 0;
7549     RExC_use_BRANCHJ = 0;
7550     RExC_total_parens = 0;
7551     RExC_open_parens = NULL;
7552     RExC_close_parens = NULL;
7553     RExC_paren_names = NULL;
7554     RExC_size = 0;
7555     RExC_seen_d_op = FALSE;
7556 #ifdef DEBUGGING
7557     RExC_paren_name_list = NULL;
7558 #endif
7559
7560     DEBUG_r({
7561         RExC_mysv1= sv_newmortal();
7562         RExC_mysv2= sv_newmortal();
7563     });
7564
7565     DEBUG_COMPILE_r({
7566             SV *dsv= sv_newmortal();
7567             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7568             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
7569                           PL_colors[4], PL_colors[5], s);
7570         });
7571
7572     /* we jump here if we have to recompile, e.g., from upgrading the pattern
7573      * to utf8 */
7574
7575     if ((pm_flags & PMf_USE_RE_EVAL)
7576                 /* this second condition covers the non-regex literal case,
7577                  * i.e.  $foo =~ '(?{})'. */
7578                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7579     )
7580         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7581
7582   redo_parse:
7583     /* return old regex if pattern hasn't changed */
7584     /* XXX: note in the below we have to check the flags as well as the
7585      * pattern.
7586      *
7587      * Things get a touch tricky as we have to compare the utf8 flag
7588      * independently from the compile flags.  */
7589
7590     if (   old_re
7591         && !recompile
7592         && !!RX_UTF8(old_re) == !!RExC_utf8
7593         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7594         && RX_PRECOMP(old_re)
7595         && RX_PRELEN(old_re) == plen
7596         && memEQ(RX_PRECOMP(old_re), exp, plen)
7597         && !runtime_code /* with runtime code, always recompile */ )
7598     {
7599         DEBUG_COMPILE_r({
7600             SV *dsv= sv_newmortal();
7601             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7602             Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
7603                           PL_colors[4], PL_colors[5], s);
7604         });
7605         return old_re;
7606     }
7607
7608     /* Allocate the pattern's SV */
7609     RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7610     RExC_rx = ReANY(Rx);
7611     if ( RExC_rx == NULL )
7612         FAIL("Regexp out of space");
7613
7614     rx_flags = orig_rx_flags;
7615
7616     if (   (UTF || RExC_uni_semantics)
7617         && initial_charset == REGEX_DEPENDS_CHARSET)
7618     {
7619
7620         /* Set to use unicode semantics if the pattern is in utf8 and has the
7621          * 'depends' charset specified, as it means unicode when utf8  */
7622         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7623         RExC_uni_semantics = 1;
7624     }
7625
7626     RExC_pm_flags = pm_flags;
7627
7628     if (runtime_code) {
7629         assert(TAINTING_get || !TAINT_get);
7630         if (TAINT_get)
7631             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7632
7633         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7634             /* whoops, we have a non-utf8 pattern, whilst run-time code
7635              * got compiled as utf8. Try again with a utf8 pattern */
7636             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7637                 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7638             goto redo_parse;
7639         }
7640     }
7641     assert(!pRExC_state->runtime_code_qr);
7642
7643     RExC_sawback = 0;
7644
7645     RExC_seen = 0;
7646     RExC_maxlen = 0;
7647     RExC_in_lookbehind = 0;
7648     RExC_in_lookahead = 0;
7649     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7650     RExC_recode_x_to_native = 0;
7651     RExC_in_multi_char_class = 0;
7652
7653     RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7654     RExC_precomp_end = RExC_end = exp + plen;
7655     RExC_nestroot = 0;
7656     RExC_whilem_seen = 0;
7657     RExC_end_op = NULL;
7658     RExC_recurse = NULL;
7659     RExC_study_chunk_recursed = NULL;
7660     RExC_study_chunk_recursed_bytes= 0;
7661     RExC_recurse_count = 0;
7662     pRExC_state->code_index = 0;
7663
7664     /* Initialize the string in the compiled pattern.  This is so that there is
7665      * something to output if necessary */
7666     set_regex_pv(pRExC_state, Rx);
7667
7668     DEBUG_PARSE_r({
7669         Perl_re_printf( aTHX_
7670             "Starting parse and generation\n");
7671         RExC_lastnum=0;
7672         RExC_lastparse=NULL;
7673     });
7674
7675     /* Allocate space and zero-initialize. Note, the two step process
7676        of zeroing when in debug mode, thus anything assigned has to
7677        happen after that */
7678     if (!  RExC_size) {
7679
7680         /* On the first pass of the parse, we guess how big this will be.  Then
7681          * we grow in one operation to that amount and then give it back.  As
7682          * we go along, we re-allocate what we need.
7683          *
7684          * XXX Currently the guess is essentially that the pattern will be an
7685          * EXACT node with one byte input, one byte output.  This is crude, and
7686          * better heuristics are welcome.
7687          *
7688          * On any subsequent passes, we guess what we actually computed in the
7689          * latest earlier pass.  Such a pass probably didn't complete so is
7690          * missing stuff.  We could improve those guesses by knowing where the
7691          * parse stopped, and use the length so far plus apply the above
7692          * assumption to what's left. */
7693         RExC_size = STR_SZ(RExC_end - RExC_start);
7694     }
7695
7696     Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7697     if ( RExC_rxi == NULL )
7698         FAIL("Regexp out of space");
7699
7700     Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7701     RXi_SET( RExC_rx, RExC_rxi );
7702
7703     /* We start from 0 (over from 0 in the case this is a reparse.  The first
7704      * node parsed will give back any excess memory we have allocated so far).
7705      * */
7706     RExC_size = 0;
7707
7708     /* non-zero initialization begins here */
7709     RExC_rx->engine= eng;
7710     RExC_rx->extflags = rx_flags;
7711     RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7712
7713     if (pm_flags & PMf_IS_QR) {
7714         RExC_rxi->code_blocks = pRExC_state->code_blocks;
7715         if (RExC_rxi->code_blocks) {
7716             RExC_rxi->code_blocks->refcnt++;
7717         }
7718     }
7719
7720     RExC_rx->intflags = 0;
7721
7722     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
7723     RExC_parse = exp;
7724
7725     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7726      * code makes sure the final byte is an uncounted NUL.  But should this
7727      * ever not be the case, lots of things could read beyond the end of the
7728      * buffer: loops like
7729      *      while(isFOO(*RExC_parse)) RExC_parse++;
7730      *      strchr(RExC_parse, "foo");
7731      * etc.  So it is worth noting. */
7732     assert(*RExC_end == '\0');
7733
7734     RExC_naughty = 0;
7735     RExC_npar = 1;
7736     RExC_parens_buf_size = 0;
7737     RExC_emit_start = RExC_rxi->program;
7738     pRExC_state->code_index = 0;
7739
7740     *((char*) RExC_emit_start) = (char) REG_MAGIC;
7741     RExC_emit = 1;
7742
7743     /* Do the parse */
7744     if (reg(pRExC_state, 0, &flags, 1)) {
7745
7746         /* Success!, But we may need to redo the parse knowing how many parens
7747          * there actually are */
7748         if (IN_PARENS_PASS) {
7749             flags |= RESTART_PARSE;
7750         }
7751
7752         /* We have that number in RExC_npar */
7753         RExC_total_parens = RExC_npar;
7754     }
7755     else if (! MUST_RESTART(flags)) {
7756         ReREFCNT_dec(Rx);
7757         Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7758     }
7759
7760     /* Here, we either have success, or we have to redo the parse for some reason */
7761     if (MUST_RESTART(flags)) {
7762
7763         /* It's possible to write a regexp in ascii that represents Unicode
7764         codepoints outside of the byte range, such as via \x{100}. If we
7765         detect such a sequence we have to convert the entire pattern to utf8
7766         and then recompile, as our sizing calculation will have been based
7767         on 1 byte == 1 character, but we will need to use utf8 to encode
7768         at least some part of the pattern, and therefore must convert the whole
7769         thing.
7770         -- dmq */
7771         if (flags & NEED_UTF8) {
7772
7773             /* We have stored the offset of the final warning output so far.
7774              * That must be adjusted.  Any variant characters between the start
7775              * of the pattern and this warning count for 2 bytes in the final,
7776              * so just add them again */
7777             if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7778                 RExC_latest_warn_offset +=
7779                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
7780                                                 + RExC_latest_warn_offset);
7781             }
7782             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7783             pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7784             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7785         }
7786         else {
7787             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7788         }
7789
7790         if (ALL_PARENS_COUNTED) {
7791             /* Make enough room for all the known parens, and zero it */
7792             Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7793             Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7794             RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
7795
7796             Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7797             Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7798         }
7799         else { /* Parse did not complete.  Reinitialize the parentheses
7800                   structures */
7801             RExC_total_parens = 0;
7802             if (RExC_open_parens) {
7803                 Safefree(RExC_open_parens);
7804                 RExC_open_parens = NULL;
7805             }
7806             if (RExC_close_parens) {
7807                 Safefree(RExC_close_parens);
7808                 RExC_close_parens = NULL;
7809             }
7810         }
7811
7812         /* Clean up what we did in this parse */
7813         SvREFCNT_dec_NN(RExC_rx_sv);
7814
7815         goto redo_parse;
7816     }
7817
7818     /* Here, we have successfully parsed and generated the pattern's program
7819      * for the regex engine.  We are ready to finish things up and look for
7820      * optimizations. */
7821
7822     /* Update the string to compile, with correct modifiers, etc */
7823     set_regex_pv(pRExC_state, Rx);
7824
7825     RExC_rx->nparens = RExC_total_parens - 1;
7826
7827     /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7828     if (RExC_whilem_seen > 15)
7829         RExC_whilem_seen = 15;
7830
7831     DEBUG_PARSE_r({
7832         Perl_re_printf( aTHX_
7833             "Required size %" IVdf " nodes\n", (IV)RExC_size);
7834         RExC_lastnum=0;
7835         RExC_lastparse=NULL;
7836     });
7837
7838 #ifdef RE_TRACK_PATTERN_OFFSETS
7839     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7840                           "%s %" UVuf " bytes for offset annotations.\n",
7841                           RExC_offsets ? "Got" : "Couldn't get",
7842                           (UV)((RExC_offsets[0] * 2 + 1))));
7843     DEBUG_OFFSETS_r(if (RExC_offsets) {
7844         const STRLEN len = RExC_offsets[0];
7845         STRLEN i;
7846         GET_RE_DEBUG_FLAGS_DECL;
7847         Perl_re_printf( aTHX_
7848                       "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7849         for (i = 1; i <= len; i++) {
7850             if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7851                 Perl_re_printf( aTHX_  "%" UVuf ":%" UVuf "[%" UVuf "] ",
7852                 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7853         }
7854         Perl_re_printf( aTHX_  "\n");
7855     });
7856
7857 #else
7858     SetProgLen(RExC_rxi,RExC_size);
7859 #endif
7860
7861     DEBUG_DUMP_PRE_OPTIMIZE_r({
7862         SV * const sv = sv_newmortal();
7863         RXi_GET_DECL(RExC_rx, ri);
7864         DEBUG_RExC_seen();
7865         Perl_re_printf( aTHX_ "Program before optimization:\n");
7866
7867         (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
7868                         sv, 0, 0);
7869     });
7870
7871     DEBUG_OPTIMISE_r(
7872         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7873     );
7874
7875     /* XXXX To minimize changes to RE engine we always allocate
7876        3-units-long substrs field. */
7877     Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7878     if (RExC_recurse_count) {
7879         Newx(RExC_recurse, RExC_recurse_count, regnode *);
7880         SAVEFREEPV(RExC_recurse);
7881     }
7882
7883     if (RExC_seen & REG_RECURSE_SEEN) {
7884         /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
7885          * So its 1 if there are no parens. */
7886         RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
7887                                          ((RExC_total_parens & 0x07) != 0);
7888         Newx(RExC_study_chunk_recursed,
7889              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7890         SAVEFREEPV(RExC_study_chunk_recursed);
7891     }
7892
7893   reStudy:
7894     RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7895     DEBUG_r(
7896         RExC_study_chunk_recursed_count= 0;
7897     );
7898     Zero(RExC_rx->substrs, 1, struct reg_substr_data);
7899     if (RExC_study_chunk_recursed) {
7900         Zero(RExC_study_chunk_recursed,
7901              RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7902     }
7903
7904
7905 #ifdef TRIE_STUDY_OPT
7906     if (!restudied) {
7907         StructCopy(&zero_scan_data, &data, scan_data_t);
7908         copyRExC_state = RExC_state;
7909     } else {
7910         U32 seen=RExC_seen;
7911         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7912
7913         RExC_state = copyRExC_state;
7914         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7915             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7916         else
7917             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7918         StructCopy(&zero_scan_data, &data, scan_data_t);
7919     }
7920 #else
7921     StructCopy(&zero_scan_data, &data, scan_data_t);
7922 #endif
7923
7924     /* Dig out information for optimizations. */
7925     RExC_rx->extflags = RExC_flags; /* was pm_op */
7926     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7927
7928     if (UTF)
7929         SvUTF8_on(Rx);  /* Unicode in it? */
7930     RExC_rxi->regstclass = NULL;
7931     if (RExC_naughty >= TOO_NAUGHTY)    /* Probably an expensive pattern. */
7932         RExC_rx->intflags |= PREGf_NAUGHTY;
7933     scan = RExC_rxi->program + 1;               /* First BRANCH. */
7934
7935     /* testing for BRANCH here tells us whether there is "must appear"
7936        data in the pattern. If there is then we can use it for optimisations */
7937     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7938                                                   */
7939         SSize_t fake;
7940         STRLEN longest_length[2];
7941         regnode_ssc ch_class; /* pointed to by data */
7942         int stclass_flag;
7943         SSize_t last_close = 0; /* pointed to by data */
7944         regnode *first= scan;
7945         regnode *first_next= regnext(first);
7946         int i;
7947
7948         /*
7949          * Skip introductions and multiplicators >= 1
7950          * so that we can extract the 'meat' of the pattern that must
7951          * match in the large if() sequence following.
7952          * NOTE that EXACT is NOT covered here, as it is normally
7953          * picked up by the optimiser separately.
7954          *
7955          * This is unfortunate as the optimiser isnt handling lookahead
7956          * properly currently.
7957          *
7958          */
7959         while ((OP(first) == OPEN && (sawopen = 1)) ||
7960                /* An OR of *one* alternative - should not happen now. */
7961             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7962             /* for now we can't handle lookbehind IFMATCH*/
7963             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7964             (OP(first) == PLUS) ||
7965             (OP(first) == MINMOD) ||
7966                /* An {n,m} with n>0 */
7967             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7968             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7969         {
7970                 /*
7971                  * the only op that could be a regnode is PLUS, all the rest
7972                  * will be regnode_1 or regnode_2.
7973                  *
7974                  * (yves doesn't think this is true)
7975                  */
7976                 if (OP(first) == PLUS)
7977                     sawplus = 1;
7978                 else {
7979                     if (OP(first) == MINMOD)
7980                         sawminmod = 1;
7981                     first += regarglen[OP(first)];
7982                 }
7983                 first = NEXTOPER(first);
7984                 first_next= regnext(first);
7985         }
7986
7987         /* Starting-point info. */
7988       again:
7989         DEBUG_PEEP("first:", first, 0, 0);
7990         /* Ignore EXACT as we deal with it later. */
7991         if (PL_regkind[OP(first)] == EXACT) {
7992             if (   OP(first) == EXACT
7993                 || OP(first) == LEXACT
7994                 || OP(first) == EXACT_REQ8
7995                 || OP(first) == LEXACT_REQ8
7996                 || OP(first) == EXACTL)
7997             {
7998                 NOOP;   /* Empty, get anchored substr later. */
7999             }
8000             else
8001                 RExC_rxi->regstclass = first;
8002         }
8003 #ifdef TRIE_STCLASS
8004         else if (PL_regkind[OP(first)] == TRIE &&
8005                 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8006         {
8007             /* this can happen only on restudy */
8008             RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8009         }
8010 #endif
8011         else if (REGNODE_SIMPLE(OP(first)))
8012             RExC_rxi->regstclass = first;
8013         else if (PL_regkind[OP(first)] == BOUND ||
8014                  PL_regkind[OP(first)] == NBOUND)
8015             RExC_rxi->regstclass = first;
8016         else if (PL_regkind[OP(first)] == BOL) {
8017             RExC_rx->intflags |= (OP(first) == MBOL
8018                            ? PREGf_ANCH_MBOL
8019                            : PREGf_ANCH_SBOL);
8020             first = NEXTOPER(first);
8021             goto again;
8022         }
8023         else if (OP(first) == GPOS) {
8024             RExC_rx->intflags |= PREGf_ANCH_GPOS;
8025             first = NEXTOPER(first);
8026             goto again;
8027         }
8028         else if ((!sawopen || !RExC_sawback) &&
8029             !sawlookahead &&
8030             (OP(first) == STAR &&
8031             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8032             !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8033         {
8034             /* turn .* into ^.* with an implied $*=1 */
8035             const int type =
8036                 (OP(NEXTOPER(first)) == REG_ANY)
8037                     ? PREGf_ANCH_MBOL
8038                     : PREGf_ANCH_SBOL;
8039             RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8040             first = NEXTOPER(first);
8041             goto again;
8042         }
8043         if (sawplus && !sawminmod && !sawlookahead
8044             && (!sawopen || !RExC_sawback)
8045             && !pRExC_state->code_blocks) /* May examine pos and $& */
8046             /* x+ must match at the 1st pos of run of x's */
8047             RExC_rx->intflags |= PREGf_SKIP;
8048
8049         /* Scan is after the zeroth branch, first is atomic matcher. */
8050 #ifdef TRIE_STUDY_OPT
8051         DEBUG_PARSE_r(
8052             if (!restudied)
8053                 Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8054                               (IV)(first - scan + 1))
8055         );
8056 #else
8057         DEBUG_PARSE_r(
8058             Perl_re_printf( aTHX_  "first at %" IVdf "\n",
8059                 (IV)(first - scan + 1))
8060         );
8061 #endif
8062
8063
8064         /*
8065         * If there's something expensive in the r.e., find the
8066         * longest literal string that must appear and make it the
8067         * regmust.  Resolve ties in favor of later strings, since
8068         * the regstart check works with the beginning of the r.e.
8069         * and avoiding duplication strengthens checking.  Not a
8070         * strong reason, but sufficient in the absence of others.
8071         * [Now we resolve ties in favor of the earlier string if
8072         * it happens that c_offset_min has been invalidated, since the
8073         * earlier string may buy us something the later one won't.]
8074         */
8075
8076         data.substrs[0].str = newSVpvs("");
8077         data.substrs[1].str = newSVpvs("");
8078         data.last_found = newSVpvs("");
8079         data.cur_is_floating = 0; /* initially any found substring is fixed */
8080         ENTER_with_name("study_chunk");
8081         SAVEFREESV(data.substrs[0].str);
8082         SAVEFREESV(data.substrs[1].str);
8083         SAVEFREESV(data.last_found);
8084         first = scan;
8085         if (!RExC_rxi->regstclass) {
8086             ssc_init(pRExC_state, &ch_class);
8087             data.start_class = &ch_class;
8088             stclass_flag = SCF_DO_STCLASS_AND;
8089         } else                          /* XXXX Check for BOUND? */
8090             stclass_flag = 0;
8091         data.last_closep = &last_close;
8092
8093         DEBUG_RExC_seen();
8094         /*
8095          * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8096          * (NO top level branches)
8097          */
8098         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8099                              scan + RExC_size, /* Up to end */
8100             &data, -1, 0, NULL,
8101             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8102                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8103             0);
8104
8105
8106         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8107
8108
8109         if ( RExC_total_parens == 1 && !data.cur_is_floating
8110              && data.last_start_min == 0 && data.last_end > 0
8111              && !RExC_seen_zerolen
8112              && !(RExC_seen & REG_VERBARG_SEEN)
8113              && !(RExC_seen & REG_GPOS_SEEN)
8114         ){
8115             RExC_rx->extflags |= RXf_CHECK_ALL;
8116         }
8117         scan_commit(pRExC_state, &data,&minlen, 0);
8118
8119
8120         /* XXX this is done in reverse order because that's the way the
8121          * code was before it was parameterised. Don't know whether it
8122          * actually needs doing in reverse order. DAPM */
8123         for (i = 1; i >= 0; i--) {
8124             longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8125
8126             if (   !(   i
8127                      && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
8128                      &&    data.substrs[0].min_offset
8129                         == data.substrs[1].min_offset
8130                      &&    SvCUR(data.substrs[0].str)
8131                         == SvCUR(data.substrs[1].str)
8132                     )
8133                 && S_setup_longest (aTHX_ pRExC_state,
8134                                         &(RExC_rx->substrs->data[i]),
8135                                         &(data.substrs[i]),
8136                                         longest_length[i]))
8137             {
8138                 RExC_rx->substrs->data[i].min_offset =
8139                         data.substrs[i].min_offset - data.substrs[i].lookbehind;
8140
8141                 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8142                 /* Don't offset infinity */
8143                 if (data.substrs[i].max_offset < SSize_t_MAX)
8144                     RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8145                 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8146             }
8147             else {
8148                 RExC_rx->substrs->data[i].substr      = NULL;
8149                 RExC_rx->substrs->data[i].utf8_substr = NULL;
8150                 longest_length[i] = 0;
8151             }
8152         }
8153
8154         LEAVE_with_name("study_chunk");
8155
8156         if (RExC_rxi->regstclass
8157             && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8158             RExC_rxi->regstclass = NULL;
8159
8160         if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8161               || RExC_rx->substrs->data[0].min_offset)
8162             && stclass_flag
8163             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8164             && is_ssc_worth_it(pRExC_state, data.start_class))
8165         {
8166             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8167
8168             ssc_finalize(pRExC_state, data.start_class);
8169
8170             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8171             StructCopy(data.start_class,
8172                        (regnode_ssc*)RExC_rxi->data->data[n],
8173                        regnode_ssc);
8174             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8175             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8176             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8177                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8178                       Perl_re_printf( aTHX_
8179                                     "synthetic stclass \"%s\".\n",
8180                                     SvPVX_const(sv));});
8181             data.start_class = NULL;
8182         }
8183
8184         /* A temporary algorithm prefers floated substr to fixed one of
8185          * same length to dig more info. */
8186         i = (longest_length[0] <= longest_length[1]);
8187         RExC_rx->substrs->check_ix = i;
8188         RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
8189         RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
8190         RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
8191         RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8192         RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8193         if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8194             RExC_rx->intflags |= PREGf_NOSCAN;
8195
8196         if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8197             RExC_rx->extflags |= RXf_USE_INTUIT;
8198             if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8199                 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8200         }
8201
8202         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8203         if ( (STRLEN)minlen < longest_length[1] )
8204             minlen= longest_length[1];
8205         if ( (STRLEN)minlen < longest_length[0] )
8206             minlen= longest_length[0];
8207         */
8208     }
8209     else {
8210         /* Several toplevels. Best we can is to set minlen. */
8211         SSize_t fake;
8212         regnode_ssc ch_class;
8213         SSize_t last_close = 0;
8214
8215         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
8216
8217         scan = RExC_rxi->program + 1;
8218         ssc_init(pRExC_state, &ch_class);
8219         data.start_class = &ch_class;
8220         data.last_closep = &last_close;
8221
8222         DEBUG_RExC_seen();
8223         /*
8224          * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8225          * (patterns WITH top level branches)
8226          */
8227         minlen = study_chunk(pRExC_state,
8228             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8229             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8230                                                       ? SCF_TRIE_DOING_RESTUDY
8231                                                       : 0),
8232             0);
8233
8234         CHECK_RESTUDY_GOTO_butfirst(NOOP);
8235
8236         RExC_rx->check_substr = NULL;
8237         RExC_rx->check_utf8 = NULL;
8238         RExC_rx->substrs->data[0].substr      = NULL;
8239         RExC_rx->substrs->data[0].utf8_substr = NULL;
8240         RExC_rx->substrs->data[1].substr      = NULL;
8241         RExC_rx->substrs->data[1].utf8_substr = NULL;
8242
8243         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8244             && is_ssc_worth_it(pRExC_state, data.start_class))
8245         {
8246             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8247
8248             ssc_finalize(pRExC_state, data.start_class);
8249
8250             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8251             StructCopy(data.start_class,
8252                        (regnode_ssc*)RExC_rxi->data->data[n],
8253                        regnode_ssc);
8254             RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8255             RExC_rx->intflags &= ~PREGf_SKIP;   /* Used in find_byclass(). */
8256             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8257                       regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8258                       Perl_re_printf( aTHX_
8259                                     "synthetic stclass \"%s\".\n",
8260                                     SvPVX_const(sv));});
8261             data.start_class = NULL;
8262         }
8263     }
8264
8265     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8266         RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8267         RExC_rx->maxlen = REG_INFTY;
8268     }
8269     else {
8270         RExC_rx->maxlen = RExC_maxlen;
8271     }
8272
8273     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8274        the "real" pattern. */
8275     DEBUG_OPTIMISE_r({
8276         Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8277                       (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8278     });
8279     RExC_rx->minlenret = minlen;
8280     if (RExC_rx->minlen < minlen)
8281         RExC_rx->minlen = minlen;
8282
8283     if (RExC_seen & REG_RECURSE_SEEN ) {
8284         RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8285         Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8286     }
8287     if (RExC_seen & REG_GPOS_SEEN)
8288         RExC_rx->intflags |= PREGf_GPOS_SEEN;
8289     if (RExC_seen & REG_LOOKBEHIND_SEEN)
8290         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8291                                                 lookbehind */
8292     if (pRExC_state->code_blocks)
8293         RExC_rx->extflags |= RXf_EVAL_SEEN;
8294     if (RExC_seen & REG_VERBARG_SEEN)
8295     {
8296         RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8297         RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8298     }
8299     if (RExC_seen & REG_CUTGROUP_SEEN)
8300         RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8301     if (pm_flags & PMf_USE_RE_EVAL)
8302         RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8303     if (RExC_paren_names)
8304         RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8305     else
8306         RXp_PAREN_NAMES(RExC_rx) = NULL;
8307
8308     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8309      * so it can be used in pp.c */
8310     if (RExC_rx->intflags & PREGf_ANCH)
8311         RExC_rx->extflags |= RXf_IS_ANCHORED;
8312
8313
8314     {
8315         /* this is used to identify "special" patterns that might result
8316          * in Perl NOT calling the regex engine and instead doing the match "itself",
8317          * particularly special cases in split//. By having the regex compiler
8318          * do this pattern matching at a regop level (instead of by inspecting the pattern)
8319          * we avoid weird issues with equivalent patterns resulting in different behavior,
8320          * AND we allow non Perl engines to get the same optimizations by the setting the
8321          * flags appropriately - Yves */
8322         regnode *first = RExC_rxi->program + 1;
8323         U8 fop = OP(first);
8324         regnode *next = regnext(first);
8325         U8 nop = OP(next);
8326
8327         if (PL_regkind[fop] == NOTHING && nop == END)
8328             RExC_rx->extflags |= RXf_NULL;
8329         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8330             /* when fop is SBOL first->flags will be true only when it was
8331              * produced by parsing /\A/, and not when parsing /^/. This is
8332              * very important for the split code as there we want to
8333              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8334              * See rt #122761 for more details. -- Yves */
8335             RExC_rx->extflags |= RXf_START_ONLY;
8336         else if (fop == PLUS
8337                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8338                  && nop == END)
8339             RExC_rx->extflags |= RXf_WHITE;
8340         else if ( RExC_rx->extflags & RXf_SPLIT
8341                   && (   fop == EXACT || fop == LEXACT
8342                       || fop == EXACT_REQ8 || fop == LEXACT_REQ8
8343                       || fop == EXACTL)
8344                   && STR_LEN(first) == 1
8345                   && *(STRING(first)) == ' '
8346                   && nop == END )
8347             RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8348
8349     }
8350
8351     if (RExC_contains_locale) {
8352         RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8353     }
8354
8355 #ifdef DEBUGGING
8356     if (RExC_paren_names) {
8357         RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8358         RExC_rxi->data->data[RExC_rxi->name_list_idx]
8359                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
8360     } else
8361 #endif
8362     RExC_rxi->name_list_idx = 0;
8363
8364     while ( RExC_recurse_count > 0 ) {
8365         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8366         /*
8367          * This data structure is set up in study_chunk() and is used
8368          * to calculate the distance between a GOSUB regopcode and
8369          * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8370          * it refers to.
8371          *
8372          * If for some reason someone writes code that optimises
8373          * away a GOSUB opcode then the assert should be changed to
8374          * an if(scan) to guard the ARG2L_SET() - Yves
8375          *
8376          */
8377         assert(scan && OP(scan) == GOSUB);
8378         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8379     }
8380
8381     Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8382     /* assume we don't need to swap parens around before we match */
8383     DEBUG_TEST_r({
8384         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8385             (unsigned long)RExC_study_chunk_recursed_count);
8386     });
8387     DEBUG_DUMP_r({
8388         DEBUG_RExC_seen();
8389         Perl_re_printf( aTHX_ "Final program:\n");
8390         regdump(RExC_rx);
8391     });
8392
8393     if (RExC_open_parens) {
8394         Safefree(RExC_open_parens);
8395         RExC_open_parens = NULL;
8396     }
8397     if (RExC_close_parens) {
8398         Safefree(RExC_close_parens);
8399         RExC_close_parens = NULL;
8400     }
8401
8402 #ifdef USE_ITHREADS
8403     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8404      * by setting the regexp SV to readonly-only instead. If the
8405      * pattern's been recompiled, the USEDness should remain. */
8406     if (old_re && SvREADONLY(old_re))
8407         SvREADONLY_on(Rx);
8408 #endif
8409     return Rx;
8410 }
8411
8412
8413 SV*
8414 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8415                     const U32 flags)
8416 {
8417     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8418
8419     PERL_UNUSED_ARG(value);
8420
8421     if (flags & RXapif_FETCH) {
8422         return reg_named_buff_fetch(rx, key, flags);
8423     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8424         Perl_croak_no_modify();
8425         return NULL;
8426     } else if (flags & RXapif_EXISTS) {
8427         return reg_named_buff_exists(rx, key, flags)
8428             ? &PL_sv_yes
8429             : &PL_sv_no;
8430     } else if (flags & RXapif_REGNAMES) {
8431         return reg_named_buff_all(rx, flags);
8432     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8433         return reg_named_buff_scalar(rx, flags);
8434     } else {
8435         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8436         return NULL;
8437     }
8438 }
8439
8440 SV*
8441 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8442                          const U32 flags)
8443 {
8444     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8445     PERL_UNUSED_ARG(lastkey);
8446
8447     if (flags & RXapif_FIRSTKEY)
8448         return reg_named_buff_firstkey(rx, flags);
8449     else if (flags & RXapif_NEXTKEY)
8450         return reg_named_buff_nextkey(rx, flags);
8451     else {
8452         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8453                                             (int)flags);
8454         return NULL;
8455     }
8456 }
8457
8458 SV*
8459 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8460                           const U32 flags)
8461 {
8462     SV *ret;
8463     struct regexp *const rx = ReANY(r);
8464
8465     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8466
8467     if (rx && RXp_PAREN_NAMES(rx)) {
8468         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8469         if (he_str) {
8470             IV i;
8471             SV* sv_dat=HeVAL(he_str);
8472             I32 *nums=(I32*)SvPVX(sv_dat);
8473             AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8474             for ( i=0; i<SvIVX(sv_dat); i++ ) {
8475                 if ((I32)(rx->nparens) >= nums[i]
8476                     && rx->offs[nums[i]].start != -1
8477                     && rx->offs[nums[i]].end != -1)
8478                 {
8479                     ret = newSVpvs("");
8480                     CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8481                     if (!retarray)
8482                         return ret;
8483                 } else {
8484                     if (retarray)
8485                         ret = newSVsv(&PL_sv_undef);
8486                 }
8487                 if (retarray)
8488                     av_push(retarray, ret);
8489             }
8490             if (retarray)
8491                 return newRV_noinc(MUTABLE_SV(retarray));
8492         }
8493     }
8494     return NULL;
8495 }
8496
8497 bool
8498 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8499                            const U32 flags)
8500 {
8501     struct regexp *const rx = ReANY(r);
8502
8503     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8504
8505     if (rx && RXp_PAREN_NAMES(rx)) {
8506         if (flags & RXapif_ALL) {
8507             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8508         } else {
8509             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8510             if (sv) {
8511                 SvREFCNT_dec_NN(sv);
8512                 return TRUE;
8513             } else {
8514                 return FALSE;
8515             }
8516         }
8517     } else {
8518         return FALSE;
8519     }
8520 }
8521
8522 SV*
8523 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8524 {
8525     struct regexp *const rx = ReANY(r);
8526
8527     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8528
8529     if ( rx && RXp_PAREN_NAMES(rx) ) {
8530         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8531
8532         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8533     } else {
8534         return FALSE;
8535     }
8536 }
8537
8538 SV*
8539 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8540 {
8541     struct regexp *const rx = ReANY(r);
8542     GET_RE_DEBUG_FLAGS_DECL;
8543
8544     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8545
8546     if (rx && RXp_PAREN_NAMES(rx)) {
8547         HV *hv = RXp_PAREN_NAMES(rx);
8548         HE *temphe;
8549         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8550             IV i;
8551             IV parno = 0;
8552             SV* sv_dat = HeVAL(temphe);
8553             I32 *nums = (I32*)SvPVX(sv_dat);
8554             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8555                 if ((I32)(rx->lastparen) >= nums[i] &&
8556                     rx->offs[nums[i]].start != -1 &&
8557                     rx->offs[nums[i]].end != -1)
8558                 {
8559                     parno = nums[i];
8560                     break;
8561                 }
8562             }
8563             if (parno || flags & RXapif_ALL) {
8564                 return newSVhek(HeKEY_hek(temphe));
8565             }
8566         }
8567     }
8568     return NULL;
8569 }
8570
8571 SV*
8572 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8573 {
8574     SV *ret;
8575     AV *av;
8576     SSize_t length;
8577     struct regexp *const rx = ReANY(r);
8578
8579     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8580
8581     if (rx && RXp_PAREN_NAMES(rx)) {
8582         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8583             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8584         } else if (flags & RXapif_ONE) {
8585             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8586             av = MUTABLE_AV(SvRV(ret));
8587             length = av_tindex(av);
8588             SvREFCNT_dec_NN(ret);
8589             return newSViv(length + 1);
8590         } else {
8591             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8592                                                 (int)flags);
8593             return NULL;
8594         }
8595     }
8596     return &PL_sv_undef;
8597 }
8598
8599 SV*
8600 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8601 {
8602     struct regexp *const rx = ReANY(r);
8603     AV *av = newAV();
8604
8605     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8606
8607     if (rx && RXp_PAREN_NAMES(rx)) {
8608         HV *hv= RXp_PAREN_NAMES(rx);
8609         HE *temphe;
8610         (void)hv_iterinit(hv);
8611         while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8612             IV i;
8613             IV parno = 0;
8614             SV* sv_dat = HeVAL(temphe);
8615             I32 *nums = (I32*)SvPVX(sv_dat);
8616             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8617                 if ((I32)(rx->lastparen) >= nums[i] &&
8618                     rx->offs[nums[i]].start != -1 &&
8619                     rx->offs[nums[i]].end != -1)
8620                 {
8621                     parno = nums[i];
8622                     break;
8623                 }
8624             }
8625             if (parno || flags & RXapif_ALL) {
8626                 av_push(av, newSVhek(HeKEY_hek(temphe)));
8627             }
8628         }
8629     }
8630
8631     return newRV_noinc(MUTABLE_SV(av));
8632 }
8633
8634 void
8635 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8636                              SV * const sv)
8637 {
8638     struct regexp *const rx = ReANY(r);
8639     char *s = NULL;
8640     SSize_t i = 0;
8641     SSize_t s1, t1;
8642     I32 n = paren;
8643
8644     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8645
8646     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8647            || n == RX_BUFF_IDX_CARET_FULLMATCH
8648            || n == RX_BUFF_IDX_CARET_POSTMATCH
8649        )
8650     {
8651         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8652         if (!keepcopy) {
8653             /* on something like
8654              *    $r = qr/.../;
8655              *    /$qr/p;
8656              * the KEEPCOPY is set on the PMOP rather than the regex */
8657             if (PL_curpm && r == PM_GETRE(PL_curpm))
8658                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8659         }
8660         if (!keepcopy)
8661             goto ret_undef;
8662     }
8663
8664     if (!rx->subbeg)
8665         goto ret_undef;
8666
8667     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8668         /* no need to distinguish between them any more */
8669         n = RX_BUFF_IDX_FULLMATCH;
8670
8671     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8672         && rx->offs[0].start != -1)
8673     {
8674         /* $`, ${^PREMATCH} */
8675         i = rx->offs[0].start;
8676         s = rx->subbeg;
8677     }
8678     else
8679     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8680         && rx->offs[0].end != -1)
8681     {
8682         /* $', ${^POSTMATCH} */
8683         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8684         i = rx->sublen + rx->suboffset - rx->offs[0].end;
8685     }
8686     else
8687     if (inRANGE(n, 0, (I32)rx->nparens) &&
8688         (s1 = rx->offs[n].start) != -1  &&
8689         (t1 = rx->offs[n].end) != -1)
8690     {
8691         /* $&, ${^MATCH},  $1 ... */
8692         i = t1 - s1;
8693         s = rx->subbeg + s1 - rx->suboffset;
8694     } else {
8695         goto ret_undef;
8696     }
8697
8698     assert(s >= rx->subbeg);
8699     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8700     if (i >= 0) {
8701 #ifdef NO_TAINT_SUPPORT
8702         sv_setpvn(sv, s, i);
8703 #else
8704         const int oldtainted = TAINT_get;
8705         TAINT_NOT;
8706         sv_setpvn(sv, s, i);
8707         TAINT_set(oldtainted);
8708 #endif
8709         if (RXp_MATCH_UTF8(rx))
8710             SvUTF8_on(sv);
8711         else
8712             SvUTF8_off(sv);
8713         if (TAINTING_get) {
8714             if (RXp_MATCH_TAINTED(rx)) {
8715                 if (SvTYPE(sv) >= SVt_PVMG) {
8716                     MAGIC* const mg = SvMAGIC(sv);
8717                     MAGIC* mgt;
8718                     TAINT;
8719                     SvMAGIC_set(sv, mg->mg_moremagic);
8720                     SvTAINT(sv);
8721                     if ((mgt = SvMAGIC(sv))) {
8722                         mg->mg_moremagic = mgt;
8723                         SvMAGIC_set(sv, mg);
8724                     }
8725                 } else {
8726                     TAINT;
8727                     SvTAINT(sv);
8728                 }
8729             } else
8730                 SvTAINTED_off(sv);
8731         }
8732     } else {
8733       ret_undef:
8734         sv_set_undef(sv);
8735         return;
8736     }
8737 }
8738
8739 void
8740 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8741                                                          SV const * const value)
8742 {
8743     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8744
8745     PERL_UNUSED_ARG(rx);
8746     PERL_UNUSED_ARG(paren);
8747     PERL_UNUSED_ARG(value);
8748
8749     if (!PL_localizing)
8750         Perl_croak_no_modify();
8751 }
8752
8753 I32
8754 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8755                               const I32 paren)
8756 {
8757     struct regexp *const rx = ReANY(r);
8758     I32 i;
8759     I32 s1, t1;
8760
8761     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8762
8763     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8764         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8765         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8766     )
8767     {
8768         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8769         if (!keepcopy) {
8770             /* on something like
8771              *    $r = qr/.../;
8772              *    /$qr/p;
8773              * the KEEPCOPY is set on the PMOP rather than the regex */
8774             if (PL_curpm && r == PM_GETRE(PL_curpm))
8775                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8776         }
8777         if (!keepcopy)
8778             goto warn_undef;
8779     }
8780
8781     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8782     switch (paren) {
8783       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8784       case RX_BUFF_IDX_PREMATCH:       /* $` */
8785         if (rx->offs[0].start != -1) {
8786                         i = rx->offs[0].start;
8787                         if (i > 0) {
8788                                 s1 = 0;
8789                                 t1 = i;
8790                                 goto getlen;
8791                         }
8792             }
8793         return 0;
8794
8795       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8796       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8797             if (rx->offs[0].end != -1) {
8798                         i = rx->sublen - rx->offs[0].end;
8799                         if (i > 0) {
8800                                 s1 = rx->offs[0].end;
8801                                 t1 = rx->sublen;
8802                                 goto getlen;
8803                         }
8804             }
8805         return 0;
8806
8807       default: /* $& / ${^MATCH}, $1, $2, ... */
8808             if (paren <= (I32)rx->nparens &&
8809             (s1 = rx->offs[paren].start) != -1 &&
8810             (t1 = rx->offs[paren].end) != -1)
8811             {
8812             i = t1 - s1;
8813             goto getlen;
8814         } else {
8815           warn_undef:
8816             if (ckWARN(WARN_UNINITIALIZED))
8817                 report_uninit((const SV *)sv);
8818             return 0;
8819         }
8820     }
8821   getlen:
8822     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8823         const char * const s = rx->subbeg - rx->suboffset + s1;
8824         const U8 *ep;
8825         STRLEN el;
8826
8827         i = t1 - s1;
8828         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8829                         i = el;
8830     }
8831     return i;
8832 }
8833
8834 SV*
8835 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8836 {
8837     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8838         PERL_UNUSED_ARG(rx);
8839         if (0)
8840             return NULL;
8841         else
8842             return newSVpvs("Regexp");
8843 }
8844
8845 /* Scans the name of a named buffer from the pattern.
8846  * If flags is REG_RSN_RETURN_NULL returns null.
8847  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8848  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8849  * to the parsed name as looked up in the RExC_paren_names hash.
8850  * If there is an error throws a vFAIL().. type exception.
8851  */
8852
8853 #define REG_RSN_RETURN_NULL    0
8854 #define REG_RSN_RETURN_NAME    1
8855 #define REG_RSN_RETURN_DATA    2
8856
8857 STATIC SV*
8858 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8859 {
8860     char *name_start = RExC_parse;
8861     SV* sv_name;
8862
8863     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8864
8865     assert (RExC_parse <= RExC_end);
8866     if (RExC_parse == RExC_end) NOOP;
8867     else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8868          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8869           * using do...while */
8870         if (UTF)
8871             do {
8872                 RExC_parse += UTF8SKIP(RExC_parse);
8873             } while (   RExC_parse < RExC_end
8874                      && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8875         else
8876             do {
8877                 RExC_parse++;
8878             } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8879     } else {
8880         RExC_parse++; /* so the <- from the vFAIL is after the offending
8881                          character */
8882         vFAIL("Group name must start with a non-digit word character");
8883     }
8884     sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8885                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8886     if ( flags == REG_RSN_RETURN_NAME)
8887         return sv_name;
8888     else if (flags==REG_RSN_RETURN_DATA) {
8889         HE *he_str = NULL;
8890         SV *sv_dat = NULL;
8891         if ( ! sv_name )      /* should not happen*/
8892             Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8893         if (RExC_paren_names)
8894             he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8895         if ( he_str )
8896             sv_dat = HeVAL(he_str);
8897         if ( ! sv_dat ) {   /* Didn't find group */
8898
8899             /* It might be a forward reference; we can't fail until we
8900                 * know, by completing the parse to get all the groups, and
8901                 * then reparsing */
8902             if (ALL_PARENS_COUNTED)  {
8903                 vFAIL("Reference to nonexistent named group");
8904             }
8905             else {
8906                 REQUIRE_PARENS_PASS;
8907             }
8908         }
8909         return sv_dat;
8910     }
8911
8912     Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8913                      (unsigned long) flags);
8914 }
8915
8916 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8917     if (RExC_lastparse!=RExC_parse) {                           \
8918         Perl_re_printf( aTHX_  "%s",                            \
8919             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8920                 RExC_end - RExC_parse, 16,                      \
8921                 "", "",                                         \
8922                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8923                 PERL_PV_PRETTY_ELLIPSES   |                     \
8924                 PERL_PV_PRETTY_LTGT       |                     \
8925                 PERL_PV_ESCAPE_RE         |                     \
8926                 PERL_PV_PRETTY_EXACTSIZE                        \
8927             )                                                   \
8928         );                                                      \
8929     } else                                                      \
8930         Perl_re_printf( aTHX_ "%16s","");                       \
8931                                                                 \
8932     if (RExC_lastnum!=RExC_emit)                                \
8933        Perl_re_printf( aTHX_ "|%4d", RExC_emit);                \
8934     else                                                        \
8935        Perl_re_printf( aTHX_ "|%4s","");                        \
8936     Perl_re_printf( aTHX_ "|%*s%-4s",                           \
8937         (int)((depth*2)), "",                                   \
8938         (funcname)                                              \
8939     );                                                          \
8940     RExC_lastnum=RExC_emit;                                     \
8941     RExC_lastparse=RExC_parse;                                  \
8942 })
8943
8944
8945
8946 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8947     DEBUG_PARSE_MSG((funcname));                            \
8948     Perl_re_printf( aTHX_ "%4s","\n");                                  \
8949 })
8950 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
8951     DEBUG_PARSE_MSG((funcname));                            \
8952     Perl_re_printf( aTHX_ fmt "\n",args);                               \
8953 })
8954
8955 /* This section of code defines the inversion list object and its methods.  The
8956  * interfaces are highly subject to change, so as much as possible is static to
8957  * this file.  An inversion list is here implemented as a malloc'd C UV array
8958  * as an SVt_INVLIST scalar.
8959  *
8960  * An inversion list for Unicode is an array of code points, sorted by ordinal
8961  * number.  Each element gives the code point that begins a range that extends
8962  * up-to but not including the code point given by the next element.  The final
8963  * element gives the first code point of a range that extends to the platform's
8964  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
8965  * ...) give ranges whose code points are all in the inversion list.  We say
8966  * that those ranges are in the set.  The odd-numbered elements give ranges
8967  * whose code points are not in the inversion list, and hence not in the set.
8968  * Thus, element [0] is the first code point in the list.  Element [1]
8969  * is the first code point beyond that not in the list; and element [2] is the
8970  * first code point beyond that that is in the list.  In other words, the first
8971  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8972  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
8973  * all code points in that range are not in the inversion list.  The third
8974  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8975  * list, and so forth.  Thus every element whose index is divisible by two
8976  * gives the beginning of a range that is in the list, and every element whose
8977  * index is not divisible by two gives the beginning of a range not in the
8978  * list.  If the final element's index is divisible by two, the inversion list
8979  * extends to the platform's infinity; otherwise the highest code point in the
8980  * inversion list is the contents of that element minus 1.
8981  *
8982  * A range that contains just a single code point N will look like
8983  *  invlist[i]   == N
8984  *  invlist[i+1] == N+1
8985  *
8986  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8987  * impossible to represent, so element [i+1] is omitted.  The single element
8988  * inversion list
8989  *  invlist[0] == UV_MAX
8990  * contains just UV_MAX, but is interpreted as matching to infinity.
8991  *
8992  * Taking the complement (inverting) an inversion list is quite simple, if the
8993  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8994  * This implementation reserves an element at the beginning of each inversion
8995  * list to always contain 0; there is an additional flag in the header which
8996  * indicates if the list begins at the 0, or is offset to begin at the next
8997  * element.  This means that the inversion list can be inverted without any
8998  * copying; just flip the flag.
8999  *
9000  * More about inversion lists can be found in "Unicode Demystified"
9001  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9002  *
9003  * The inversion list data structure is currently implemented as an SV pointing
9004  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
9005  * array of UV whose memory management is automatically handled by the existing
9006  * facilities for SV's.
9007  *
9008  * Some of the methods should always be private to the implementation, and some
9009  * should eventually be made public */
9010
9011 /* The header definitions are in F<invlist_inline.h> */
9012
9013 #ifndef PERL_IN_XSUB_RE
9014
9015 PERL_STATIC_INLINE UV*
9016 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9017 {
9018     /* Returns a pointer to the first element in the inversion list's array.
9019      * This is called upon initialization of an inversion list.  Where the
9020      * array begins depends on whether the list has the code point U+0000 in it
9021      * or not.  The other parameter tells it whether the code that follows this
9022      * call is about to put a 0 in the inversion list or not.  The first
9023      * element is either the element reserved for 0, if TRUE, or the element
9024      * after it, if FALSE */
9025
9026     bool* offset = get_invlist_offset_addr(invlist);
9027     UV* zero_addr = (UV *) SvPVX(invlist);
9028
9029     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9030
9031     /* Must be empty */
9032     assert(! _invlist_len(invlist));
9033
9034     *zero_addr = 0;
9035
9036     /* 1^1 = 0; 1^0 = 1 */
9037     *offset = 1 ^ will_have_0;
9038     return zero_addr + *offset;
9039 }
9040
9041 STATIC void
9042 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9043 {
9044     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
9045      * steals the list from 'src', so 'src' is made to have a NULL list.  This
9046      * is similar to what SvSetMagicSV() would do, if it were implemented on
9047      * inversion lists, though this routine avoids a copy */
9048
9049     const UV src_len          = _invlist_len(src);
9050     const bool src_offset     = *get_invlist_offset_addr(src);
9051     const STRLEN src_byte_len = SvLEN(src);
9052     char * array              = SvPVX(src);
9053
9054     const int oldtainted = TAINT_get;
9055
9056     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9057
9058     assert(is_invlist(src));
9059     assert(is_invlist(dest));
9060     assert(! invlist_is_iterating(src));
9061     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9062
9063     /* Make sure it ends in the right place with a NUL, as our inversion list
9064      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9065      * asserts it */
9066     array[src_byte_len - 1] = '\0';
9067
9068     TAINT_NOT;      /* Otherwise it breaks */
9069     sv_usepvn_flags(dest,
9070                     (char *) array,
9071                     src_byte_len - 1,
9072
9073                     /* This flag is documented to cause a copy to be avoided */
9074                     SV_HAS_TRAILING_NUL);
9075     TAINT_set(oldtainted);
9076     SvPV_set(src, 0);
9077     SvLEN_set(src, 0);
9078     SvCUR_set(src, 0);
9079
9080     /* Finish up copying over the other fields in an inversion list */
9081     *get_invlist_offset_addr(dest) = src_offset;
9082     invlist_set_len(dest, src_len, src_offset);
9083     *get_invlist_previous_index_addr(dest) = 0;
9084     invlist_iterfinish(dest);
9085 }
9086
9087 PERL_STATIC_INLINE IV*
9088 S_get_invlist_previous_index_addr(SV* invlist)
9089 {
9090     /* Return the address of the IV that is reserved to hold the cached index
9091      * */
9092     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9093
9094     assert(is_invlist(invlist));
9095
9096     return &(((XINVLIST*) SvANY(invlist))->prev_index);
9097 }
9098
9099 PERL_STATIC_INLINE IV
9100 S_invlist_previous_index(SV* const invlist)
9101 {
9102     /* Returns cached index of previous search */
9103
9104     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9105
9106     return *get_invlist_previous_index_addr(invlist);
9107 }
9108
9109 PERL_STATIC_INLINE void
9110 S_invlist_set_previous_index(SV* const invlist, const IV index)
9111 {
9112     /* Caches <index> for later retrieval */
9113
9114     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9115
9116     assert(index == 0 || index < (int) _invlist_len(invlist));
9117
9118     *get_invlist_previous_index_addr(invlist) = index;
9119 }
9120
9121 PERL_STATIC_INLINE void
9122 S_invlist_trim(SV* invlist)
9123 {
9124     /* Free the not currently-being-used space in an inversion list */
9125
9126     /* But don't free up the space needed for the 0 UV that is always at the
9127      * beginning of the list, nor the trailing NUL */
9128     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9129
9130     PERL_ARGS_ASSERT_INVLIST_TRIM;
9131
9132     assert(is_invlist(invlist));
9133
9134     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9135 }
9136
9137 PERL_STATIC_INLINE void
9138 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
9139 {
9140     PERL_ARGS_ASSERT_INVLIST_CLEAR;
9141
9142     assert(is_invlist(invlist));
9143
9144     invlist_set_len(invlist, 0, 0);
9145     invlist_trim(invlist);
9146 }
9147
9148 #endif /* ifndef PERL_IN_XSUB_RE */
9149
9150 PERL_STATIC_INLINE bool
9151 S_invlist_is_iterating(SV* const invlist)
9152 {
9153     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9154
9155     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9156 }
9157
9158 #ifndef PERL_IN_XSUB_RE
9159
9160 PERL_STATIC_INLINE UV
9161 S_invlist_max(SV* const invlist)
9162 {
9163     /* Returns the maximum number of elements storable in the inversion list's
9164      * array, without having to realloc() */
9165
9166     PERL_ARGS_ASSERT_INVLIST_MAX;
9167
9168     assert(is_invlist(invlist));
9169
9170     /* Assumes worst case, in which the 0 element is not counted in the
9171      * inversion list, so subtracts 1 for that */
9172     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
9173            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9174            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9175 }
9176
9177 STATIC void
9178 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9179 {
9180     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9181
9182     /* First 1 is in case the zero element isn't in the list; second 1 is for
9183      * trailing NUL */
9184     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9185     invlist_set_len(invlist, 0, 0);
9186
9187     /* Force iterinit() to be used to get iteration to work */
9188     invlist_iterfinish(invlist);
9189
9190     *get_invlist_previous_index_addr(invlist) = 0;
9191     SvPOK_on(invlist);  /* This allows B to extract the PV */
9192 }
9193
9194 SV*
9195 Perl__new_invlist(pTHX_ IV initial_size)
9196 {
9197
9198     /* Return a pointer to a newly constructed inversion list, with enough
9199      * space to store 'initial_size' elements.  If that number is negative, a
9200      * system default is used instead */
9201
9202     SV* new_list;
9203
9204     if (initial_size < 0) {
9205         initial_size = 10;
9206     }
9207
9208     new_list = newSV_type(SVt_INVLIST);
9209     initialize_invlist_guts(new_list, initial_size);
9210
9211     return new_list;
9212 }
9213
9214 SV*
9215 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9216 {
9217     /* Return a pointer to a newly constructed inversion list, initialized to
9218      * point to <list>, which has to be in the exact correct inversion list
9219      * form, including internal fields.  Thus this is a dangerous routine that
9220      * should not be used in the wrong hands.  The passed in 'list' contains
9221      * several header fields at the beginning that are not part of the
9222      * inversion list body proper */
9223
9224     const STRLEN length = (STRLEN) list[0];
9225     const UV version_id =          list[1];
9226     const bool offset   =    cBOOL(list[2]);
9227 #define HEADER_LENGTH 3
9228     /* If any of the above changes in any way, you must change HEADER_LENGTH
9229      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9230      *      perl -E 'say int(rand 2**31-1)'
9231      */
9232 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9233                                         data structure type, so that one being
9234                                         passed in can be validated to be an
9235                                         inversion list of the correct vintage.
9236                                        */
9237
9238     SV* invlist = newSV_type(SVt_INVLIST);
9239
9240     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9241
9242     if (version_id != INVLIST_VERSION_ID) {
9243         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9244     }
9245
9246     /* The generated array passed in includes header elements that aren't part
9247      * of the list proper, so start it just after them */
9248     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9249
9250     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
9251                                shouldn't touch it */
9252
9253     *(get_invlist_offset_addr(invlist)) = offset;
9254
9255     /* The 'length' passed to us is the physical number of elements in the
9256      * inversion list.  But if there is an offset the logical number is one
9257      * less than that */
9258     invlist_set_len(invlist, length  - offset, offset);
9259
9260     invlist_set_previous_index(invlist, 0);
9261
9262     /* Initialize the iteration pointer. */
9263     invlist_iterfinish(invlist);
9264
9265     SvREADONLY_on(invlist);
9266     SvPOK_on(invlist);
9267
9268     return invlist;
9269 }
9270
9271 STATIC void
9272 S__append_range_to_invlist(pTHX_ SV* const invlist,
9273                                  const UV start, const UV end)
9274 {
9275    /* Subject to change or removal.  Append the range from 'start' to 'end' at
9276     * the end of the inversion list.  The range must be above any existing
9277     * ones. */
9278
9279     UV* array;
9280     UV max = invlist_max(invlist);
9281     UV len = _invlist_len(invlist);
9282     bool offset;
9283
9284     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9285
9286     if (len == 0) { /* Empty lists must be initialized */
9287         offset = start != 0;
9288         array = _invlist_array_init(invlist, ! offset);
9289     }
9290     else {
9291         /* Here, the existing list is non-empty. The current max entry in the
9292          * list is generally the first value not in the set, except when the
9293          * set extends to the end of permissible values, in which case it is
9294          * the first entry in that final set, and so this call is an attempt to
9295          * append out-of-order */
9296
9297         UV final_element = len - 1;
9298         array = invlist_array(invlist);
9299         if (   array[final_element] > start
9300             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9301         {
9302             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9303                      array[final_element], start,
9304                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9305         }
9306
9307         /* Here, it is a legal append.  If the new range begins 1 above the end
9308          * of the range below it, it is extending the range below it, so the
9309          * new first value not in the set is one greater than the newly
9310          * extended range.  */
9311         offset = *get_invlist_offset_addr(invlist);
9312         if (array[final_element] == start) {
9313             if (end != UV_MAX) {
9314                 array[final_element] = end + 1;
9315             }
9316             else {
9317                 /* But if the end is the maximum representable on the machine,
9318                  * assume that infinity was actually what was meant.  Just let
9319                  * the range that this would extend to have no end */
9320                 invlist_set_len(invlist, len - 1, offset);
9321             }
9322             return;
9323         }
9324     }
9325
9326     /* Here the new range doesn't extend any existing set.  Add it */
9327
9328     len += 2;   /* Includes an element each for the start and end of range */
9329
9330     /* If wll overflow the existing space, extend, which may cause the array to
9331      * be moved */
9332     if (max < len) {
9333         invlist_extend(invlist, len);
9334
9335         /* Have to set len here to avoid assert failure in invlist_array() */
9336         invlist_set_len(invlist, len, offset);
9337
9338         array = invlist_array(invlist);
9339     }
9340     else {
9341         invlist_set_len(invlist, len, offset);
9342     }
9343
9344     /* The next item on the list starts the range, the one after that is
9345      * one past the new range.  */
9346     array[len - 2] = start;
9347     if (end != UV_MAX) {
9348         array[len - 1] = end + 1;
9349     }
9350     else {
9351         /* But if the end is the maximum representable on the machine, just let
9352          * the range have no end */
9353         invlist_set_len(invlist, len - 1, offset);
9354     }
9355 }
9356
9357 SSize_t
9358 Perl__invlist_search(SV* const invlist, const UV cp)
9359 {
9360     /* Searches the inversion list for the entry that contains the input code
9361      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
9362      * return value is the index into the list's array of the range that
9363      * contains <cp>, that is, 'i' such that
9364      *  array[i] <= cp < array[i+1]
9365      */
9366
9367     IV low = 0;
9368     IV mid;
9369     IV high = _invlist_len(invlist);
9370     const IV highest_element = high - 1;
9371     const UV* array;
9372
9373     PERL_ARGS_ASSERT__INVLIST_SEARCH;
9374
9375     /* If list is empty, return failure. */
9376     if (high == 0) {
9377         return -1;
9378     }
9379
9380     /* (We can't get the array unless we know the list is non-empty) */
9381     array = invlist_array(invlist);
9382
9383     mid = invlist_previous_index(invlist);
9384     assert(mid >=0);
9385     if (mid > highest_element) {
9386         mid = highest_element;
9387     }
9388
9389     /* <mid> contains the cache of the result of the previous call to this
9390      * function (0 the first time).  See if this call is for the same result,
9391      * or if it is for mid-1.  This is under the theory that calls to this
9392      * function will often be for related code points that are near each other.
9393      * And benchmarks show that caching gives better results.  We also test
9394      * here if the code point is within the bounds of the list.  These tests
9395      * replace others that would have had to be made anyway to make sure that
9396      * the array bounds were not exceeded, and these give us extra information
9397      * at the same time */
9398     if (cp >= array[mid]) {
9399         if (cp >= array[highest_element]) {
9400             return highest_element;
9401         }
9402
9403         /* Here, array[mid] <= cp < array[highest_element].  This means that
9404          * the final element is not the answer, so can exclude it; it also
9405          * means that <mid> is not the final element, so can refer to 'mid + 1'
9406          * safely */
9407         if (cp < array[mid + 1]) {
9408             return mid;
9409         }
9410         high--;
9411         low = mid + 1;
9412     }
9413     else { /* cp < aray[mid] */
9414         if (cp < array[0]) { /* Fail if outside the array */
9415             return -1;
9416         }
9417         high = mid;
9418         if (cp >= array[mid - 1]) {
9419             goto found_entry;
9420         }
9421     }
9422
9423     /* Binary search.  What we are looking for is <i> such that
9424      *  array[i] <= cp < array[i+1]
9425      * The loop below converges on the i+1.  Note that there may not be an
9426      * (i+1)th element in the array, and things work nonetheless */
9427     while (low < high) {
9428         mid = (low + high) / 2;
9429         assert(mid <= highest_element);
9430         if (array[mid] <= cp) { /* cp >= array[mid] */
9431             low = mid + 1;
9432
9433             /* We could do this extra test to exit the loop early.
9434             if (cp < array[low]) {
9435                 return mid;
9436             }
9437             */
9438         }
9439         else { /* cp < array[mid] */
9440             high = mid;
9441         }
9442     }
9443
9444   found_entry:
9445     high--;
9446     invlist_set_previous_index(invlist, high);
9447     return high;
9448 }
9449
9450 void
9451 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9452                                          const bool complement_b, SV** output)
9453 {
9454     /* Take the union of two inversion lists and point '*output' to it.  On
9455      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9456      * even 'a' or 'b').  If to an inversion list, the contents of the original
9457      * list will be replaced by the union.  The first list, 'a', may be
9458      * NULL, in which case a copy of the second list is placed in '*output'.
9459      * If 'complement_b' is TRUE, the union is taken of the complement
9460      * (inversion) of 'b' instead of b itself.
9461      *
9462      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9463      * Richard Gillam, published by Addison-Wesley, and explained at some
9464      * length there.  The preface says to incorporate its examples into your
9465      * code at your own risk.
9466      *
9467      * The algorithm is like a merge sort. */
9468
9469     const UV* array_a;    /* a's array */
9470     const UV* array_b;
9471     UV len_a;       /* length of a's array */
9472     UV len_b;
9473
9474     SV* u;                      /* the resulting union */
9475     UV* array_u;
9476     UV len_u = 0;
9477
9478     UV i_a = 0;             /* current index into a's array */
9479     UV i_b = 0;
9480     UV i_u = 0;
9481
9482     /* running count, as explained in the algorithm source book; items are
9483      * stopped accumulating and are output when the count changes to/from 0.
9484      * The count is incremented when we start a range that's in an input's set,
9485      * and decremented when we start a range that's not in a set.  So this
9486      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
9487      * and hence nothing goes into the union; 1, just one of the inputs is in
9488      * its set (and its current range gets added to the union); and 2 when both
9489      * inputs are in their sets.  */
9490     UV count = 0;
9491
9492     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9493     assert(a != b);
9494     assert(*output == NULL || is_invlist(*output));
9495
9496     len_b = _invlist_len(b);
9497     if (len_b == 0) {
9498
9499         /* Here, 'b' is empty, hence it's complement is all possible code
9500          * points.  So if the union includes the complement of 'b', it includes
9501          * everything, and we need not even look at 'a'.  It's easiest to
9502          * create a new inversion list that matches everything.  */
9503         if (complement_b) {
9504             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9505
9506             if (*output == NULL) { /* If the output didn't exist, just point it
9507                                       at the new list */
9508                 *output = everything;
9509             }
9510             else { /* Otherwise, replace its contents with the new list */
9511                 invlist_replace_list_destroys_src(*output, everything);
9512                 SvREFCNT_dec_NN(everything);
9513             }
9514
9515             return;
9516         }
9517
9518         /* Here, we don't want the complement of 'b', and since 'b' is empty,
9519          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
9520          * output will be empty */
9521
9522         if (a == NULL || _invlist_len(a) == 0) {
9523             if (*output == NULL) {
9524                 *output = _new_invlist(0);
9525             }
9526             else {
9527                 invlist_clear(*output);
9528             }
9529             return;
9530         }
9531
9532         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9533          * union.  We can just return a copy of 'a' if '*output' doesn't point
9534          * to an existing list */
9535         if (*output == NULL) {
9536             *output = invlist_clone(a, NULL);
9537             return;
9538         }
9539
9540         /* If the output is to overwrite 'a', we have a no-op, as it's
9541          * already in 'a' */
9542         if (*output == a) {
9543             return;
9544         }
9545
9546         /* Here, '*output' is to be overwritten by 'a' */
9547         u = invlist_clone(a, NULL);
9548         invlist_replace_list_destroys_src(*output, u);
9549         SvREFCNT_dec_NN(u);
9550
9551         return;
9552     }
9553
9554     /* Here 'b' is not empty.  See about 'a' */
9555
9556     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9557
9558         /* Here, 'a' is empty (and b is not).  That means the union will come
9559          * entirely from 'b'.  If '*output' is NULL, we can directly return a
9560          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
9561          * the clone */
9562
9563         SV ** dest = (*output == NULL) ? output : &u;
9564         *dest = invlist_clone(b, NULL);
9565         if (complement_b) {
9566             _invlist_invert(*dest);
9567         }
9568
9569         if (dest == &u) {
9570             invlist_replace_list_destroys_src(*output, u);
9571             SvREFCNT_dec_NN(u);
9572         }
9573
9574         return;
9575     }
9576
9577     /* Here both lists exist and are non-empty */
9578     array_a = invlist_array(a);
9579     array_b = invlist_array(b);
9580
9581     /* If are to take the union of 'a' with the complement of b, set it
9582      * up so are looking at b's complement. */
9583     if (complement_b) {
9584
9585         /* To complement, we invert: if the first element is 0, remove it.  To
9586          * do this, we just pretend the array starts one later */
9587         if (array_b[0] == 0) {
9588             array_b++;
9589             len_b--;
9590         }
9591         else {
9592
9593             /* But if the first element is not zero, we pretend the list starts
9594              * at the 0 that is always stored immediately before the array. */
9595             array_b--;
9596             len_b++;
9597         }
9598     }
9599
9600     /* Size the union for the worst case: that the sets are completely
9601      * disjoint */
9602     u = _new_invlist(len_a + len_b);
9603
9604     /* Will contain U+0000 if either component does */
9605     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
9606                                       || (len_b > 0 && array_b[0] == 0));
9607
9608     /* Go through each input list item by item, stopping when have exhausted
9609      * one of them */
9610     while (i_a < len_a && i_b < len_b) {
9611         UV cp;      /* The element to potentially add to the union's array */
9612         bool cp_in_set;   /* is it in the the input list's set or not */
9613
9614         /* We need to take one or the other of the two inputs for the union.
9615          * Since we are merging two sorted lists, we take the smaller of the
9616          * next items.  In case of a tie, we take first the one that is in its
9617          * set.  If we first took the one not in its set, it would decrement
9618          * the count, possibly to 0 which would cause it to be output as ending
9619          * the range, and the next time through we would take the same number,
9620          * and output it again as beginning the next range.  By doing it the
9621          * opposite way, there is no possibility that the count will be
9622          * momentarily decremented to 0, and thus the two adjoining ranges will
9623          * be seamlessly merged.  (In a tie and both are in the set or both not
9624          * in the set, it doesn't matter which we take first.) */
9625         if (       array_a[i_a] < array_b[i_b]
9626             || (   array_a[i_a] == array_b[i_b]
9627                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9628         {
9629             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9630             cp = array_a[i_a++];
9631         }
9632         else {
9633             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9634             cp = array_b[i_b++];
9635         }
9636
9637         /* Here, have chosen which of the two inputs to look at.  Only output
9638          * if the running count changes to/from 0, which marks the
9639          * beginning/end of a range that's in the set */
9640         if (cp_in_set) {
9641             if (count == 0) {
9642                 array_u[i_u++] = cp;
9643             }
9644             count++;
9645         }
9646         else {
9647             count--;
9648             if (count == 0) {
9649                 array_u[i_u++] = cp;
9650             }
9651         }
9652     }
9653
9654
9655     /* The loop above increments the index into exactly one of the input lists
9656      * each iteration, and ends when either index gets to its list end.  That
9657      * means the other index is lower than its end, and so something is
9658      * remaining in that one.  We decrement 'count', as explained below, if
9659      * that list is in its set.  (i_a and i_b each currently index the element
9660      * beyond the one we care about.) */
9661     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9662         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9663     {
9664         count--;
9665     }
9666
9667     /* Above we decremented 'count' if the list that had unexamined elements in
9668      * it was in its set.  This has made it so that 'count' being non-zero
9669      * means there isn't anything left to output; and 'count' equal to 0 means
9670      * that what is left to output is precisely that which is left in the
9671      * non-exhausted input list.
9672      *
9673      * To see why, note first that the exhausted input obviously has nothing
9674      * left to add to the union.  If it was in its set at its end, that means
9675      * the set extends from here to the platform's infinity, and hence so does
9676      * the union and the non-exhausted set is irrelevant.  The exhausted set
9677      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
9678      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9679      * 'count' remains at 1.  This is consistent with the decremented 'count'
9680      * != 0 meaning there's nothing left to add to the union.
9681      *
9682      * But if the exhausted input wasn't in its set, it contributed 0 to
9683      * 'count', and the rest of the union will be whatever the other input is.
9684      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9685      * otherwise it gets decremented to 0.  This is consistent with 'count'
9686      * == 0 meaning the remainder of the union is whatever is left in the
9687      * non-exhausted list. */
9688     if (count != 0) {
9689         len_u = i_u;
9690     }
9691     else {
9692         IV copy_count = len_a - i_a;
9693         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
9694             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9695         }
9696         else { /* The non-exhausted input is b */
9697             copy_count = len_b - i_b;
9698             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9699         }
9700         len_u = i_u + copy_count;
9701     }
9702
9703     /* Set the result to the final length, which can change the pointer to
9704      * array_u, so re-find it.  (Note that it is unlikely that this will
9705      * change, as we are shrinking the space, not enlarging it) */
9706     if (len_u != _invlist_len(u)) {
9707         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9708         invlist_trim(u);
9709         array_u = invlist_array(u);
9710     }
9711
9712     if (*output == NULL) {  /* Simply return the new inversion list */
9713         *output = u;
9714     }
9715     else {
9716         /* Otherwise, overwrite the inversion list that was in '*output'.  We
9717          * could instead free '*output', and then set it to 'u', but experience
9718          * has shown [perl #127392] that if the input is a mortal, we can get a
9719          * huge build-up of these during regex compilation before they get
9720          * freed. */
9721         invlist_replace_list_destroys_src(*output, u);
9722         SvREFCNT_dec_NN(u);
9723     }
9724
9725     return;
9726 }
9727
9728 void
9729 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9730                                                const bool complement_b, SV** i)
9731 {
9732     /* Take the intersection of two inversion lists and point '*i' to it.  On
9733      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9734      * even 'a' or 'b').  If to an inversion list, the contents of the original
9735      * list will be replaced by the intersection.  The first list, 'a', may be
9736      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
9737      * TRUE, the result will be the intersection of 'a' and the complement (or
9738      * inversion) of 'b' instead of 'b' directly.
9739      *
9740      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9741      * Richard Gillam, published by Addison-Wesley, and explained at some
9742      * length there.  The preface says to incorporate its examples into your
9743      * code at your own risk.  In fact, it had bugs
9744      *
9745      * The algorithm is like a merge sort, and is essentially the same as the
9746      * union above
9747      */
9748
9749     const UV* array_a;          /* a's array */
9750     const UV* array_b;
9751     UV len_a;   /* length of a's array */
9752     UV len_b;
9753
9754     SV* r;                   /* the resulting intersection */
9755     UV* array_r;
9756     UV len_r = 0;
9757
9758     UV i_a = 0;             /* current index into a's array */
9759     UV i_b = 0;
9760     UV i_r = 0;
9761
9762     /* running count of how many of the two inputs are postitioned at ranges
9763      * that are in their sets.  As explained in the algorithm source book,
9764      * items are stopped accumulating and are output when the count changes
9765      * to/from 2.  The count is incremented when we start a range that's in an
9766      * input's set, and decremented when we start a range that's not in a set.
9767      * Only when it is 2 are we in the intersection. */
9768     UV count = 0;
9769
9770     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9771     assert(a != b);
9772     assert(*i == NULL || is_invlist(*i));
9773
9774     /* Special case if either one is empty */
9775     len_a = (a == NULL) ? 0 : _invlist_len(a);
9776     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9777         if (len_a != 0 && complement_b) {
9778
9779             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9780              * must be empty.  Here, also we are using 'b's complement, which
9781              * hence must be every possible code point.  Thus the intersection
9782              * is simply 'a'. */
9783
9784             if (*i == a) {  /* No-op */
9785                 return;
9786             }
9787
9788             if (*i == NULL) {
9789                 *i = invlist_clone(a, NULL);
9790                 return;
9791             }
9792
9793             r = invlist_clone(a, NULL);
9794             invlist_replace_list_destroys_src(*i, r);
9795             SvREFCNT_dec_NN(r);
9796             return;
9797         }
9798
9799         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9800          * intersection must be empty */
9801         if (*i == NULL) {
9802             *i = _new_invlist(0);
9803             return;
9804         }
9805
9806         invlist_clear(*i);
9807         return;
9808     }
9809
9810     /* Here both lists exist and are non-empty */
9811     array_a = invlist_array(a);
9812     array_b = invlist_array(b);
9813
9814     /* If are to take the intersection of 'a' with the complement of b, set it
9815      * up so are looking at b's complement. */
9816     if (complement_b) {
9817
9818         /* To complement, we invert: if the first element is 0, remove it.  To
9819          * do this, we just pretend the array starts one later */
9820         if (array_b[0] == 0) {
9821             array_b++;
9822             len_b--;
9823         }
9824         else {
9825
9826             /* But if the first element is not zero, we pretend the list starts
9827              * at the 0 that is always stored immediately before the array. */
9828             array_b--;
9829             len_b++;
9830         }
9831     }
9832
9833     /* Size the intersection for the worst case: that the intersection ends up
9834      * fragmenting everything to be completely disjoint */
9835     r= _new_invlist(len_a + len_b);
9836
9837     /* Will contain U+0000 iff both components do */
9838     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
9839                                      && len_b > 0 && array_b[0] == 0);
9840
9841     /* Go through each list item by item, stopping when have exhausted one of
9842      * them */
9843     while (i_a < len_a && i_b < len_b) {
9844         UV cp;      /* The element to potentially add to the intersection's
9845                        array */
9846         bool cp_in_set; /* Is it in the input list's set or not */
9847
9848         /* We need to take one or the other of the two inputs for the
9849          * intersection.  Since we are merging two sorted lists, we take the
9850          * smaller of the next items.  In case of a tie, we take first the one
9851          * that is not in its set (a difference from the union algorithm).  If
9852          * we first took the one in its set, it would increment the count,
9853          * possibly to 2 which would cause it to be output as starting a range
9854          * in the intersection, and the next time through we would take that
9855          * same number, and output it again as ending the set.  By doing the
9856          * opposite of this, there is no possibility that the count will be
9857          * momentarily incremented to 2.  (In a tie and both are in the set or
9858          * both not in the set, it doesn't matter which we take first.) */
9859         if (       array_a[i_a] < array_b[i_b]
9860             || (   array_a[i_a] == array_b[i_b]
9861                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9862         {
9863             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9864             cp = array_a[i_a++];
9865         }
9866         else {
9867             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9868             cp= array_b[i_b++];
9869         }
9870
9871         /* Here, have chosen which of the two inputs to look at.  Only output
9872          * if the running count changes to/from 2, which marks the
9873          * beginning/end of a range that's in the intersection */
9874         if (cp_in_set) {
9875             count++;
9876             if (count == 2) {
9877                 array_r[i_r++] = cp;
9878             }
9879         }
9880         else {
9881             if (count == 2) {
9882                 array_r[i_r++] = cp;
9883             }
9884             count--;
9885         }
9886
9887     }
9888
9889     /* The loop above increments the index into exactly one of the input lists
9890      * each iteration, and ends when either index gets to its list end.  That
9891      * means the other index is lower than its end, and so something is
9892      * remaining in that one.  We increment 'count', as explained below, if the
9893      * exhausted list was in its set.  (i_a and i_b each currently index the
9894      * element beyond the one we care about.) */
9895     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9896         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9897     {
9898         count++;
9899     }
9900
9901     /* Above we incremented 'count' if the exhausted list was in its set.  This
9902      * has made it so that 'count' being below 2 means there is nothing left to
9903      * output; otheriwse what's left to add to the intersection is precisely
9904      * that which is left in the non-exhausted input list.
9905      *
9906      * To see why, note first that the exhausted input obviously has nothing
9907      * left to affect the intersection.  If it was in its set at its end, that
9908      * means the set extends from here to the platform's infinity, and hence
9909      * anything in the non-exhausted's list will be in the intersection, and
9910      * anything not in it won't be.  Hence, the rest of the intersection is
9911      * precisely what's in the non-exhausted list  The exhausted set also
9912      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
9913      * it means 'count' is now at least 2.  This is consistent with the
9914      * incremented 'count' being >= 2 means to add the non-exhausted list to
9915      * the intersection.
9916      *
9917      * But if the exhausted input wasn't in its set, it contributed 0 to
9918      * 'count', and the intersection can't include anything further; the
9919      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
9920      * incremented.  This is consistent with 'count' being < 2 meaning nothing
9921      * further to add to the intersection. */
9922     if (count < 2) { /* Nothing left to put in the intersection. */
9923         len_r = i_r;
9924     }
9925     else { /* copy the non-exhausted list, unchanged. */
9926         IV copy_count = len_a - i_a;
9927         if (copy_count > 0) {   /* a is the one with stuff left */
9928             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9929         }
9930         else {  /* b is the one with stuff left */
9931             copy_count = len_b - i_b;
9932             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9933         }
9934         len_r = i_r + copy_count;
9935     }
9936
9937     /* Set the result to the final length, which can change the pointer to
9938      * array_r, so re-find it.  (Note that it is unlikely that this will
9939      * change, as we are shrinking the space, not enlarging it) */
9940     if (len_r != _invlist_len(r)) {
9941         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9942         invlist_trim(r);
9943         array_r = invlist_array(r);
9944     }
9945
9946     if (*i == NULL) { /* Simply return the calculated intersection */
9947         *i = r;
9948     }
9949     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
9950               instead free '*i', and then set it to 'r', but experience has
9951               shown [perl #127392] that if the input is a mortal, we can get a
9952               huge build-up of these during regex compilation before they get
9953               freed. */
9954         if (len_r) {
9955             invlist_replace_list_destroys_src(*i, r);
9956         }
9957         else {
9958             invlist_clear(*i);
9959         }
9960         SvREFCNT_dec_NN(r);
9961     }
9962
9963     return;
9964 }
9965
9966 SV*
9967 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9968 {
9969     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9970      * set.  A pointer to the inversion list is returned.  This may actually be
9971      * a new list, in which case the passed in one has been destroyed.  The
9972      * passed-in inversion list can be NULL, in which case a new one is created
9973      * with just the one range in it.  The new list is not necessarily
9974      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
9975      * result of this function.  The gain would not be large, and in many
9976      * cases, this is called multiple times on a single inversion list, so
9977      * anything freed may almost immediately be needed again.
9978      *
9979      * This used to mostly call the 'union' routine, but that is much more
9980      * heavyweight than really needed for a single range addition */
9981
9982     UV* array;              /* The array implementing the inversion list */
9983     UV len;                 /* How many elements in 'array' */
9984     SSize_t i_s;            /* index into the invlist array where 'start'
9985                                should go */
9986     SSize_t i_e = 0;        /* And the index where 'end' should go */
9987     UV cur_highest;         /* The highest code point in the inversion list
9988                                upon entry to this function */
9989
9990     /* This range becomes the whole inversion list if none already existed */
9991     if (invlist == NULL) {
9992         invlist = _new_invlist(2);
9993         _append_range_to_invlist(invlist, start, end);
9994         return invlist;
9995     }
9996
9997     /* Likewise, if the inversion list is currently empty */
9998     len = _invlist_len(invlist);
9999     if (len == 0) {
10000         _append_range_to_invlist(invlist, start, end);
10001         return invlist;
10002     }
10003
10004     /* Starting here, we have to know the internals of the list */
10005     array = invlist_array(invlist);
10006
10007     /* If the new range ends higher than the current highest ... */
10008     cur_highest = invlist_highest(invlist);
10009     if (end > cur_highest) {
10010
10011         /* If the whole range is higher, we can just append it */
10012         if (start > cur_highest) {
10013             _append_range_to_invlist(invlist, start, end);
10014             return invlist;
10015         }
10016
10017         /* Otherwise, add the portion that is higher ... */
10018         _append_range_to_invlist(invlist, cur_highest + 1, end);
10019
10020         /* ... and continue on below to handle the rest.  As a result of the
10021          * above append, we know that the index of the end of the range is the
10022          * final even numbered one of the array.  Recall that the final element
10023          * always starts a range that extends to infinity.  If that range is in
10024          * the set (meaning the set goes from here to infinity), it will be an
10025          * even index, but if it isn't in the set, it's odd, and the final
10026          * range in the set is one less, which is even. */
10027         if (end == UV_MAX) {
10028             i_e = len;
10029         }
10030         else {
10031             i_e = len - 2;
10032         }
10033     }
10034
10035     /* We have dealt with appending, now see about prepending.  If the new
10036      * range starts lower than the current lowest ... */
10037     if (start < array[0]) {
10038
10039         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10040          * Let the union code handle it, rather than having to know the
10041          * trickiness in two code places.  */
10042         if (UNLIKELY(start == 0)) {
10043             SV* range_invlist;
10044
10045             range_invlist = _new_invlist(2);
10046             _append_range_to_invlist(range_invlist, start, end);
10047
10048             _invlist_union(invlist, range_invlist, &invlist);
10049
10050             SvREFCNT_dec_NN(range_invlist);
10051
10052             return invlist;
10053         }
10054
10055         /* If the whole new range comes before the first entry, and doesn't
10056          * extend it, we have to insert it as an additional range */
10057         if (end < array[0] - 1) {
10058             i_s = i_e = -1;
10059             goto splice_in_new_range;
10060         }
10061
10062         /* Here the new range adjoins the existing first range, extending it
10063          * downwards. */
10064         array[0] = start;
10065
10066         /* And continue on below to handle the rest.  We know that the index of
10067          * the beginning of the range is the first one of the array */
10068         i_s = 0;
10069     }
10070     else { /* Not prepending any part of the new range to the existing list.
10071             * Find where in the list it should go.  This finds i_s, such that:
10072             *     invlist[i_s] <= start < array[i_s+1]
10073             */
10074         i_s = _invlist_search(invlist, start);
10075     }
10076
10077     /* At this point, any extending before the beginning of the inversion list
10078      * and/or after the end has been done.  This has made it so that, in the
10079      * code below, each endpoint of the new range is either in a range that is
10080      * in the set, or is in a gap between two ranges that are.  This means we
10081      * don't have to worry about exceeding the array bounds.
10082      *
10083      * Find where in the list the new range ends (but we can skip this if we
10084      * have already determined what it is, or if it will be the same as i_s,
10085      * which we already have computed) */
10086     if (i_e == 0) {
10087         i_e = (start == end)
10088               ? i_s
10089               : _invlist_search(invlist, end);
10090     }
10091
10092     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
10093      * is a range that goes to infinity there is no element at invlist[i_e+1],
10094      * so only the first relation holds. */
10095
10096     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10097
10098         /* Here, the ranges on either side of the beginning of the new range
10099          * are in the set, and this range starts in the gap between them.
10100          *
10101          * The new range extends the range above it downwards if the new range
10102          * ends at or above that range's start */
10103         const bool extends_the_range_above = (   end == UV_MAX
10104                                               || end + 1 >= array[i_s+1]);
10105
10106         /* The new range extends the range below it upwards if it begins just
10107          * after where that range ends */
10108         if (start == array[i_s]) {
10109
10110             /* If the new range fills the entire gap between the other ranges,
10111              * they will get merged together.  Other ranges may also get
10112              * merged, depending on how many of them the new range spans.  In
10113              * the general case, we do the merge later, just once, after we
10114              * figure out how many to merge.  But in the case where the new
10115              * range exactly spans just this one gap (possibly extending into
10116              * the one above), we do the merge here, and an early exit.  This
10117              * is done here to avoid having to special case later. */
10118             if (i_e - i_s <= 1) {
10119
10120                 /* If i_e - i_s == 1, it means that the new range terminates
10121                  * within the range above, and hence 'extends_the_range_above'
10122                  * must be true.  (If the range above it extends to infinity,
10123                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10124                  * will be 0, so no harm done.) */
10125                 if (extends_the_range_above) {
10126                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10127                     invlist_set_len(invlist,
10128                                     len - 2,
10129                                     *(get_invlist_offset_addr(invlist)));
10130                     return invlist;
10131                 }
10132
10133                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
10134                  * to the same range, and below we are about to decrement i_s
10135                  * */
10136                 i_e--;
10137             }
10138
10139             /* Here, the new range is adjacent to the one below.  (It may also
10140              * span beyond the range above, but that will get resolved later.)
10141              * Extend the range below to include this one. */
10142             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10143             i_s--;
10144             start = array[i_s];
10145         }
10146         else if (extends_the_range_above) {
10147
10148             /* Here the new range only extends the range above it, but not the
10149              * one below.  It merges with the one above.  Again, we keep i_e
10150              * and i_s in sync if they point to the same range */
10151             if (i_e == i_s) {
10152                 i_e++;
10153             }
10154             i_s++;
10155             array[i_s] = start;
10156         }
10157     }
10158
10159     /* Here, we've dealt with the new range start extending any adjoining
10160      * existing ranges.
10161      *
10162      * If the new range extends to infinity, it is now the final one,
10163      * regardless of what was there before */
10164     if (UNLIKELY(end == UV_MAX)) {
10165         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10166         return invlist;
10167     }
10168
10169     /* If i_e started as == i_s, it has also been dealt with,
10170      * and been updated to the new i_s, which will fail the following if */
10171     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10172
10173         /* Here, the ranges on either side of the end of the new range are in
10174          * the set, and this range ends in the gap between them.
10175          *
10176          * If this range is adjacent to (hence extends) the range above it, it
10177          * becomes part of that range; likewise if it extends the range below,
10178          * it becomes part of that range */
10179         if (end + 1 == array[i_e+1]) {
10180             i_e++;
10181             array[i_e] = start;
10182         }
10183         else if (start <= array[i_e]) {
10184             array[i_e] = end + 1;
10185             i_e--;
10186         }
10187     }
10188
10189     if (i_s == i_e) {
10190
10191         /* If the range fits entirely in an existing range (as possibly already
10192          * extended above), it doesn't add anything new */
10193         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10194             return invlist;
10195         }
10196
10197         /* Here, no part of the range is in the list.  Must add it.  It will
10198          * occupy 2 more slots */
10199       splice_in_new_range:
10200
10201         invlist_extend(invlist, len + 2);
10202         array = invlist_array(invlist);
10203         /* Move the rest of the array down two slots. Don't include any
10204          * trailing NUL */
10205         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10206
10207         /* Do the actual splice */
10208         array[i_e+1] = start;
10209         array[i_e+2] = end + 1;
10210         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10211         return invlist;
10212     }
10213
10214     /* Here the new range crossed the boundaries of a pre-existing range.  The
10215      * code above has adjusted things so that both ends are in ranges that are
10216      * in the set.  This means everything in between must also be in the set.
10217      * Just squash things together */
10218     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10219     invlist_set_len(invlist,
10220                     len - i_e + i_s,
10221                     *(get_invlist_offset_addr(invlist)));
10222
10223     return invlist;
10224 }
10225
10226 SV*
10227 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10228                                  UV** other_elements_ptr)
10229 {
10230     /* Create and return an inversion list whose contents are to be populated
10231      * by the caller.  The caller gives the number of elements (in 'size') and
10232      * the very first element ('element0').  This function will set
10233      * '*other_elements_ptr' to an array of UVs, where the remaining elements
10234      * are to be placed.
10235      *
10236      * Obviously there is some trust involved that the caller will properly
10237      * fill in the other elements of the array.
10238      *
10239      * (The first element needs to be passed in, as the underlying code does
10240      * things differently depending on whether it is zero or non-zero) */
10241
10242     SV* invlist = _new_invlist(size);
10243     bool offset;
10244
10245     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10246
10247     invlist = add_cp_to_invlist(invlist, element0);
10248     offset = *get_invlist_offset_addr(invlist);
10249
10250     invlist_set_len(invlist, size, offset);
10251     *other_elements_ptr = invlist_array(invlist) + 1;
10252     return invlist;
10253 }
10254
10255 #endif
10256
10257 #ifndef PERL_IN_XSUB_RE
10258 void
10259 Perl__invlist_invert(pTHX_ SV* const invlist)
10260 {
10261     /* Complement the input inversion list.  This adds a 0 if the list didn't
10262      * have a zero; removes it otherwise.  As described above, the data
10263      * structure is set up so that this is very efficient */
10264
10265     PERL_ARGS_ASSERT__INVLIST_INVERT;
10266
10267     assert(! invlist_is_iterating(invlist));
10268
10269     /* The inverse of matching nothing is matching everything */
10270     if (_invlist_len(invlist) == 0) {
10271         _append_range_to_invlist(invlist, 0, UV_MAX);
10272         return;
10273     }
10274
10275     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10276 }
10277
10278 SV*
10279 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10280 {
10281     /* Return a new inversion list that is a copy of the input one, which is
10282      * unchanged.  The new list will not be mortal even if the old one was. */
10283
10284     const STRLEN nominal_length = _invlist_len(invlist);
10285     const STRLEN physical_length = SvCUR(invlist);
10286     const bool offset = *(get_invlist_offset_addr(invlist));
10287
10288     PERL_ARGS_ASSERT_INVLIST_CLONE;
10289
10290     if (new_invlist == NULL) {
10291         new_invlist = _new_invlist(nominal_length);
10292     }
10293     else {
10294         sv_upgrade(new_invlist, SVt_INVLIST);
10295         initialize_invlist_guts(new_invlist, nominal_length);
10296     }
10297
10298     *(get_invlist_offset_addr(new_invlist)) = offset;
10299     invlist_set_len(new_invlist, nominal_length, offset);
10300     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10301
10302     return new_invlist;
10303 }
10304
10305 #endif
10306
10307 PERL_STATIC_INLINE UV
10308 S_invlist_lowest(SV* const invlist)
10309 {
10310     /* Returns the lowest code point that matches an inversion list.  This API
10311      * has an ambiguity, as it returns 0 under either the lowest is actually
10312      * 0, or if the list is empty.  If this distinction matters to you, check
10313      * for emptiness before calling this function */
10314
10315     UV len = _invlist_len(invlist);
10316     UV *array;
10317
10318     PERL_ARGS_ASSERT_INVLIST_LOWEST;
10319
10320     if (len == 0) {
10321         return 0;
10322     }
10323
10324     array = invlist_array(invlist);
10325
10326     return array[0];
10327 }
10328
10329 STATIC SV *
10330 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10331 {
10332     /* Get the contents of an inversion list into a string SV so that they can
10333      * be printed out.  If 'traditional_style' is TRUE, it uses the format
10334      * traditionally done for debug tracing; otherwise it uses a format
10335      * suitable for just copying to the output, with blanks between ranges and
10336      * a dash between range components */
10337
10338     UV start, end;
10339     SV* output;
10340     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10341     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10342
10343     if (traditional_style) {
10344         output = newSVpvs("\n");
10345     }
10346     else {
10347         output = newSVpvs("");
10348     }
10349
10350     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10351
10352     assert(! invlist_is_iterating(invlist));
10353
10354     invlist_iterinit(invlist);
10355     while (invlist_iternext(invlist, &start, &end)) {
10356         if (end == UV_MAX) {
10357             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10358                                           start, intra_range_delimiter,
10359                                                  inter_range_delimiter);
10360         }
10361         else if (end != start) {
10362             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10363                                           start,
10364                                                    intra_range_delimiter,
10365                                                   end, inter_range_delimiter);
10366         }
10367         else {
10368             Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10369                                           start, inter_range_delimiter);
10370         }
10371     }
10372
10373     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10374         SvCUR_set(output, SvCUR(output) - 1);
10375     }
10376
10377     return output;
10378 }
10379
10380 #ifndef PERL_IN_XSUB_RE
10381 void
10382 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10383                          const char * const indent, SV* const invlist)
10384 {
10385     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
10386      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
10387      * the string 'indent'.  The output looks like this:
10388          [0] 0x000A .. 0x000D
10389          [2] 0x0085
10390          [4] 0x2028 .. 0x2029
10391          [6] 0x3104 .. INFTY
10392      * This means that the first range of code points matched by the list are
10393      * 0xA through 0xD; the second range contains only the single code point
10394      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
10395      * are used to define each range (except if the final range extends to
10396      * infinity, only a single element is needed).  The array index of the
10397      * first element for the corresponding range is given in brackets. */
10398
10399     UV start, end;
10400     STRLEN count = 0;
10401
10402     PERL_ARGS_ASSERT__INVLIST_DUMP;
10403
10404     if (invlist_is_iterating(invlist)) {
10405         Perl_dump_indent(aTHX_ level, file,
10406              "%sCan't dump inversion list because is in middle of iterating\n",
10407              indent);
10408         return;
10409     }
10410
10411     invlist_iterinit(invlist);
10412     while (invlist_iternext(invlist, &start, &end)) {
10413         if (end == UV_MAX) {
10414             Perl_dump_indent(aTHX_ level, file,
10415                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10416                                    indent, (UV)count, start);
10417         }
10418         else if (end != start) {
10419             Perl_dump_indent(aTHX_ level, file,
10420                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10421                                 indent, (UV)count, start,         end);
10422         }
10423         else {
10424             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10425                                             indent, (UV)count, start);
10426         }
10427         count += 2;
10428     }
10429 }
10430
10431 #endif
10432
10433 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10434 bool
10435 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10436 {
10437     /* Return a boolean as to if the two passed in inversion lists are
10438      * identical.  The final argument, if TRUE, says to take the complement of
10439      * the second inversion list before doing the comparison */
10440
10441     const UV len_a = _invlist_len(a);
10442     UV len_b = _invlist_len(b);
10443
10444     const UV* array_a = NULL;
10445     const UV* array_b = NULL;
10446
10447     PERL_ARGS_ASSERT__INVLISTEQ;
10448
10449     /* This code avoids accessing the arrays unless it knows the length is
10450      * non-zero */
10451
10452     if (len_a == 0) {
10453         if (len_b == 0) {
10454             return ! complement_b;
10455         }
10456     }
10457     else {
10458         array_a = invlist_array(a);
10459     }
10460
10461     if (len_b != 0) {
10462         array_b = invlist_array(b);
10463     }
10464
10465     /* If are to compare 'a' with the complement of b, set it
10466      * up so are looking at b's complement. */
10467     if (complement_b) {
10468
10469         /* The complement of nothing is everything, so <a> would have to have
10470          * just one element, starting at zero (ending at infinity) */
10471         if (len_b == 0) {
10472             return (len_a == 1 && array_a[0] == 0);
10473         }
10474         if (array_b[0] == 0) {
10475
10476             /* Otherwise, to complement, we invert.  Here, the first element is
10477              * 0, just remove it.  To do this, we just pretend the array starts
10478              * one later */
10479
10480             array_b++;
10481             len_b--;
10482         }
10483         else {
10484
10485             /* But if the first element is not zero, we pretend the list starts
10486              * at the 0 that is always stored immediately before the array. */
10487             array_b--;
10488             len_b++;
10489         }
10490     }
10491
10492     return    len_a == len_b
10493            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10494
10495 }
10496 #endif
10497
10498 /*
10499  * As best we can, determine the characters that can match the start of
10500  * the given EXACTF-ish node.  This is for use in creating ssc nodes, so there
10501  * can be false positive matches
10502  *
10503  * Returns the invlist as a new SV*; it is the caller's responsibility to
10504  * call SvREFCNT_dec() when done with it.
10505  */
10506 STATIC SV*
10507 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10508 {
10509     dVAR;
10510     const U8 * s = (U8*)STRING(node);
10511     SSize_t bytelen = STR_LEN(node);
10512     UV uc;
10513     /* Start out big enough for 2 separate code points */
10514     SV* invlist = _new_invlist(4);
10515
10516     PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10517
10518     if (! UTF) {
10519         uc = *s;
10520
10521         /* We punt and assume can match anything if the node begins
10522          * with a multi-character fold.  Things are complicated.  For
10523          * example, /ffi/i could match any of:
10524          *  "\N{LATIN SMALL LIGATURE FFI}"
10525          *  "\N{LATIN SMALL LIGATURE FF}I"
10526          *  "F\N{LATIN SMALL LIGATURE FI}"
10527          *  plus several other things; and making sure we have all the
10528          *  possibilities is hard. */
10529         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10530             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10531         }
10532         else {
10533             /* Any Latin1 range character can potentially match any
10534              * other depending on the locale, and in Turkic locales, U+130 and
10535              * U+131 */
10536             if (OP(node) == EXACTFL) {
10537                 _invlist_union(invlist, PL_Latin1, &invlist);
10538                 invlist = add_cp_to_invlist(invlist,
10539                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10540                 invlist = add_cp_to_invlist(invlist,
10541                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10542             }
10543             else {
10544                 /* But otherwise, it matches at least itself.  We can
10545                  * quickly tell if it has a distinct fold, and if so,
10546                  * it matches that as well */
10547                 invlist = add_cp_to_invlist(invlist, uc);
10548                 if (IS_IN_SOME_FOLD_L1(uc))
10549                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10550             }
10551
10552             /* Some characters match above-Latin1 ones under /i.  This
10553              * is true of EXACTFL ones when the locale is UTF-8 */
10554             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10555                 && (! isASCII(uc) || (OP(node) != EXACTFAA
10556                                     && OP(node) != EXACTFAA_NO_TRIE)))
10557             {
10558                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10559             }
10560         }
10561     }
10562     else {  /* Pattern is UTF-8 */
10563         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10564         const U8* e = s + bytelen;
10565         IV fc;
10566
10567         fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10568
10569         /* The only code points that aren't folded in a UTF EXACTFish
10570          * node are are the problematic ones in EXACTFL nodes */
10571         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10572             /* We need to check for the possibility that this EXACTFL
10573              * node begins with a multi-char fold.  Therefore we fold
10574              * the first few characters of it so that we can make that
10575              * check */
10576             U8 *d = folded;
10577             int i;
10578
10579             fc = -1;
10580             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10581                 if (isASCII(*s)) {
10582                     *(d++) = (U8) toFOLD(*s);
10583                     if (fc < 0) {       /* Save the first fold */
10584                         fc = *(d-1);
10585                     }
10586                     s++;
10587                 }
10588                 else {
10589                     STRLEN len;
10590                     UV fold = toFOLD_utf8_safe(s, e, d, &len);
10591                     if (fc < 0) {       /* Save the first fold */
10592                         fc = fold;
10593                     }
10594                     d += len;
10595                     s += UTF8SKIP(s);
10596                 }
10597             }
10598
10599             /* And set up so the code below that looks in this folded
10600              * buffer instead of the node's string */
10601             e = d;
10602             s = folded;
10603         }
10604
10605         /* When we reach here 's' points to the fold of the first
10606          * character(s) of the node; and 'e' points to far enough along
10607          * the folded string to be just past any possible multi-char
10608          * fold.
10609          *
10610          * Unlike the non-UTF-8 case, the macro for determining if a
10611          * string is a multi-char fold requires all the characters to
10612          * already be folded.  This is because of all the complications
10613          * if not.  Note that they are folded anyway, except in EXACTFL
10614          * nodes.  Like the non-UTF case above, we punt if the node
10615          * begins with a multi-char fold  */
10616
10617         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10618             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10619         }
10620         else {  /* Single char fold */
10621             unsigned int k;
10622             unsigned int first_fold;
10623             const unsigned int * remaining_folds;
10624             Size_t folds_count;
10625
10626             /* It matches itself */
10627             invlist = add_cp_to_invlist(invlist, fc);
10628
10629             /* ... plus all the things that fold to it, which are found in
10630              * PL_utf8_foldclosures */
10631             folds_count = _inverse_folds(fc, &first_fold,
10632                                                 &remaining_folds);
10633             for (k = 0; k < folds_count; k++) {
10634                 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10635
10636                 /* /aa doesn't allow folds between ASCII and non- */
10637                 if (   (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10638                     && isASCII(c) != isASCII(fc))
10639                 {
10640                     continue;
10641                 }
10642
10643                 invlist = add_cp_to_invlist(invlist, c);
10644             }
10645
10646             if (OP(node) == EXACTFL) {
10647
10648                 /* If either [iI] are present in an EXACTFL node the above code
10649                  * should have added its normal case pair, but under a Turkish
10650                  * locale they could match instead the case pairs from it.  Add
10651                  * those as potential matches as well */
10652                 if (isALPHA_FOLD_EQ(fc, 'I')) {
10653                     invlist = add_cp_to_invlist(invlist,
10654                                                 LATIN_SMALL_LETTER_DOTLESS_I);
10655                     invlist = add_cp_to_invlist(invlist,
10656                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10657                 }
10658                 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10659                     invlist = add_cp_to_invlist(invlist, 'I');
10660                 }
10661                 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10662                     invlist = add_cp_to_invlist(invlist, 'i');
10663                 }
10664             }
10665         }
10666     }
10667
10668     return invlist;
10669 }
10670
10671 #undef HEADER_LENGTH
10672 #undef TO_INTERNAL_SIZE
10673 #undef FROM_INTERNAL_SIZE
10674 #undef INVLIST_VERSION_ID
10675
10676 /* End of inversion list object */
10677
10678 STATIC void
10679 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10680 {
10681     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10682      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10683      * should point to the first flag; it is updated on output to point to the
10684      * final ')' or ':'.  There needs to be at least one flag, or this will
10685      * abort */
10686
10687     /* for (?g), (?gc), and (?o) warnings; warning
10688        about (?c) will warn about (?g) -- japhy    */
10689
10690 #define WASTED_O  0x01
10691 #define WASTED_G  0x02
10692 #define WASTED_C  0x04
10693 #define WASTED_GC (WASTED_G|WASTED_C)
10694     I32 wastedflags = 0x00;
10695     U32 posflags = 0, negflags = 0;
10696     U32 *flagsp = &posflags;
10697     char has_charset_modifier = '\0';
10698     regex_charset cs;
10699     bool has_use_defaults = FALSE;
10700     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10701     int x_mod_count = 0;
10702
10703     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10704
10705     /* '^' as an initial flag sets certain defaults */
10706     if (UCHARAT(RExC_parse) == '^') {
10707         RExC_parse++;
10708         has_use_defaults = TRUE;
10709         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10710         cs = (RExC_uni_semantics)
10711              ? REGEX_UNICODE_CHARSET
10712              : REGEX_DEPENDS_CHARSET;
10713         set_regex_charset(&RExC_flags, cs);
10714     }
10715     else {
10716         cs = get_regex_charset(RExC_flags);
10717         if (   cs == REGEX_DEPENDS_CHARSET
10718             && RExC_uni_semantics)
10719         {
10720             cs = REGEX_UNICODE_CHARSET;
10721         }
10722     }
10723
10724     while (RExC_parse < RExC_end) {
10725         /* && strchr("iogcmsx", *RExC_parse) */
10726         /* (?g), (?gc) and (?o) are useless here
10727            and must be globally applied -- japhy */
10728         switch (*RExC_parse) {
10729
10730             /* Code for the imsxn flags */
10731             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10732
10733             case LOCALE_PAT_MOD:
10734                 if (has_charset_modifier) {
10735                     goto excess_modifier;
10736                 }
10737                 else if (flagsp == &negflags) {
10738                     goto neg_modifier;
10739                 }
10740                 cs = REGEX_LOCALE_CHARSET;
10741                 has_charset_modifier = LOCALE_PAT_MOD;
10742                 break;
10743             case UNICODE_PAT_MOD:
10744                 if (has_charset_modifier) {
10745                     goto excess_modifier;
10746                 }
10747                 else if (flagsp == &negflags) {
10748                     goto neg_modifier;
10749                 }
10750                 cs = REGEX_UNICODE_CHARSET;
10751                 has_charset_modifier = UNICODE_PAT_MOD;
10752                 break;
10753             case ASCII_RESTRICT_PAT_MOD:
10754                 if (flagsp == &negflags) {
10755                     goto neg_modifier;
10756                 }
10757                 if (has_charset_modifier) {
10758                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10759                         goto excess_modifier;
10760                     }
10761                     /* Doubled modifier implies more restricted */
10762                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10763                 }
10764                 else {
10765                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10766                 }
10767                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10768                 break;
10769             case DEPENDS_PAT_MOD:
10770                 if (has_use_defaults) {
10771                     goto fail_modifiers;
10772                 }
10773                 else if (flagsp == &negflags) {
10774                     goto neg_modifier;
10775                 }
10776                 else if (has_charset_modifier) {
10777                     goto excess_modifier;
10778                 }
10779
10780                 /* The dual charset means unicode semantics if the
10781                  * pattern (or target, not known until runtime) are
10782                  * utf8, or something in the pattern indicates unicode
10783                  * semantics */
10784                 cs = (RExC_uni_semantics)
10785                      ? REGEX_UNICODE_CHARSET
10786                      : REGEX_DEPENDS_CHARSET;
10787                 has_charset_modifier = DEPENDS_PAT_MOD;
10788                 break;
10789               excess_modifier:
10790                 RExC_parse++;
10791                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10792                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10793                 }
10794                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10795                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10796                                         *(RExC_parse - 1));
10797                 }
10798                 else {
10799                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10800                 }
10801                 NOT_REACHED; /*NOTREACHED*/
10802               neg_modifier:
10803                 RExC_parse++;
10804                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10805                                     *(RExC_parse - 1));
10806                 NOT_REACHED; /*NOTREACHED*/
10807             case ONCE_PAT_MOD: /* 'o' */
10808             case GLOBAL_PAT_MOD: /* 'g' */
10809                 if (ckWARN(WARN_REGEXP)) {
10810                     const I32 wflagbit = *RExC_parse == 'o'
10811                                          ? WASTED_O
10812                                          : WASTED_G;
10813                     if (! (wastedflags & wflagbit) ) {
10814                         wastedflags |= wflagbit;
10815                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10816                         vWARN5(
10817                             RExC_parse + 1,
10818                             "Useless (%s%c) - %suse /%c modifier",
10819                             flagsp == &negflags ? "?-" : "?",
10820                             *RExC_parse,
10821                             flagsp == &negflags ? "don't " : "",
10822                             *RExC_parse
10823                         );
10824                     }
10825                 }
10826                 break;
10827
10828             case CONTINUE_PAT_MOD: /* 'c' */
10829                 if (ckWARN(WARN_REGEXP)) {
10830                     if (! (wastedflags & WASTED_C) ) {
10831                         wastedflags |= WASTED_GC;
10832                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10833                         vWARN3(
10834                             RExC_parse + 1,
10835                             "Useless (%sc) - %suse /gc modifier",
10836                             flagsp == &negflags ? "?-" : "?",
10837                             flagsp == &negflags ? "don't " : ""
10838                         );
10839                     }
10840                 }
10841                 break;
10842             case KEEPCOPY_PAT_MOD: /* 'p' */
10843                 if (flagsp == &negflags) {
10844                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10845                 } else {
10846                     *flagsp |= RXf_PMf_KEEPCOPY;
10847                 }
10848                 break;
10849             case '-':
10850                 /* A flag is a default iff it is following a minus, so
10851                  * if there is a minus, it means will be trying to
10852                  * re-specify a default which is an error */
10853                 if (has_use_defaults || flagsp == &negflags) {
10854                     goto fail_modifiers;
10855                 }
10856                 flagsp = &negflags;
10857                 wastedflags = 0;  /* reset so (?g-c) warns twice */
10858                 x_mod_count = 0;
10859                 break;
10860             case ':':
10861             case ')':
10862
10863                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10864                     negflags |= RXf_PMf_EXTENDED_MORE;
10865                 }
10866                 RExC_flags |= posflags;
10867
10868                 if (negflags & RXf_PMf_EXTENDED) {
10869                     negflags |= RXf_PMf_EXTENDED_MORE;
10870                 }
10871                 RExC_flags &= ~negflags;
10872                 set_regex_charset(&RExC_flags, cs);
10873
10874                 return;
10875             default:
10876               fail_modifiers:
10877                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
10878                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10879                 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10880                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10881                 NOT_REACHED; /*NOTREACHED*/
10882         }
10883
10884         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10885     }
10886
10887     vFAIL("Sequence (?... not terminated");
10888 }
10889
10890 /*
10891  - reg - regular expression, i.e. main body or parenthesized thing
10892  *
10893  * Caller must absorb opening parenthesis.
10894  *
10895  * Combining parenthesis handling with the base level of regular expression
10896  * is a trifle forced, but the need to tie the tails of the branches to what
10897  * follows makes it hard to avoid.
10898  */
10899 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10900 #ifdef DEBUGGING
10901 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10902 #else
10903 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10904 #endif
10905
10906 PERL_STATIC_INLINE regnode_offset
10907 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10908                              I32 *flagp,
10909                              char * parse_start,
10910                              char ch
10911                       )
10912 {
10913     regnode_offset ret;
10914     char* name_start = RExC_parse;
10915     U32 num = 0;
10916     SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
10917     GET_RE_DEBUG_FLAGS_DECL;
10918
10919     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10920
10921     if (RExC_parse == name_start || *RExC_parse != ch) {
10922         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
10923         vFAIL2("Sequence %.3s... not terminated", parse_start);
10924     }
10925
10926     if (sv_dat) {
10927         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10928         RExC_rxi->data->data[num]=(void*)sv_dat;
10929         SvREFCNT_inc_simple_void_NN(sv_dat);
10930     }
10931     RExC_sawback = 1;
10932     ret = reganode(pRExC_state,
10933                    ((! FOLD)
10934                      ? REFN
10935                      : (ASCII_FOLD_RESTRICTED)
10936                        ? REFFAN
10937                        : (AT_LEAST_UNI_SEMANTICS)
10938                          ? REFFUN
10939                          : (LOC)
10940                            ? REFFLN
10941                            : REFFN),
10942                     num);
10943     *flagp |= HASWIDTH;
10944
10945     Set_Node_Offset(REGNODE_p(ret), parse_start+1);
10946     Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
10947
10948     nextchar(pRExC_state);
10949     return ret;
10950 }
10951
10952 /* On success, returns the offset at which any next node should be placed into
10953  * the regex engine program being compiled.
10954  *
10955  * Returns 0 otherwise, with *flagp set to indicate why:
10956  *  TRYAGAIN        at the end of (?) that only sets flags.
10957  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
10958  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
10959  *  Otherwise would only return 0 if regbranch() returns 0, which cannot
10960  *  happen.  */
10961 STATIC regnode_offset
10962 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
10963     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
10964      * 2 is like 1, but indicates that nextchar() has been called to advance
10965      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
10966      * this flag alerts us to the need to check for that */
10967 {
10968     regnode_offset ret = 0;    /* Will be the head of the group. */
10969     regnode_offset br;
10970     regnode_offset lastbr;
10971     regnode_offset ender = 0;
10972     I32 parno = 0;
10973     I32 flags;
10974     U32 oregflags = RExC_flags;
10975     bool have_branch = 0;
10976     bool is_open = 0;
10977     I32 freeze_paren = 0;
10978     I32 after_freeze = 0;
10979     I32 num; /* numeric backreferences */
10980     SV * max_open;  /* Max number of unclosed parens */
10981
10982     char * parse_start = RExC_parse; /* MJD */
10983     char * const oregcomp_parse = RExC_parse;
10984
10985     GET_RE_DEBUG_FLAGS_DECL;
10986
10987     PERL_ARGS_ASSERT_REG;
10988     DEBUG_PARSE("reg ");
10989
10990     max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
10991     assert(max_open);
10992     if (!SvIOK(max_open)) {
10993         sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
10994     }
10995     if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
10996                                               open paren */
10997         vFAIL("Too many nested open parens");
10998     }
10999
11000     *flagp = 0;                         /* Tentatively. */
11001
11002     if (RExC_in_lookbehind) {
11003         RExC_in_lookbehind++;
11004     }
11005     if (RExC_in_lookahead) {
11006         RExC_in_lookahead++;
11007     }
11008
11009     /* Having this true makes it feasible to have a lot fewer tests for the
11010      * parse pointer being in scope.  For example, we can write
11011      *      while(isFOO(*RExC_parse)) RExC_parse++;
11012      * instead of
11013      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11014      */
11015     assert(*RExC_end == '\0');
11016
11017     /* Make an OPEN node, if parenthesized. */
11018     if (paren) {
11019
11020         /* Under /x, space and comments can be gobbled up between the '(' and
11021          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
11022          * intervening space, as the sequence is a token, and a token should be
11023          * indivisible */
11024         bool has_intervening_patws = (paren == 2)
11025                                   && *(RExC_parse - 1) != '(';
11026
11027         if (RExC_parse >= RExC_end) {
11028             vFAIL("Unmatched (");
11029         }
11030
11031         if (paren == 'r') {     /* Atomic script run */
11032             paren = '>';
11033             goto parse_rest;
11034         }
11035         else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11036             char *start_verb = RExC_parse + 1;
11037             STRLEN verb_len;
11038             char *start_arg = NULL;
11039             unsigned char op = 0;
11040             int arg_required = 0;
11041             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11042             bool has_upper = FALSE;
11043
11044             if (has_intervening_patws) {
11045                 RExC_parse++;   /* past the '*' */
11046
11047                 /* For strict backwards compatibility, don't change the message
11048                  * now that we also have lowercase operands */
11049                 if (isUPPER(*RExC_parse)) {
11050                     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11051                 }
11052                 else {
11053                     vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11054                 }
11055             }
11056             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11057                 if ( *RExC_parse == ':' ) {
11058                     start_arg = RExC_parse + 1;
11059                     break;
11060                 }
11061                 else if (! UTF) {
11062                     if (isUPPER(*RExC_parse)) {
11063                         has_upper = TRUE;
11064                     }
11065                     RExC_parse++;
11066                 }
11067                 else {
11068                     RExC_parse += UTF8SKIP(RExC_parse);
11069                 }
11070             }
11071             verb_len = RExC_parse - start_verb;
11072             if ( start_arg ) {
11073                 if (RExC_parse >= RExC_end) {
11074                     goto unterminated_verb_pattern;
11075                 }
11076
11077                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11078                 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11079                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11080                 }
11081                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11082                   unterminated_verb_pattern:
11083                     if (has_upper) {
11084                         vFAIL("Unterminated verb pattern argument");
11085                     }
11086                     else {
11087                         vFAIL("Unterminated '(*...' argument");
11088                     }
11089                 }
11090             } else {
11091                 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11092                     if (has_upper) {
11093                         vFAIL("Unterminated verb pattern");
11094                     }
11095                     else {
11096                         vFAIL("Unterminated '(*...' construct");
11097                     }
11098                 }
11099             }
11100
11101             /* Here, we know that RExC_parse < RExC_end */
11102
11103             switch ( *start_verb ) {
11104             case 'A':  /* (*ACCEPT) */
11105                 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11106                     op = ACCEPT;
11107                     internal_argval = RExC_nestroot;
11108                 }
11109                 break;
11110             case 'C':  /* (*COMMIT) */
11111                 if ( memEQs(start_verb, verb_len,"COMMIT") )
11112                     op = COMMIT;
11113                 break;
11114             case 'F':  /* (*FAIL) */
11115                 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11116                     op = OPFAIL;
11117                 }
11118                 break;
11119             case ':':  /* (*:NAME) */
11120             case 'M':  /* (*MARK:NAME) */
11121                 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11122                     op = MARKPOINT;
11123                     arg_required = 1;
11124                 }
11125                 break;
11126             case 'P':  /* (*PRUNE) */
11127                 if ( memEQs(start_verb, verb_len,"PRUNE") )
11128                     op = PRUNE;
11129                 break;
11130             case 'S':   /* (*SKIP) */
11131                 if ( memEQs(start_verb, verb_len,"SKIP") )
11132                     op = SKIP;
11133                 break;
11134             case 'T':  /* (*THEN) */
11135                 /* [19:06] <TimToady> :: is then */
11136                 if ( memEQs(start_verb, verb_len,"THEN") ) {
11137                     op = CUTGROUP;
11138                     RExC_seen |= REG_CUTGROUP_SEEN;
11139                 }
11140                 break;
11141             case 'a':
11142                 if (   memEQs(start_verb, verb_len, "asr")
11143                     || memEQs(start_verb, verb_len, "atomic_script_run"))
11144                 {
11145                     paren = 'r';        /* Mnemonic: recursed run */
11146                     goto script_run;
11147                 }
11148                 else if (memEQs(start_verb, verb_len, "atomic")) {
11149                     paren = 't';    /* AtOMIC */
11150                     goto alpha_assertions;
11151                 }
11152                 break;
11153             case 'p':
11154                 if (   memEQs(start_verb, verb_len, "plb")
11155                     || memEQs(start_verb, verb_len, "positive_lookbehind"))
11156                 {
11157                     paren = 'b';
11158                     goto lookbehind_alpha_assertions;
11159                 }
11160                 else if (   memEQs(start_verb, verb_len, "pla")
11161                          || memEQs(start_verb, verb_len, "positive_lookahead"))
11162                 {
11163                     paren = 'a';
11164                     goto alpha_assertions;
11165                 }
11166                 break;
11167             case 'n':
11168                 if (   memEQs(start_verb, verb_len, "nlb")
11169                     || memEQs(start_verb, verb_len, "negative_lookbehind"))
11170                 {
11171                     paren = 'B';
11172                     goto lookbehind_alpha_assertions;
11173                 }
11174                 else if (   memEQs(start_verb, verb_len, "nla")
11175                          || memEQs(start_verb, verb_len, "negative_lookahead"))
11176                 {
11177                     paren = 'A';
11178                     goto alpha_assertions;
11179                 }
11180                 break;
11181             case 's':
11182                 if (   memEQs(start_verb, verb_len, "sr")
11183                     || memEQs(start_verb, verb_len, "script_run"))
11184                 {
11185                     regnode_offset atomic;
11186
11187                     paren = 's';
11188
11189                    script_run:
11190
11191                     /* This indicates Unicode rules. */
11192                     REQUIRE_UNI_RULES(flagp, 0);
11193
11194                     if (! start_arg) {
11195                         goto no_colon;
11196                     }
11197
11198                     RExC_parse = start_arg;
11199
11200                     if (RExC_in_script_run) {
11201
11202                         /*  Nested script runs are treated as no-ops, because
11203                          *  if the nested one fails, the outer one must as
11204                          *  well.  It could fail sooner, and avoid (??{} with
11205                          *  side effects, but that is explicitly documented as
11206                          *  undefined behavior. */
11207
11208                         ret = 0;
11209
11210                         if (paren == 's') {
11211                             paren = ':';
11212                             goto parse_rest;
11213                         }
11214
11215                         /* But, the atomic part of a nested atomic script run
11216                          * isn't a no-op, but can be treated just like a '(?>'
11217                          * */
11218                         paren = '>';
11219                         goto parse_rest;
11220                     }
11221
11222                     if (paren == 's') {
11223                         /* Here, we're starting a new regular script run */
11224                         ret = reg_node(pRExC_state, SROPEN);
11225                         RExC_in_script_run = 1;
11226                         is_open = 1;
11227                         goto parse_rest;
11228                     }
11229
11230                     /* Here, we are starting an atomic script run.  This is
11231                      * handled by recursing to deal with the atomic portion
11232                      * separately, enclosed in SROPEN ... SRCLOSE nodes */
11233
11234                     ret = reg_node(pRExC_state, SROPEN);
11235
11236                     RExC_in_script_run = 1;
11237
11238                     atomic = reg(pRExC_state, 'r', &flags, depth);
11239                     if (flags & (RESTART_PARSE|NEED_UTF8)) {
11240                         *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11241                         return 0;
11242                     }
11243
11244                     if (! REGTAIL(pRExC_state, ret, atomic)) {
11245                         REQUIRE_BRANCHJ(flagp, 0);
11246                     }
11247
11248                     if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11249                                                                 SRCLOSE)))
11250                     {
11251                         REQUIRE_BRANCHJ(flagp, 0);
11252                     }
11253
11254                     RExC_in_script_run = 0;
11255                     return ret;
11256                 }
11257
11258                 break;
11259
11260             lookbehind_alpha_assertions:
11261                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11262                 RExC_in_lookbehind++;
11263                 /*FALLTHROUGH*/
11264
11265             alpha_assertions:
11266
11267                 RExC_seen_zerolen++;
11268
11269                 if (! start_arg) {
11270                     goto no_colon;
11271                 }
11272
11273                 /* An empty negative lookahead assertion simply is failure */
11274                 if (paren == 'A' && RExC_parse == start_arg) {
11275                     ret=reganode(pRExC_state, OPFAIL, 0);
11276                     nextchar(pRExC_state);
11277                     return ret;
11278                 }
11279
11280                 RExC_parse = start_arg;
11281                 goto parse_rest;
11282
11283               no_colon:
11284                 vFAIL2utf8f(
11285                 "'(*%" UTF8f "' requires a terminating ':'",
11286                 UTF8fARG(UTF, verb_len, start_verb));
11287                 NOT_REACHED; /*NOTREACHED*/
11288
11289             } /* End of switch */
11290             if ( ! op ) {
11291                 RExC_parse += UTF
11292                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11293                               : 1;
11294                 if (has_upper || verb_len == 0) {
11295                     vFAIL2utf8f(
11296                     "Unknown verb pattern '%" UTF8f "'",
11297                     UTF8fARG(UTF, verb_len, start_verb));
11298                 }
11299                 else {
11300                     vFAIL2utf8f(
11301                     "Unknown '(*...)' construct '%" UTF8f "'",
11302                     UTF8fARG(UTF, verb_len, start_verb));
11303                 }
11304             }
11305             if ( RExC_parse == start_arg ) {
11306                 start_arg = NULL;
11307             }
11308             if ( arg_required && !start_arg ) {
11309                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11310                     verb_len, start_verb);
11311             }
11312             if (internal_argval == -1) {
11313                 ret = reganode(pRExC_state, op, 0);
11314             } else {
11315                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11316             }
11317             RExC_seen |= REG_VERBARG_SEEN;
11318             if (start_arg) {
11319                 SV *sv = newSVpvn( start_arg,
11320                                     RExC_parse - start_arg);
11321                 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11322                                         STR_WITH_LEN("S"));
11323                 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11324                 FLAGS(REGNODE_p(ret)) = 1;
11325             } else {
11326                 FLAGS(REGNODE_p(ret)) = 0;
11327             }
11328             if ( internal_argval != -1 )
11329                 ARG2L_SET(REGNODE_p(ret), internal_argval);
11330             nextchar(pRExC_state);
11331             return ret;
11332         }
11333         else if (*RExC_parse == '?') { /* (?...) */
11334             bool is_logical = 0;
11335             const char * const seqstart = RExC_parse;
11336             const char * endptr;
11337             if (has_intervening_patws) {
11338                 RExC_parse++;
11339                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11340             }
11341
11342             RExC_parse++;           /* past the '?' */
11343             paren = *RExC_parse;    /* might be a trailing NUL, if not
11344                                        well-formed */
11345             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11346             if (RExC_parse > RExC_end) {
11347                 paren = '\0';
11348             }
11349             ret = 0;                    /* For look-ahead/behind. */
11350             switch (paren) {
11351
11352             case 'P':   /* (?P...) variants for those used to PCRE/Python */
11353                 paren = *RExC_parse;
11354                 if ( paren == '<') {    /* (?P<...>) named capture */
11355                     RExC_parse++;
11356                     if (RExC_parse >= RExC_end) {
11357                         vFAIL("Sequence (?P<... not terminated");
11358                     }
11359                     goto named_capture;
11360                 }
11361                 else if (paren == '>') {   /* (?P>name) named recursion */
11362                     RExC_parse++;
11363                     if (RExC_parse >= RExC_end) {
11364                         vFAIL("Sequence (?P>... not terminated");
11365                     }
11366                     goto named_recursion;
11367                 }
11368                 else if (paren == '=') {   /* (?P=...)  named backref */
11369                     RExC_parse++;
11370                     return handle_named_backref(pRExC_state, flagp,
11371                                                 parse_start, ')');
11372                 }
11373                 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11374                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11375                 vFAIL3("Sequence (%.*s...) not recognized",
11376                                 RExC_parse-seqstart, seqstart);
11377                 NOT_REACHED; /*NOTREACHED*/
11378             case '<':           /* (?<...) */
11379                 /* If you want to support (?<*...), first reconcile with GH #17363 */
11380                 if (*RExC_parse == '!')
11381                     paren = ',';
11382                 else if (*RExC_parse != '=')
11383               named_capture:
11384                 {               /* (?<...>) */
11385                     char *name_start;
11386                     SV *svname;
11387                     paren= '>';
11388                 /* FALLTHROUGH */
11389             case '\'':          /* (?'...') */
11390                     name_start = RExC_parse;
11391                     svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11392                     if (   RExC_parse == name_start
11393                         || RExC_parse >= RExC_end
11394                         || *RExC_parse != paren)
11395                     {
11396                         vFAIL2("Sequence (?%c... not terminated",
11397                             paren=='>' ? '<' : paren);
11398                     }
11399                     {
11400                         HE *he_str;
11401                         SV *sv_dat = NULL;
11402                         if (!svname) /* shouldn't happen */
11403                             Perl_croak(aTHX_
11404                                 "panic: reg_scan_name returned NULL");
11405                         if (!RExC_paren_names) {
11406                             RExC_paren_names= newHV();
11407                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
11408 #ifdef DEBUGGING
11409                             RExC_paren_name_list= newAV();
11410                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11411 #endif
11412                         }
11413                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11414                         if ( he_str )
11415                             sv_dat = HeVAL(he_str);
11416                         if ( ! sv_dat ) {
11417                             /* croak baby croak */
11418                             Perl_croak(aTHX_
11419                                 "panic: paren_name hash element allocation failed");
11420                         } else if ( SvPOK(sv_dat) ) {
11421                             /* (?|...) can mean we have dupes so scan to check
11422                                its already been stored. Maybe a flag indicating
11423                                we are inside such a construct would be useful,
11424                                but the arrays are likely to be quite small, so
11425                                for now we punt -- dmq */
11426                             IV count = SvIV(sv_dat);
11427                             I32 *pv = (I32*)SvPVX(sv_dat);
11428                             IV i;
11429                             for ( i = 0 ; i < count ; i++ ) {
11430                                 if ( pv[i] == RExC_npar ) {
11431                                     count = 0;
11432                                     break;
11433                                 }
11434                             }
11435                             if ( count ) {
11436                                 pv = (I32*)SvGROW(sv_dat,
11437                                                 SvCUR(sv_dat) + sizeof(I32)+1);
11438                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11439                                 pv[count] = RExC_npar;
11440                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11441                             }
11442                         } else {
11443                             (void)SvUPGRADE(sv_dat, SVt_PVNV);
11444                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
11445                                                                 sizeof(I32));
11446                             SvIOK_on(sv_dat);
11447                             SvIV_set(sv_dat, 1);
11448                         }
11449 #ifdef DEBUGGING
11450                         /* Yes this does cause a memory leak in debugging Perls
11451                          * */
11452                         if (!av_store(RExC_paren_name_list,
11453                                       RExC_npar, SvREFCNT_inc_NN(svname)))
11454                             SvREFCNT_dec_NN(svname);
11455 #endif
11456
11457                         /*sv_dump(sv_dat);*/
11458                     }
11459                     nextchar(pRExC_state);
11460                     paren = 1;
11461                     goto capturing_parens;
11462                 }
11463
11464                 RExC_seen |= REG_LOOKBEHIND_SEEN;
11465                 RExC_in_lookbehind++;
11466                 RExC_parse++;
11467                 if (RExC_parse >= RExC_end) {
11468                     vFAIL("Sequence (?... not terminated");
11469                 }
11470                 RExC_seen_zerolen++;
11471                 break;
11472             case '=':           /* (?=...) */
11473                 RExC_seen_zerolen++;
11474                 RExC_in_lookahead++;
11475                 break;
11476             case '!':           /* (?!...) */
11477                 RExC_seen_zerolen++;
11478                 /* check if we're really just a "FAIL" assertion */
11479                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11480                                         FALSE /* Don't force to /x */ );
11481                 if (*RExC_parse == ')') {
11482                     ret=reganode(pRExC_state, OPFAIL, 0);
11483                     nextchar(pRExC_state);
11484                     return ret;
11485                 }
11486                 break;
11487             case '|':           /* (?|...) */
11488                 /* branch reset, behave like a (?:...) except that
11489                    buffers in alternations share the same numbers */
11490                 paren = ':';
11491                 after_freeze = freeze_paren = RExC_npar;
11492
11493                 /* XXX This construct currently requires an extra pass.
11494                  * Investigation would be required to see if that could be
11495                  * changed */
11496                 REQUIRE_PARENS_PASS;
11497                 break;
11498             case ':':           /* (?:...) */
11499             case '>':           /* (?>...) */
11500                 break;
11501             case '$':           /* (?$...) */
11502             case '@':           /* (?@...) */
11503                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11504                 break;
11505             case '0' :           /* (?0) */
11506             case 'R' :           /* (?R) */
11507                 if (RExC_parse == RExC_end || *RExC_parse != ')')
11508                     FAIL("Sequence (?R) not terminated");
11509                 num = 0;
11510                 RExC_seen |= REG_RECURSE_SEEN;
11511
11512                 /* XXX These constructs currently require an extra pass.
11513                  * It probably could be changed */
11514                 REQUIRE_PARENS_PASS;
11515
11516                 *flagp |= POSTPONED;
11517                 goto gen_recurse_regop;
11518                 /*notreached*/
11519             /* named and numeric backreferences */
11520             case '&':            /* (?&NAME) */
11521                 parse_start = RExC_parse - 1;
11522               named_recursion:
11523                 {
11524                     SV *sv_dat = reg_scan_name(pRExC_state,
11525                                                REG_RSN_RETURN_DATA);
11526                    num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11527                 }
11528                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11529                     vFAIL("Sequence (?&... not terminated");
11530                 goto gen_recurse_regop;
11531                 /* NOTREACHED */
11532             case '+':
11533                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11534                     RExC_parse++;
11535                     vFAIL("Illegal pattern");
11536                 }
11537                 goto parse_recursion;
11538                 /* NOTREACHED*/
11539             case '-': /* (?-1) */
11540                 if (! inRANGE(RExC_parse[0], '1', '9')) {
11541                     RExC_parse--; /* rewind to let it be handled later */
11542                     goto parse_flags;
11543                 }
11544                 /* FALLTHROUGH */
11545             case '1': case '2': case '3': case '4': /* (?1) */
11546             case '5': case '6': case '7': case '8': case '9':
11547                 RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
11548               parse_recursion:
11549                 {
11550                     bool is_neg = FALSE;
11551                     UV unum;
11552                     parse_start = RExC_parse - 1; /* MJD */
11553                     if (*RExC_parse == '-') {
11554                         RExC_parse++;
11555                         is_neg = TRUE;
11556                     }
11557                     endptr = RExC_end;
11558                     if (grok_atoUV(RExC_parse, &unum, &endptr)
11559                         && unum <= I32_MAX
11560                     ) {
11561                         num = (I32)unum;
11562                         RExC_parse = (char*)endptr;
11563                     } else
11564                         num = I32_MAX;
11565                     if (is_neg) {
11566                         /* Some limit for num? */
11567                         num = -num;
11568                     }
11569                 }
11570                 if (*RExC_parse!=')')
11571                     vFAIL("Expecting close bracket");
11572
11573               gen_recurse_regop:
11574                 if ( paren == '-' ) {
11575                     /*
11576                     Diagram of capture buffer numbering.
11577                     Top line is the normal capture buffer numbers
11578                     Bottom line is the negative indexing as from
11579                     the X (the (?-2))
11580
11581                     +   1 2    3 4 5 X          6 7
11582                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11583                     -   5 4    3 2 1 X          x x
11584
11585                     */
11586                     num = RExC_npar + num;
11587                     if (num < 1)  {
11588
11589                         /* It might be a forward reference; we can't fail until
11590                          * we know, by completing the parse to get all the
11591                          * groups, and then reparsing */
11592                         if (ALL_PARENS_COUNTED)  {
11593                             RExC_parse++;
11594                             vFAIL("Reference to nonexistent group");
11595                         }
11596                         else {
11597                             REQUIRE_PARENS_PASS;
11598                         }
11599                     }
11600                 } else if ( paren == '+' ) {
11601                     num = RExC_npar + num - 1;
11602                 }
11603                 /* We keep track how many GOSUB items we have produced.
11604                    To start off the ARG2L() of the GOSUB holds its "id",
11605                    which is used later in conjunction with RExC_recurse
11606                    to calculate the offset we need to jump for the GOSUB,
11607                    which it will store in the final representation.
11608                    We have to defer the actual calculation until much later
11609                    as the regop may move.
11610                  */
11611
11612                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11613                 if (num >= RExC_npar) {
11614
11615                     /* It might be a forward reference; we can't fail until we
11616                      * know, by completing the parse to get all the groups, and
11617                      * then reparsing */
11618                     if (ALL_PARENS_COUNTED)  {
11619                         if (num >= RExC_total_parens) {
11620                             RExC_parse++;
11621                             vFAIL("Reference to nonexistent group");
11622                         }
11623                     }
11624                     else {
11625                         REQUIRE_PARENS_PASS;
11626                     }
11627                 }
11628                 RExC_recurse_count++;
11629                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11630                     "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11631                             22, "|    |", (int)(depth * 2 + 1), "",
11632                             (UV)ARG(REGNODE_p(ret)),
11633                             (IV)ARG2L(REGNODE_p(ret))));
11634                 RExC_seen |= REG_RECURSE_SEEN;
11635
11636                 Set_Node_Length(REGNODE_p(ret),
11637                                 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11638                 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11639
11640                 *flagp |= POSTPONED;
11641                 assert(*RExC_parse == ')');
11642                 nextchar(pRExC_state);
11643                 return ret;
11644
11645             /* NOTREACHED */
11646
11647             case '?':           /* (??...) */
11648                 is_logical = 1;
11649                 if (*RExC_parse != '{') {
11650                     RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11651                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11652                     vFAIL2utf8f(
11653                         "Sequence (%" UTF8f "...) not recognized",
11654                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11655                     NOT_REACHED; /*NOTREACHED*/
11656                 }
11657                 *flagp |= POSTPONED;
11658                 paren = '{';
11659                 RExC_parse++;
11660                 /* FALLTHROUGH */
11661             case '{':           /* (?{...}) */
11662             {
11663                 U32 n = 0;
11664                 struct reg_code_block *cb;
11665                 OP * o;
11666
11667                 RExC_seen_zerolen++;
11668
11669                 if (   !pRExC_state->code_blocks
11670                     || pRExC_state->code_index
11671                                         >= pRExC_state->code_blocks->count
11672                     || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11673                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11674                             - RExC_start)
11675                 ) {
11676                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
11677                         FAIL("panic: Sequence (?{...}): no code block found\n");
11678                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
11679                 }
11680                 /* this is a pre-compiled code block (?{...}) */
11681                 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11682                 RExC_parse = RExC_start + cb->end;
11683                 o = cb->block;
11684                 if (cb->src_regex) {
11685                     n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11686                     RExC_rxi->data->data[n] =
11687                         (void*)SvREFCNT_inc((SV*)cb->src_regex);
11688                     RExC_rxi->data->data[n+1] = (void*)o;
11689                 }
11690                 else {
11691                     n = add_data(pRExC_state,
11692                             (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11693                     RExC_rxi->data->data[n] = (void*)o;
11694                 }
11695                 pRExC_state->code_index++;
11696                 nextchar(pRExC_state);
11697
11698                 if (is_logical) {
11699                     regnode_offset eval;
11700                     ret = reg_node(pRExC_state, LOGICAL);
11701
11702                     eval = reg2Lanode(pRExC_state, EVAL,
11703                                        n,
11704
11705                                        /* for later propagation into (??{})
11706                                         * return value */
11707                                        RExC_flags & RXf_PMf_COMPILETIME
11708                                       );
11709                     FLAGS(REGNODE_p(ret)) = 2;
11710                     if (! REGTAIL(pRExC_state, ret, eval)) {
11711                         REQUIRE_BRANCHJ(flagp, 0);
11712                     }
11713                     /* deal with the length of this later - MJD */
11714                     return ret;
11715                 }
11716                 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11717                 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11718                 Set_Node_Offset(REGNODE_p(ret), parse_start);
11719                 return ret;
11720             }
11721             case '(':           /* (?(?{...})...) and (?(?=...)...) */
11722             {
11723                 int is_define= 0;
11724                 const int DEFINE_len = sizeof("DEFINE") - 1;
11725                 if (    RExC_parse < RExC_end - 1
11726                     && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
11727                             && (   RExC_parse[1] == '='
11728                                 || RExC_parse[1] == '!'
11729                                 || RExC_parse[1] == '<'
11730                                 || RExC_parse[1] == '{'))
11731                         || (       RExC_parse[0] == '*'        /* (?(*...)) */
11732                             && (   memBEGINs(RExC_parse + 1,
11733                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11734                                          "pla:")
11735                                 || memBEGINs(RExC_parse + 1,
11736                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11737                                          "plb:")
11738                                 || memBEGINs(RExC_parse + 1,
11739                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11740                                          "nla:")
11741                                 || memBEGINs(RExC_parse + 1,
11742                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11743                                          "nlb:")
11744                                 || memBEGINs(RExC_parse + 1,
11745                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11746                                          "positive_lookahead:")
11747                                 || memBEGINs(RExC_parse + 1,
11748                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11749                                          "positive_lookbehind:")
11750                                 || memBEGINs(RExC_parse + 1,
11751                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11752                                          "negative_lookahead:")
11753                                 || memBEGINs(RExC_parse + 1,
11754                                          (Size_t) (RExC_end - (RExC_parse + 1)),
11755                                          "negative_lookbehind:"))))
11756                 ) { /* Lookahead or eval. */
11757                     I32 flag;
11758                     regnode_offset tail;
11759
11760                     ret = reg_node(pRExC_state, LOGICAL);
11761                     FLAGS(REGNODE_p(ret)) = 1;
11762
11763                     tail = reg(pRExC_state, 1, &flag, depth+1);
11764                     RETURN_FAIL_ON_RESTART(flag, flagp);
11765                     if (! REGTAIL(pRExC_state, ret, tail)) {
11766                         REQUIRE_BRANCHJ(flagp, 0);
11767                     }
11768                     goto insert_if;
11769                 }
11770                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
11771                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11772                 {
11773                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
11774                     char *name_start= RExC_parse++;
11775                     U32 num = 0;
11776                     SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11777                     if (   RExC_parse == name_start
11778                         || RExC_parse >= RExC_end
11779                         || *RExC_parse != ch)
11780                     {
11781                         vFAIL2("Sequence (?(%c... not terminated",
11782                             (ch == '>' ? '<' : ch));
11783                     }
11784                     RExC_parse++;
11785                     if (sv_dat) {
11786                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
11787                         RExC_rxi->data->data[num]=(void*)sv_dat;
11788                         SvREFCNT_inc_simple_void_NN(sv_dat);
11789                     }
11790                     ret = reganode(pRExC_state, GROUPPN, num);
11791                     goto insert_if_check_paren;
11792                 }
11793                 else if (memBEGINs(RExC_parse,
11794                                    (STRLEN) (RExC_end - RExC_parse),
11795                                    "DEFINE"))
11796                 {
11797                     ret = reganode(pRExC_state, DEFINEP, 0);
11798                     RExC_parse += DEFINE_len;
11799                     is_define = 1;
11800                     goto insert_if_check_paren;
11801                 }
11802                 else if (RExC_parse[0] == 'R') {
11803                     RExC_parse++;
11804                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
11805                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11806                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11807                      */
11808                     parno = 0;
11809                     if (RExC_parse[0] == '0') {
11810                         parno = 1;
11811                         RExC_parse++;
11812                     }
11813                     else if (inRANGE(RExC_parse[0], '1', '9')) {
11814                         UV uv;
11815                         endptr = RExC_end;
11816                         if (grok_atoUV(RExC_parse, &uv, &endptr)
11817                             && uv <= I32_MAX
11818                         ) {
11819                             parno = (I32)uv + 1;
11820                             RExC_parse = (char*)endptr;
11821                         }
11822                         /* else "Switch condition not recognized" below */
11823                     } else if (RExC_parse[0] == '&') {
11824                         SV *sv_dat;
11825                         RExC_parse++;
11826                         sv_dat = reg_scan_name(pRExC_state,
11827                                                REG_RSN_RETURN_DATA);
11828                         if (sv_dat)
11829                             parno = 1 + *((I32 *)SvPVX(sv_dat));
11830                     }
11831                     ret = reganode(pRExC_state, INSUBP, parno);
11832                     goto insert_if_check_paren;
11833                 }
11834                 else if (inRANGE(RExC_parse[0], '1', '9')) {
11835                     /* (?(1)...) */
11836                     char c;
11837                     UV uv;
11838                     endptr = RExC_end;
11839                     if (grok_atoUV(RExC_parse, &uv, &endptr)
11840                         && uv <= I32_MAX
11841                     ) {
11842                         parno = (I32)uv;
11843                         RExC_parse = (char*)endptr;
11844                     }
11845                     else {
11846                         vFAIL("panic: grok_atoUV returned FALSE");
11847                     }
11848                     ret = reganode(pRExC_state, GROUPP, parno);
11849
11850                  insert_if_check_paren:
11851                     if (UCHARAT(RExC_parse) != ')') {
11852                         RExC_parse += UTF
11853                                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11854                                       : 1;
11855                         vFAIL("Switch condition not recognized");
11856                     }
11857                     nextchar(pRExC_state);
11858                   insert_if:
11859                     if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
11860                                                              IFTHEN, 0)))
11861                     {
11862                         REQUIRE_BRANCHJ(flagp, 0);
11863                     }
11864                     br = regbranch(pRExC_state, &flags, 1, depth+1);
11865                     if (br == 0) {
11866                         RETURN_FAIL_ON_RESTART(flags,flagp);
11867                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11868                               (UV) flags);
11869                     } else
11870                     if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
11871                                                              LONGJMP, 0)))
11872                     {
11873                         REQUIRE_BRANCHJ(flagp, 0);
11874                     }
11875                     c = UCHARAT(RExC_parse);
11876                     nextchar(pRExC_state);
11877                     if (flags&HASWIDTH)
11878                         *flagp |= HASWIDTH;
11879                     if (c == '|') {
11880                         if (is_define)
11881                             vFAIL("(?(DEFINE)....) does not allow branches");
11882
11883                         /* Fake one for optimizer.  */
11884                         lastbr = reganode(pRExC_state, IFTHEN, 0);
11885
11886                         if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
11887                             RETURN_FAIL_ON_RESTART(flags, flagp);
11888                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11889                                   (UV) flags);
11890                         }
11891                         if (! REGTAIL(pRExC_state, ret, lastbr)) {
11892                             REQUIRE_BRANCHJ(flagp, 0);
11893                         }
11894                         if (flags&HASWIDTH)
11895                             *flagp |= HASWIDTH;
11896                         c = UCHARAT(RExC_parse);
11897                         nextchar(pRExC_state);
11898                     }
11899                     else
11900                         lastbr = 0;
11901                     if (c != ')') {
11902                         if (RExC_parse >= RExC_end)
11903                             vFAIL("Switch (?(condition)... not terminated");
11904                         else
11905                             vFAIL("Switch (?(condition)... contains too many branches");
11906                     }
11907                     ender = reg_node(pRExC_state, TAIL);
11908                     if (! REGTAIL(pRExC_state, br, ender)) {
11909                         REQUIRE_BRANCHJ(flagp, 0);
11910                     }
11911                     if (lastbr) {
11912                         if (! REGTAIL(pRExC_state, lastbr, ender)) {
11913                             REQUIRE_BRANCHJ(flagp, 0);
11914                         }
11915                         if (! REGTAIL(pRExC_state,
11916                                       REGNODE_OFFSET(
11917                                                  NEXTOPER(
11918                                                  NEXTOPER(REGNODE_p(lastbr)))),
11919                                       ender))
11920                         {
11921                             REQUIRE_BRANCHJ(flagp, 0);
11922                         }
11923                     }
11924                     else
11925                         if (! REGTAIL(pRExC_state, ret, ender)) {
11926                             REQUIRE_BRANCHJ(flagp, 0);
11927                         }
11928 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
11929                     RExC_size++; /* XXX WHY do we need this?!!
11930                                     For large programs it seems to be required
11931                                     but I can't figure out why. -- dmq*/
11932 #endif
11933                     return ret;
11934                 }
11935                 RExC_parse += UTF
11936                               ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11937                               : 1;
11938                 vFAIL("Unknown switch condition (?(...))");
11939             }
11940             case '[':           /* (?[ ... ]) */
11941                 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
11942                                          oregcomp_parse);
11943             case 0: /* A NUL */
11944                 RExC_parse--; /* for vFAIL to print correctly */
11945                 vFAIL("Sequence (? incomplete");
11946                 break;
11947
11948             case ')':
11949                 if (RExC_strict) {  /* [perl #132851] */
11950                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
11951                 }
11952                 /* FALLTHROUGH */
11953             case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
11954             /* FALLTHROUGH */
11955             default: /* e.g., (?i) */
11956                 RExC_parse = (char *) seqstart + 1;
11957               parse_flags:
11958                 parse_lparen_question_flags(pRExC_state);
11959                 if (UCHARAT(RExC_parse) != ':') {
11960                     if (RExC_parse < RExC_end)
11961                         nextchar(pRExC_state);
11962                     *flagp = TRYAGAIN;
11963                     return 0;
11964                 }
11965                 paren = ':';
11966                 nextchar(pRExC_state);
11967                 ret = 0;
11968                 goto parse_rest;
11969             } /* end switch */
11970         }
11971         else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
11972           capturing_parens:
11973             parno = RExC_npar;
11974             RExC_npar++;
11975             if (! ALL_PARENS_COUNTED) {
11976                 /* If we are in our first pass through (and maybe only pass),
11977                  * we  need to allocate memory for the capturing parentheses
11978                  * data structures.
11979                  */
11980
11981                 if (!RExC_parens_buf_size) {
11982                     /* first guess at number of parens we might encounter */
11983                     RExC_parens_buf_size = 10;
11984
11985                     /* setup RExC_open_parens, which holds the address of each
11986                      * OPEN tag, and to make things simpler for the 0 index the
11987                      * start of the program - this is used later for offsets */
11988                     Newxz(RExC_open_parens, RExC_parens_buf_size,
11989                             regnode_offset);
11990                     RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
11991
11992                     /* setup RExC_close_parens, which holds the address of each
11993                      * CLOSE tag, and to make things simpler for the 0 index
11994                      * the end of the program - this is used later for offsets
11995                      * */
11996                     Newxz(RExC_close_parens, RExC_parens_buf_size,
11997                             regnode_offset);
11998                     /* we dont know where end op starts yet, so we dont need to
11999                      * set RExC_close_parens[0] like we do RExC_open_parens[0]
12000                      * above */
12001                 }
12002                 else if (RExC_npar > RExC_parens_buf_size) {
12003                     I32 old_size = RExC_parens_buf_size;
12004
12005                     RExC_parens_buf_size *= 2;
12006
12007                     Renew(RExC_open_parens, RExC_parens_buf_size,
12008                             regnode_offset);
12009                     Zero(RExC_open_parens + old_size,
12010                             RExC_parens_buf_size - old_size, regnode_offset);
12011
12012                     Renew(RExC_close_parens, RExC_parens_buf_size,
12013                             regnode_offset);
12014                     Zero(RExC_close_parens + old_size,
12015                             RExC_parens_buf_size - old_size, regnode_offset);
12016                 }
12017             }
12018
12019             ret = reganode(pRExC_state, OPEN, parno);
12020             if (!RExC_nestroot)
12021                 RExC_nestroot = parno;
12022             if (RExC_open_parens && !RExC_open_parens[parno])
12023             {
12024                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12025                     "%*s%*s Setting open paren #%" IVdf " to %d\n",
12026                     22, "|    |", (int)(depth * 2 + 1), "",
12027                     (IV)parno, ret));
12028                 RExC_open_parens[parno]= ret;
12029             }
12030
12031             Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12032             Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12033             is_open = 1;
12034         } else {
12035             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12036             paren = ':';
12037             ret = 0;
12038         }
12039     }
12040     else                        /* ! paren */
12041         ret = 0;
12042
12043    parse_rest:
12044     /* Pick up the branches, linking them together. */
12045     parse_start = RExC_parse;   /* MJD */
12046     br = regbranch(pRExC_state, &flags, 1, depth+1);
12047
12048     /*     branch_len = (paren != 0); */
12049
12050     if (br == 0) {
12051         RETURN_FAIL_ON_RESTART(flags, flagp);
12052         FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12053     }
12054     if (*RExC_parse == '|') {
12055         if (RExC_use_BRANCHJ) {
12056             reginsert(pRExC_state, BRANCHJ, br, depth+1);
12057         }
12058         else {                  /* MJD */
12059             reginsert(pRExC_state, BRANCH, br, depth+1);
12060             Set_Node_Length(REGNODE_p(br), paren != 0);
12061             Set_Node_Offset_To_R(br, parse_start-RExC_start);
12062         }
12063         have_branch = 1;
12064     }
12065     else if (paren == ':') {
12066         *flagp |= flags&SIMPLE;
12067     }
12068     if (is_open) {                              /* Starts with OPEN. */
12069         if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
12070             REQUIRE_BRANCHJ(flagp, 0);
12071         }
12072     }
12073     else if (paren != '?')              /* Not Conditional */
12074         ret = br;
12075     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12076     lastbr = br;
12077     while (*RExC_parse == '|') {
12078         if (RExC_use_BRANCHJ) {
12079             bool shut_gcc_up;
12080
12081             ender = reganode(pRExC_state, LONGJMP, 0);
12082
12083             /* Append to the previous. */
12084             shut_gcc_up = REGTAIL(pRExC_state,
12085                          REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12086                          ender);
12087             PERL_UNUSED_VAR(shut_gcc_up);
12088         }
12089         nextchar(pRExC_state);
12090         if (freeze_paren) {
12091             if (RExC_npar > after_freeze)
12092                 after_freeze = RExC_npar;
12093             RExC_npar = freeze_paren;
12094         }
12095         br = regbranch(pRExC_state, &flags, 0, depth+1);
12096
12097         if (br == 0) {
12098             RETURN_FAIL_ON_RESTART(flags, flagp);
12099             FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12100         }
12101         if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
12102             REQUIRE_BRANCHJ(flagp, 0);
12103         }
12104         lastbr = br;
12105         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12106     }
12107
12108     if (have_branch || paren != ':') {
12109         regnode * br;
12110
12111         /* Make a closing node, and hook it on the end. */
12112         switch (paren) {
12113         case ':':
12114             ender = reg_node(pRExC_state, TAIL);
12115             break;
12116         case 1: case 2:
12117             ender = reganode(pRExC_state, CLOSE, parno);
12118             if ( RExC_close_parens ) {
12119                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12120                         "%*s%*s Setting close paren #%" IVdf " to %d\n",
12121                         22, "|    |", (int)(depth * 2 + 1), "",
12122                         (IV)parno, ender));
12123                 RExC_close_parens[parno]= ender;
12124                 if (RExC_nestroot == parno)
12125                     RExC_nestroot = 0;
12126             }
12127             Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12128             Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12129             break;
12130         case 's':
12131             ender = reg_node(pRExC_state, SRCLOSE);
12132             RExC_in_script_run = 0;
12133             break;
12134         case '<':
12135         case 'a':
12136         case 'A':
12137         case 'b':
12138         case 'B':
12139         case ',':
12140         case '=':
12141         case '!':
12142             *flagp &= ~HASWIDTH;
12143             /* FALLTHROUGH */
12144         case 't':   /* aTomic */
12145         case '>':
12146             ender = reg_node(pRExC_state, SUCCEED);
12147             break;
12148         case 0:
12149             ender = reg_node(pRExC_state, END);
12150             assert(!RExC_end_op); /* there can only be one! */
12151             RExC_end_op = REGNODE_p(ender);
12152             if (RExC_close_parens) {
12153                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12154                     "%*s%*s Setting close paren #0 (END) to %d\n",
12155                     22, "|    |", (int)(depth * 2 + 1), "",
12156                     ender));
12157
12158                 RExC_close_parens[0]= ender;
12159             }
12160             break;
12161         }
12162         DEBUG_PARSE_r(
12163             DEBUG_PARSE_MSG("lsbr");
12164             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12165             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12166             Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12167                           SvPV_nolen_const(RExC_mysv1),
12168                           (IV)lastbr,
12169                           SvPV_nolen_const(RExC_mysv2),
12170                           (IV)ender,
12171                           (IV)(ender - lastbr)
12172             );
12173         );
12174         if (! REGTAIL(pRExC_state, lastbr, ender)) {
12175             REQUIRE_BRANCHJ(flagp, 0);
12176         }
12177
12178         if (have_branch) {
12179             char is_nothing= 1;
12180             if (depth==1)
12181                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12182
12183             /* Hook the tails of the branches to the closing node. */
12184             for (br = REGNODE_p(ret); br; br = regnext(br)) {
12185                 const U8 op = PL_regkind[OP(br)];
12186                 if (op == BRANCH) {
12187                     if (! REGTAIL_STUDY(pRExC_state,
12188                                         REGNODE_OFFSET(NEXTOPER(br)),
12189                                         ender))
12190                     {
12191                         REQUIRE_BRANCHJ(flagp, 0);
12192                     }
12193                     if ( OP(NEXTOPER(br)) != NOTHING
12194                          || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12195                         is_nothing= 0;
12196                 }
12197                 else if (op == BRANCHJ) {
12198                     bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12199                                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12200                                         ender);
12201                     PERL_UNUSED_VAR(shut_gcc_up);
12202                     /* for now we always disable this optimisation * /
12203                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12204                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12205                     */
12206                         is_nothing= 0;
12207                 }
12208             }
12209             if (is_nothing) {
12210                 regnode * ret_as_regnode = REGNODE_p(ret);
12211                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12212                                ? regnext(ret_as_regnode)
12213                                : ret_as_regnode;
12214                 DEBUG_PARSE_r(
12215                     DEBUG_PARSE_MSG("NADA");
12216                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12217                                      NULL, pRExC_state);
12218                     regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12219                                      NULL, pRExC_state);
12220                     Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12221                                   SvPV_nolen_const(RExC_mysv1),
12222                                   (IV)REG_NODE_NUM(ret_as_regnode),
12223                                   SvPV_nolen_const(RExC_mysv2),
12224                                   (IV)ender,
12225                                   (IV)(ender - ret)
12226                     );
12227                 );
12228                 OP(br)= NOTHING;
12229                 if (OP(REGNODE_p(ender)) == TAIL) {
12230                     NEXT_OFF(br)= 0;
12231                     RExC_emit= REGNODE_OFFSET(br) + 1;
12232                 } else {
12233                     regnode *opt;
12234                     for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12235                         OP(opt)= OPTIMIZED;
12236                     NEXT_OFF(br)= REGNODE_p(ender) - br;
12237                 }
12238             }
12239         }
12240     }
12241
12242     {
12243         const char *p;
12244          /* Even/odd or x=don't care: 010101x10x */
12245         static const char parens[] = "=!aA<,>Bbt";
12246          /* flag below is set to 0 up through 'A'; 1 for larger */
12247
12248         if (paren && (p = strchr(parens, paren))) {
12249             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12250             int flag = (p - parens) > 3;
12251
12252             if (paren == '>' || paren == 't') {
12253                 node = SUSPEND, flag = 0;
12254             }
12255
12256             reginsert(pRExC_state, node, ret, depth+1);
12257             Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12258             Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12259             FLAGS(REGNODE_p(ret)) = flag;
12260             if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12261             {
12262                 REQUIRE_BRANCHJ(flagp, 0);
12263             }
12264         }
12265     }
12266
12267     /* Check for proper termination. */
12268     if (paren) {
12269         /* restore original flags, but keep (?p) and, if we've encountered
12270          * something in the parse that changes /d rules into /u, keep the /u */
12271         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12272         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12273             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12274         }
12275         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12276             RExC_parse = oregcomp_parse;
12277             vFAIL("Unmatched (");
12278         }
12279         nextchar(pRExC_state);
12280     }
12281     else if (!paren && RExC_parse < RExC_end) {
12282         if (*RExC_parse == ')') {
12283             RExC_parse++;
12284             vFAIL("Unmatched )");
12285         }
12286         else
12287             FAIL("Junk on end of regexp");      /* "Can't happen". */
12288         NOT_REACHED; /* NOTREACHED */
12289     }
12290
12291     if (RExC_in_lookbehind) {
12292         RExC_in_lookbehind--;
12293     }
12294     if (RExC_in_lookahead) {
12295         RExC_in_lookahead--;
12296     }
12297     if (after_freeze > RExC_npar)
12298         RExC_npar = after_freeze;
12299     return(ret);
12300 }
12301
12302 /*
12303  - regbranch - one alternative of an | operator
12304  *
12305  * Implements the concatenation operator.
12306  *
12307  * On success, returns the offset at which any next node should be placed into
12308  * the regex engine program being compiled.
12309  *
12310  * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12311  * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12312  * UTF-8
12313  */
12314 STATIC regnode_offset
12315 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12316 {
12317     regnode_offset ret;
12318     regnode_offset chain = 0;
12319     regnode_offset latest;
12320     I32 flags = 0, c = 0;
12321     GET_RE_DEBUG_FLAGS_DECL;
12322
12323     PERL_ARGS_ASSERT_REGBRANCH;
12324
12325     DEBUG_PARSE("brnc");
12326
12327     if (first)
12328         ret = 0;
12329     else {
12330         if (RExC_use_BRANCHJ)
12331             ret = reganode(pRExC_state, BRANCHJ, 0);
12332         else {
12333             ret = reg_node(pRExC_state, BRANCH);
12334             Set_Node_Length(REGNODE_p(ret), 1);
12335         }
12336     }
12337
12338     *flagp = WORST;                     /* Tentatively. */
12339
12340     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12341                             FALSE /* Don't force to /x */ );
12342     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12343         flags &= ~TRYAGAIN;
12344         latest = regpiece(pRExC_state, &flags, depth+1);
12345         if (latest == 0) {
12346             if (flags & TRYAGAIN)
12347                 continue;
12348             RETURN_FAIL_ON_RESTART(flags, flagp);
12349             FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12350         }
12351         else if (ret == 0)
12352             ret = latest;
12353         *flagp |= flags&(HASWIDTH|POSTPONED);
12354         if (chain == 0)         /* First piece. */
12355             *flagp |= flags&SPSTART;
12356         else {
12357             /* FIXME adding one for every branch after the first is probably
12358              * excessive now we have TRIE support. (hv) */
12359             MARK_NAUGHTY(1);
12360             if (! REGTAIL(pRExC_state, chain, latest)) {
12361                 /* XXX We could just redo this branch, but figuring out what
12362                  * bookkeeping needs to be reset is a pain, and it's likely
12363                  * that other branches that goto END will also be too large */
12364                 REQUIRE_BRANCHJ(flagp, 0);
12365             }
12366         }
12367         chain = latest;
12368         c++;
12369     }
12370     if (chain == 0) {   /* Loop ran zero times. */
12371         chain = reg_node(pRExC_state, NOTHING);
12372         if (ret == 0)
12373             ret = chain;
12374     }
12375     if (c == 1) {
12376         *flagp |= flags&SIMPLE;
12377     }
12378
12379     return ret;
12380 }
12381
12382 /*
12383  - regpiece - something followed by possible quantifier * + ? {n,m}
12384  *
12385  * Note that the branching code sequences used for ? and the general cases
12386  * of * and + are somewhat optimized:  they use the same NOTHING node as
12387  * both the endmarker for their branch list and the body of the last branch.
12388  * It might seem that this node could be dispensed with entirely, but the
12389  * endmarker role is not redundant.
12390  *
12391  * On success, returns the offset at which any next node should be placed into
12392  * the regex engine program being compiled.
12393  *
12394  * Returns 0 otherwise, with *flagp set to indicate why:
12395  *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
12396  *  RESTART_PARSE   if the parse needs to be restarted, or'd with
12397  *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12398  */
12399 STATIC regnode_offset
12400 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12401 {
12402     regnode_offset ret;
12403     char op;
12404     char *next;
12405     I32 flags;
12406     const char * const origparse = RExC_parse;
12407     I32 min;
12408     I32 max = REG_INFTY;
12409 #ifdef RE_TRACK_PATTERN_OFFSETS
12410     char *parse_start;
12411 #endif
12412     const char *maxpos = NULL;
12413     UV uv;
12414
12415     /* Save the original in case we change the emitted regop to a FAIL. */
12416     const regnode_offset orig_emit = RExC_emit;
12417
12418     GET_RE_DEBUG_FLAGS_DECL;
12419
12420     PERL_ARGS_ASSERT_REGPIECE;
12421
12422     DEBUG_PARSE("piec");
12423
12424     ret = regatom(pRExC_state, &flags, depth+1);
12425     if (ret == 0) {
12426         RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12427         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12428     }
12429
12430     op = *RExC_parse;
12431
12432     if (op == '{' && regcurly(RExC_parse)) {
12433         maxpos = NULL;
12434 #ifdef RE_TRACK_PATTERN_OFFSETS
12435         parse_start = RExC_parse; /* MJD */
12436 #endif
12437         next = RExC_parse + 1;
12438         while (isDIGIT(*next) || *next == ',') {
12439             if (*next == ',') {
12440                 if (maxpos)
12441                     break;
12442                 else
12443                     maxpos = next;
12444             }
12445             next++;
12446         }
12447         if (*next == '}') {             /* got one */
12448             const char* endptr;
12449             if (!maxpos)
12450                 maxpos = next;
12451             RExC_parse++;
12452             if (isDIGIT(*RExC_parse)) {
12453                 endptr = RExC_end;
12454                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12455                     vFAIL("Invalid quantifier in {,}");
12456                 if (uv >= REG_INFTY)
12457                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12458                 min = (I32)uv;
12459             } else {
12460                 min = 0;
12461             }
12462             if (*maxpos == ',')
12463                 maxpos++;
12464             else
12465                 maxpos = RExC_parse;
12466             if (isDIGIT(*maxpos)) {
12467                 endptr = RExC_end;
12468                 if (!grok_atoUV(maxpos, &uv, &endptr))
12469                     vFAIL("Invalid quantifier in {,}");
12470                 if (uv >= REG_INFTY)
12471                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12472                 max = (I32)uv;
12473             } else {
12474                 max = REG_INFTY;                /* meaning "infinity" */
12475             }
12476             RExC_parse = next;
12477             nextchar(pRExC_state);
12478             if (max < min) {    /* If can't match, warn and optimize to fail
12479                                    unconditionally */
12480                 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12481                 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12482                 NEXT_OFF(REGNODE_p(orig_emit)) =
12483                                     regarglen[OPFAIL] + NODE_STEP_REGNODE;
12484                 return ret;
12485             }
12486             else if (min == max && *RExC_parse == '?')
12487             {
12488                 ckWARN2reg(RExC_parse + 1,
12489                            "Useless use of greediness modifier '%c'",
12490                            *RExC_parse);
12491             }
12492
12493           do_curly:
12494             if ((flags&SIMPLE)) {
12495                 if (min == 0 && max == REG_INFTY) {
12496                     reginsert(pRExC_state, STAR, ret, depth+1);
12497                     MARK_NAUGHTY(4);
12498                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12499                     goto nest_check;
12500                 }
12501                 if (min == 1 && max == REG_INFTY) {
12502                     reginsert(pRExC_state, PLUS, ret, depth+1);
12503                     MARK_NAUGHTY(3);
12504                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12505                     goto nest_check;
12506                 }
12507                 MARK_NAUGHTY_EXP(2, 2);
12508                 reginsert(pRExC_state, CURLY, ret, depth+1);
12509                 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12510                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12511             }
12512             else {
12513                 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12514
12515                 FLAGS(REGNODE_p(w)) = 0;
12516                 if (!  REGTAIL(pRExC_state, ret, w)) {
12517                     REQUIRE_BRANCHJ(flagp, 0);
12518                 }
12519                 if (RExC_use_BRANCHJ) {
12520                     reginsert(pRExC_state, LONGJMP, ret, depth+1);
12521                     reginsert(pRExC_state, NOTHING, ret, depth+1);
12522                     NEXT_OFF(REGNODE_p(ret)) = 3;       /* Go over LONGJMP. */
12523                 }
12524                 reginsert(pRExC_state, CURLYX, ret, depth+1);
12525                                 /* MJD hk */
12526                 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12527                 Set_Node_Length(REGNODE_p(ret),
12528                                 op == '{' ? (RExC_parse - parse_start) : 1);
12529
12530                 if (RExC_use_BRANCHJ)
12531                     NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
12532                                                        LONGJMP. */
12533                 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12534                                                           NOTHING)))
12535                 {
12536                     REQUIRE_BRANCHJ(flagp, 0);
12537                 }
12538                 RExC_whilem_seen++;
12539                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
12540             }
12541             FLAGS(REGNODE_p(ret)) = 0;
12542
12543             if (min > 0)
12544                 *flagp = WORST;
12545             if (max > 0)
12546                 *flagp |= HASWIDTH;
12547             ARG1_SET(REGNODE_p(ret), (U16)min);
12548             ARG2_SET(REGNODE_p(ret), (U16)max);
12549             if (max == REG_INFTY)
12550                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12551
12552             goto nest_check;
12553         }
12554     }
12555
12556     if (!ISMULT1(op)) {
12557         *flagp = flags;
12558         return(ret);
12559     }
12560
12561 #if 0                           /* Now runtime fix should be reliable. */
12562
12563     /* if this is reinstated, don't forget to put this back into perldiag:
12564
12565             =item Regexp *+ operand could be empty at {#} in regex m/%s/
12566
12567            (F) The part of the regexp subject to either the * or + quantifier
12568            could match an empty string. The {#} shows in the regular
12569            expression about where the problem was discovered.
12570
12571     */
12572
12573     if (!(flags&HASWIDTH) && op != '?')
12574       vFAIL("Regexp *+ operand could be empty");
12575 #endif
12576
12577 #ifdef RE_TRACK_PATTERN_OFFSETS
12578     parse_start = RExC_parse;
12579 #endif
12580     nextchar(pRExC_state);
12581
12582     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12583
12584     if (op == '*') {
12585         min = 0;
12586         goto do_curly;
12587     }
12588     else if (op == '+') {
12589         min = 1;
12590         goto do_curly;
12591     }
12592     else if (op == '?') {
12593         min = 0; max = 1;
12594         goto do_curly;
12595     }
12596   nest_check:
12597     if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12598         ckWARN2reg(RExC_parse,
12599                    "%" UTF8f " matches null string many times",
12600                    UTF8fARG(UTF, (RExC_parse >= origparse
12601                                  ? RExC_parse - origparse
12602                                  : 0),
12603                    origparse));
12604     }
12605
12606     if (*RExC_parse == '?') {
12607         nextchar(pRExC_state);
12608         reginsert(pRExC_state, MINMOD, ret, depth+1);
12609         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12610             REQUIRE_BRANCHJ(flagp, 0);
12611         }
12612     }
12613     else if (*RExC_parse == '+') {
12614         regnode_offset ender;
12615         nextchar(pRExC_state);
12616         ender = reg_node(pRExC_state, SUCCEED);
12617         if (! REGTAIL(pRExC_state, ret, ender)) {
12618             REQUIRE_BRANCHJ(flagp, 0);
12619         }
12620         reginsert(pRExC_state, SUSPEND, ret, depth+1);
12621         ender = reg_node(pRExC_state, TAIL);
12622         if (! REGTAIL(pRExC_state, ret, ender)) {
12623             REQUIRE_BRANCHJ(flagp, 0);
12624         }
12625     }
12626
12627     if (ISMULT2(RExC_parse)) {
12628         RExC_parse++;
12629         vFAIL("Nested quantifiers");
12630     }
12631
12632     return(ret);
12633 }
12634
12635 STATIC bool
12636 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12637                 regnode_offset * node_p,
12638                 UV * code_point_p,
12639                 int * cp_count,
12640                 I32 * flagp,
12641                 const bool strict,
12642                 const U32 depth
12643     )
12644 {
12645  /* This routine teases apart the various meanings of \N and returns
12646   * accordingly.  The input parameters constrain which meaning(s) is/are valid
12647   * in the current context.
12648   *
12649   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12650   *
12651   * If <code_point_p> is not NULL, the context is expecting the result to be a
12652   * single code point.  If this \N instance turns out to a single code point,
12653   * the function returns TRUE and sets *code_point_p to that code point.
12654   *
12655   * If <node_p> is not NULL, the context is expecting the result to be one of
12656   * the things representable by a regnode.  If this \N instance turns out to be
12657   * one such, the function generates the regnode, returns TRUE and sets *node_p
12658   * to point to the offset of that regnode into the regex engine program being
12659   * compiled.
12660   *
12661   * If this instance of \N isn't legal in any context, this function will
12662   * generate a fatal error and not return.
12663   *
12664   * On input, RExC_parse should point to the first char following the \N at the
12665   * time of the call.  On successful return, RExC_parse will have been updated
12666   * to point to just after the sequence identified by this routine.  Also
12667   * *flagp has been updated as needed.
12668   *
12669   * When there is some problem with the current context and this \N instance,
12670   * the function returns FALSE, without advancing RExC_parse, nor setting
12671   * *node_p, nor *code_point_p, nor *flagp.
12672   *
12673   * If <cp_count> is not NULL, the caller wants to know the length (in code
12674   * points) that this \N sequence matches.  This is set, and the input is
12675   * parsed for errors, even if the function returns FALSE, as detailed below.
12676   *
12677   * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12678   *
12679   * Probably the most common case is for the \N to specify a single code point.
12680   * *cp_count will be set to 1, and *code_point_p will be set to that code
12681   * point.
12682   *
12683   * Another possibility is for the input to be an empty \N{}.  This is no
12684   * longer accepted, and will generate a fatal error.
12685   *
12686   * Another possibility is for a custom charnames handler to be in effect which
12687   * translates the input name to an empty string.  *cp_count will be set to 0.
12688   * *node_p will be set to a generated NOTHING node.
12689   *
12690   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12691   * set to 0. *node_p will be set to a generated REG_ANY node.
12692   *
12693   * The fifth possibility is that \N resolves to a sequence of more than one
12694   * code points.  *cp_count will be set to the number of code points in the
12695   * sequence. *node_p will be set to a generated node returned by this
12696   * function calling S_reg().
12697   *
12698   * The final possibility is that it is premature to be calling this function;
12699   * the parse needs to be restarted.  This can happen when this changes from
12700   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
12701   * latter occurs only when the fifth possibility would otherwise be in
12702   * effect, and is because one of those code points requires the pattern to be
12703   * recompiled as UTF-8.  The function returns FALSE, and sets the
12704   * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
12705   * happens, the caller needs to desist from continuing parsing, and return
12706   * this information to its caller.  This is not set for when there is only one
12707   * code point, as this can be called as part of an ANYOF node, and they can
12708   * store above-Latin1 code points without the pattern having to be in UTF-8.
12709   *
12710   * For non-single-quoted regexes, the tokenizer has resolved character and
12711   * sequence names inside \N{...} into their Unicode values, normalizing the
12712   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12713   * hex-represented code points in the sequence.  This is done there because
12714   * the names can vary based on what charnames pragma is in scope at the time,
12715   * so we need a way to take a snapshot of what they resolve to at the time of
12716   * the original parse. [perl #56444].
12717   *
12718   * That parsing is skipped for single-quoted regexes, so here we may get
12719   * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
12720   * like '\N{U+41}', that code point is Unicode, and has to be translated into
12721   * the native character set for non-ASCII platforms.  The other possibilities
12722   * are already native, so no translation is done. */
12723
12724     char * endbrace;    /* points to '}' following the name */
12725     char* p = RExC_parse; /* Temporary */
12726
12727     SV * substitute_parse = NULL;
12728     char *orig_end;
12729     char *save_start;
12730     I32 flags;
12731
12732     GET_RE_DEBUG_FLAGS_DECL;
12733
12734     PERL_ARGS_ASSERT_GROK_BSLASH_N;
12735
12736     GET_RE_DEBUG_FLAGS;
12737
12738     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
12739     assert(! (node_p && cp_count));               /* At most 1 should be set */
12740
12741     if (cp_count) {     /* Initialize return for the most common case */
12742         *cp_count = 1;
12743     }
12744
12745     /* The [^\n] meaning of \N ignores spaces and comments under the /x
12746      * modifier.  The other meanings do not, so use a temporary until we find
12747      * out which we are being called with */
12748     skip_to_be_ignored_text(pRExC_state, &p,
12749                             FALSE /* Don't force to /x */ );
12750
12751     /* Disambiguate between \N meaning a named character versus \N meaning
12752      * [^\n].  The latter is assumed when the {...} following the \N is a legal
12753      * quantifier, or if there is no '{' at all */
12754     if (*p != '{' || regcurly(p)) {
12755         RExC_parse = p;
12756         if (cp_count) {
12757             *cp_count = -1;
12758         }
12759
12760         if (! node_p) {
12761             return FALSE;
12762         }
12763
12764         *node_p = reg_node(pRExC_state, REG_ANY);
12765         *flagp |= HASWIDTH|SIMPLE;
12766         MARK_NAUGHTY(1);
12767         Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12768         return TRUE;
12769     }
12770
12771     /* The test above made sure that the next real character is a '{', but
12772      * under the /x modifier, it could be separated by space (or a comment and
12773      * \n) and this is not allowed (for consistency with \x{...} and the
12774      * tokenizer handling of \N{NAME}). */
12775     if (*RExC_parse != '{') {
12776         vFAIL("Missing braces on \\N{}");
12777     }
12778
12779     RExC_parse++;       /* Skip past the '{' */
12780
12781     endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12782     if (! endbrace) { /* no trailing brace */
12783         vFAIL2("Missing right brace on \\%c{}", 'N');
12784     }
12785
12786     /* Here, we have decided it should be a named character or sequence.  These
12787      * imply Unicode semantics */
12788     REQUIRE_UNI_RULES(flagp, FALSE);
12789
12790     /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
12791      * nothing at all (not allowed under strict) */
12792     if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
12793         RExC_parse = endbrace;
12794         if (strict) {
12795             RExC_parse++;   /* Position after the "}" */
12796             vFAIL("Zero length \\N{}");
12797         }
12798
12799         if (cp_count) {
12800             *cp_count = 0;
12801         }
12802         nextchar(pRExC_state);
12803         if (! node_p) {
12804             return FALSE;
12805         }
12806
12807         *node_p = reg_node(pRExC_state, NOTHING);
12808         return TRUE;
12809     }
12810
12811     if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
12812
12813         /* Here, the name isn't of the form  U+....  This can happen if the
12814          * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
12815          * is the time to find out what the name means */
12816
12817         const STRLEN name_len = endbrace - RExC_parse;
12818         SV *  value_sv;     /* What does this name evaluate to */
12819         SV ** value_svp;
12820         const U8 * value;   /* string of name's value */
12821         STRLEN value_len;   /* and its length */
12822
12823         /*  RExC_unlexed_names is a hash of names that weren't evaluated by
12824          *  toke.c, and their values. Make sure is initialized */
12825         if (! RExC_unlexed_names) {
12826             RExC_unlexed_names = newHV();
12827         }
12828
12829         /* If we have already seen this name in this pattern, use that.  This
12830          * allows us to only call the charnames handler once per name per
12831          * pattern.  A broken or malicious handler could return something
12832          * different each time, which could cause the results to vary depending
12833          * on if something gets added or subtracted from the pattern that
12834          * causes the number of passes to change, for example */
12835         if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
12836                                                       name_len, 0)))
12837         {
12838             value_sv = *value_svp;
12839         }
12840         else { /* Otherwise we have to go out and get the name */
12841             const char * error_msg = NULL;
12842             value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
12843                                                       UTF,
12844                                                       &error_msg);
12845             if (error_msg) {
12846                 RExC_parse = endbrace;
12847                 vFAIL(error_msg);
12848             }
12849
12850             /* If no error message, should have gotten a valid return */
12851             assert (value_sv);
12852
12853             /* Save the name's meaning for later use */
12854             if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
12855                            value_sv, 0))
12856             {
12857                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
12858             }
12859         }
12860
12861         /* Here, we have the value the name evaluates to in 'value_sv' */
12862         value = (U8 *) SvPV(value_sv, value_len);
12863
12864         /* See if the result is one code point vs 0 or multiple */
12865         if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
12866                                   ? UTF8SKIP(value)
12867                                   : 1)))
12868         {
12869             /* Here, exactly one code point.  If that isn't what is wanted,
12870              * fail */
12871             if (! code_point_p) {
12872                 RExC_parse = p;
12873                 return FALSE;
12874             }
12875
12876             /* Convert from string to numeric code point */
12877             *code_point_p = (SvUTF8(value_sv))
12878                             ? valid_utf8_to_uvchr(value, NULL)
12879                             : *value;
12880
12881             /* Have parsed this entire single code point \N{...}.  *cp_count
12882              * has already been set to 1, so don't do it again. */
12883             RExC_parse = endbrace;
12884             nextchar(pRExC_state);
12885             return TRUE;
12886         } /* End of is a single code point */
12887
12888         /* Count the code points, if caller desires.  The API says to do this
12889          * even if we will later return FALSE */
12890         if (cp_count) {
12891             *cp_count = 0;
12892
12893             *cp_count = (SvUTF8(value_sv))
12894                         ? utf8_length(value, value + value_len)
12895                         : value_len;
12896         }
12897
12898         /* Fail if caller doesn't want to handle a multi-code-point sequence.
12899          * But don't back the pointer up if the caller wants to know how many
12900          * code points there are (they need to handle it themselves in this
12901          * case).  */
12902         if (! node_p) {
12903             if (! cp_count) {
12904                 RExC_parse = p;
12905             }
12906             return FALSE;
12907         }
12908
12909         /* Convert this to a sub-pattern of the form "(?: ... )", and then call
12910          * reg recursively to parse it.  That way, it retains its atomicness,
12911          * while not having to worry about any special handling that some code
12912          * points may have. */
12913
12914         substitute_parse = newSVpvs("?:");
12915         sv_catsv(substitute_parse, value_sv);
12916         sv_catpv(substitute_parse, ")");
12917
12918         /* The value should already be native, so no need to convert on EBCDIC
12919          * platforms.*/
12920         assert(! RExC_recode_x_to_native);
12921
12922     }
12923     else {   /* \N{U+...} */
12924         Size_t count = 0;   /* code point count kept internally */
12925
12926         /* We can get to here when the input is \N{U+...} or when toke.c has
12927          * converted a name to the \N{U+...} form.  This include changing a
12928          * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
12929
12930         RExC_parse += 2;    /* Skip past the 'U+' */
12931
12932         /* Code points are separated by dots.  The '}' terminates the whole
12933          * thing. */
12934
12935         do {    /* Loop until the ending brace */
12936             UV cp = 0;
12937             char * start_digit;     /* The first of the current code point */
12938             if (! isXDIGIT(*RExC_parse)) {
12939                 RExC_parse++;
12940                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12941             }
12942
12943             start_digit = RExC_parse;
12944             count++;
12945
12946             /* Loop through the hex digits of the current code point */
12947             do {
12948                 /* Adding this digit will shift the result 4 bits.  If that
12949                  * result would be above the legal max, it's overflow */
12950                 if (cp > MAX_LEGAL_CP >> 4) {
12951
12952                     /* Find the end of the code point */
12953                     do {
12954                         RExC_parse ++;
12955                     } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
12956
12957                     /* Be sure to synchronize this message with the similar one
12958                      * in utf8.c */
12959                     vFAIL4("Use of code point 0x%.*s is not allowed; the"
12960                         " permissible max is 0x%" UVxf,
12961                         (int) (RExC_parse - start_digit), start_digit,
12962                         MAX_LEGAL_CP);
12963                 }
12964
12965                 /* Accumulate this (valid) digit into the running total */
12966                 cp  = (cp << 4) + READ_XDIGIT(RExC_parse);
12967
12968                 /* READ_XDIGIT advanced the input pointer.  Ignore a single
12969                  * underscore separator */
12970                 if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
12971                     RExC_parse++;
12972                 }
12973             } while (isXDIGIT(*RExC_parse));
12974
12975             /* Here, have accumulated the next code point */
12976             if (RExC_parse >= endbrace) {   /* If done ... */
12977                 if (count != 1) {
12978                     goto do_concat;
12979                 }
12980
12981                 /* Here, is a single code point; fail if doesn't want that */
12982                 if (! code_point_p) {
12983                     RExC_parse = p;
12984                     return FALSE;
12985                 }
12986
12987                 /* A single code point is easy to handle; just return it */
12988                 *code_point_p = UNI_TO_NATIVE(cp);
12989                 RExC_parse = endbrace;
12990                 nextchar(pRExC_state);
12991                 return TRUE;
12992             }
12993
12994             /* Here, the only legal thing would be a multiple character
12995              * sequence (of the form "\N{U+c1.c2. ... }".   So the next
12996              * character must be a dot (and the one after that can't be the
12997              * endbrace, or we'd have something like \N{U+100.} ) */
12998             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
12999                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
13000                                 ? UTF8SKIP(RExC_parse)
13001                                 : 1;
13002                 if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
13003                     RExC_parse = endbrace;
13004                 }
13005                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13006             }
13007
13008             /* Here, looks like its really a multiple character sequence.  Fail
13009              * if that's not what the caller wants.  But continue with counting
13010              * and error checking if they still want a count */
13011             if (! node_p && ! cp_count) {
13012                 return FALSE;
13013             }
13014
13015             /* What is done here is to convert this to a sub-pattern of the
13016              * form \x{char1}\x{char2}...  and then call reg recursively to
13017              * parse it (enclosing in "(?: ... )" ).  That way, it retains its
13018              * atomicness, while not having to worry about special handling
13019              * that some code points may have.  We don't create a subpattern,
13020              * but go through the motions of code point counting and error
13021              * checking, if the caller doesn't want a node returned. */
13022
13023             if (node_p && count == 1) {
13024                 substitute_parse = newSVpvs("?:");
13025             }
13026
13027           do_concat:
13028
13029             if (node_p) {
13030                 /* Convert to notation the rest of the code understands */
13031                 sv_catpvs(substitute_parse, "\\x{");
13032                 sv_catpvn(substitute_parse, start_digit,
13033                                             RExC_parse - start_digit);
13034                 sv_catpvs(substitute_parse, "}");
13035             }
13036
13037             /* Move to after the dot (or ending brace the final time through.)
13038              * */
13039             RExC_parse++;
13040             count++;
13041
13042         } while (RExC_parse < endbrace);
13043
13044         if (! node_p) { /* Doesn't want the node */
13045             assert (cp_count);
13046
13047             *cp_count = count;
13048             return FALSE;
13049         }
13050
13051         sv_catpvs(substitute_parse, ")");
13052
13053         /* The values are Unicode, and therefore have to be converted to native
13054          * on a non-Unicode (meaning non-ASCII) platform. */
13055         SET_recode_x_to_native(1);
13056     }
13057
13058     /* Here, we have the string the name evaluates to, ready to be parsed,
13059      * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13060      * constructs.  This can be called from within a substitute parse already.
13061      * The error reporting mechanism doesn't work for 2 levels of this, but the
13062      * code above has validated this new construct, so there should be no
13063      * errors generated by the below.  And this isn' an exact copy, so the
13064      * mechanism to seamlessly deal with this won't work, so turn off warnings
13065      * during it */
13066     save_start = RExC_start;
13067     orig_end = RExC_end;
13068
13069     RExC_parse = RExC_start = SvPVX(substitute_parse);
13070     RExC_end = RExC_parse + SvCUR(substitute_parse);
13071     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13072
13073     *node_p = reg(pRExC_state, 1, &flags, depth+1);
13074
13075     /* Restore the saved values */
13076     RESTORE_WARNINGS;
13077     RExC_start = save_start;
13078     RExC_parse = endbrace;
13079     RExC_end = orig_end;
13080     SET_recode_x_to_native(0);
13081
13082     SvREFCNT_dec_NN(substitute_parse);
13083
13084     if (! *node_p) {
13085         RETURN_FAIL_ON_RESTART(flags, flagp);
13086         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13087             (UV) flags);
13088     }
13089     *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13090
13091     nextchar(pRExC_state);
13092
13093     return TRUE;
13094 }
13095
13096
13097 PERL_STATIC_INLINE U8
13098 S_compute_EXACTish(RExC_state_t *pRExC_state)
13099 {
13100     U8 op;
13101
13102     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13103
13104     if (! FOLD) {
13105         return (LOC)
13106                 ? EXACTL
13107                 : EXACT;
13108     }
13109
13110     op = get_regex_charset(RExC_flags);
13111     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13112         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13113                  been, so there is no hole */
13114     }
13115
13116     return op + EXACTF;
13117 }
13118
13119 STATIC bool
13120 S_new_regcurly(const char *s, const char *e)
13121 {
13122     /* This is a temporary function designed to match the most lenient form of
13123      * a {m,n} quantifier we ever envision, with either number omitted, and
13124      * spaces anywhere between/before/after them.
13125      *
13126      * If this function fails, then the string it matches is very unlikely to
13127      * ever be considered a valid quantifier, so we can allow the '{' that
13128      * begins it to be considered as a literal */
13129
13130     bool has_min = FALSE;
13131     bool has_max = FALSE;
13132
13133     PERL_ARGS_ASSERT_NEW_REGCURLY;
13134
13135     if (s >= e || *s++ != '{')
13136         return FALSE;
13137
13138     while (s < e && isSPACE(*s)) {
13139         s++;
13140     }
13141     while (s < e && isDIGIT(*s)) {
13142         has_min = TRUE;
13143         s++;
13144     }
13145     while (s < e && isSPACE(*s)) {
13146         s++;
13147     }
13148
13149     if (*s == ',') {
13150         s++;
13151         while (s < e && isSPACE(*s)) {
13152             s++;
13153         }
13154         while (s < e && isDIGIT(*s)) {
13155             has_max = TRUE;
13156             s++;
13157         }
13158         while (s < e && isSPACE(*s)) {
13159             s++;
13160         }
13161     }
13162
13163     return s < e && *s == '}' && (has_min || has_max);
13164 }
13165
13166 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13167  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13168
13169 static I32
13170 S_backref_value(char *p, char *e)
13171 {
13172     const char* endptr = e;
13173     UV val;
13174     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13175         return (I32)val;
13176     return I32_MAX;
13177 }
13178
13179
13180 /*
13181  - regatom - the lowest level
13182
13183    Try to identify anything special at the start of the current parse position.
13184    If there is, then handle it as required. This may involve generating a
13185    single regop, such as for an assertion; or it may involve recursing, such as
13186    to handle a () structure.
13187
13188    If the string doesn't start with something special then we gobble up
13189    as much literal text as we can.  If we encounter a quantifier, we have to
13190    back off the final literal character, as that quantifier applies to just it
13191    and not to the whole string of literals.
13192
13193    Once we have been able to handle whatever type of thing started the
13194    sequence, we return the offset into the regex engine program being compiled
13195    at which any  next regnode should be placed.
13196
13197    Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13198    Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13199    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13200    Otherwise does not return 0.
13201
13202    Note: we have to be careful with escapes, as they can be both literal
13203    and special, and in the case of \10 and friends, context determines which.
13204
13205    A summary of the code structure is:
13206
13207    switch (first_byte) {
13208         cases for each special:
13209             handle this special;
13210             break;
13211         case '\\':
13212             switch (2nd byte) {
13213                 cases for each unambiguous special:
13214                     handle this special;
13215                     break;
13216                 cases for each ambigous special/literal:
13217                     disambiguate;
13218                     if (special)  handle here
13219                     else goto defchar;
13220                 default: // unambiguously literal:
13221                     goto defchar;
13222             }
13223         default:  // is a literal char
13224             // FALL THROUGH
13225         defchar:
13226             create EXACTish node for literal;
13227             while (more input and node isn't full) {
13228                 switch (input_byte) {
13229                    cases for each special;
13230                        make sure parse pointer is set so that the next call to
13231                            regatom will see this special first
13232                        goto loopdone; // EXACTish node terminated by prev. char
13233                    default:
13234                        append char to EXACTISH node;
13235                 }
13236                 get next input byte;
13237             }
13238         loopdone:
13239    }
13240    return the generated node;
13241
13242    Specifically there are two separate switches for handling
13243    escape sequences, with the one for handling literal escapes requiring
13244    a dummy entry for all of the special escapes that are actually handled
13245    by the other.
13246
13247 */
13248
13249 STATIC regnode_offset
13250 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13251 {
13252     dVAR;
13253     regnode_offset ret = 0;
13254     I32 flags = 0;
13255     char *parse_start;
13256     U8 op;
13257     int invert = 0;
13258
13259     GET_RE_DEBUG_FLAGS_DECL;
13260
13261     *flagp = WORST;             /* Tentatively. */
13262
13263     DEBUG_PARSE("atom");
13264
13265     PERL_ARGS_ASSERT_REGATOM;
13266
13267   tryagain:
13268     parse_start = RExC_parse;
13269     assert(RExC_parse < RExC_end);
13270     switch ((U8)*RExC_parse) {
13271     case '^':
13272         RExC_seen_zerolen++;
13273         nextchar(pRExC_state);
13274         if (RExC_flags & RXf_PMf_MULTILINE)
13275             ret = reg_node(pRExC_state, MBOL);
13276         else
13277             ret = reg_node(pRExC_state, SBOL);
13278         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13279         break;
13280     case '$':
13281         nextchar(pRExC_state);
13282         if (*RExC_parse)
13283             RExC_seen_zerolen++;
13284         if (RExC_flags & RXf_PMf_MULTILINE)
13285             ret = reg_node(pRExC_state, MEOL);
13286         else
13287             ret = reg_node(pRExC_state, SEOL);
13288         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13289         break;
13290     case '.':
13291         nextchar(pRExC_state);
13292         if (RExC_flags & RXf_PMf_SINGLELINE)
13293             ret = reg_node(pRExC_state, SANY);
13294         else
13295             ret = reg_node(pRExC_state, REG_ANY);
13296         *flagp |= HASWIDTH|SIMPLE;
13297         MARK_NAUGHTY(1);
13298         Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13299         break;
13300     case '[':
13301     {
13302         char * const oregcomp_parse = ++RExC_parse;
13303         ret = regclass(pRExC_state, flagp, depth+1,
13304                        FALSE, /* means parse the whole char class */
13305                        TRUE, /* allow multi-char folds */
13306                        FALSE, /* don't silence non-portable warnings. */
13307                        (bool) RExC_strict,
13308                        TRUE, /* Allow an optimized regnode result */
13309                        NULL);
13310         if (ret == 0) {
13311             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13312             FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13313                   (UV) *flagp);
13314         }
13315         if (*RExC_parse != ']') {
13316             RExC_parse = oregcomp_parse;
13317             vFAIL("Unmatched [");
13318         }
13319         nextchar(pRExC_state);
13320         Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13321         break;
13322     }
13323     case '(':
13324         nextchar(pRExC_state);
13325         ret = reg(pRExC_state, 2, &flags, depth+1);
13326         if (ret == 0) {
13327                 if (flags & TRYAGAIN) {
13328                     if (RExC_parse >= RExC_end) {
13329                          /* Make parent create an empty node if needed. */
13330                         *flagp |= TRYAGAIN;
13331                         return(0);
13332                     }
13333                     goto tryagain;
13334                 }
13335                 RETURN_FAIL_ON_RESTART(flags, flagp);
13336                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13337                                                                  (UV) flags);
13338         }
13339         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13340         break;
13341     case '|':
13342     case ')':
13343         if (flags & TRYAGAIN) {
13344             *flagp |= TRYAGAIN;
13345             return 0;
13346         }
13347         vFAIL("Internal urp");
13348                                 /* Supposed to be caught earlier. */
13349         break;
13350     case '?':
13351     case '+':
13352     case '*':
13353         RExC_parse++;
13354         vFAIL("Quantifier follows nothing");
13355         break;
13356     case '\\':
13357         /* Special Escapes
13358
13359            This switch handles escape sequences that resolve to some kind
13360            of special regop and not to literal text. Escape sequences that
13361            resolve to literal text are handled below in the switch marked
13362            "Literal Escapes".
13363
13364            Every entry in this switch *must* have a corresponding entry
13365            in the literal escape switch. However, the opposite is not
13366            required, as the default for this switch is to jump to the
13367            literal text handling code.
13368         */
13369         RExC_parse++;
13370         switch ((U8)*RExC_parse) {
13371         /* Special Escapes */
13372         case 'A':
13373             RExC_seen_zerolen++;
13374             ret = reg_node(pRExC_state, SBOL);
13375             /* SBOL is shared with /^/ so we set the flags so we can tell
13376              * /\A/ from /^/ in split. */
13377             FLAGS(REGNODE_p(ret)) = 1;
13378             *flagp |= SIMPLE;
13379             goto finish_meta_pat;
13380         case 'G':
13381             ret = reg_node(pRExC_state, GPOS);
13382             RExC_seen |= REG_GPOS_SEEN;
13383             *flagp |= SIMPLE;
13384             goto finish_meta_pat;
13385         case 'K':
13386             if (!RExC_in_lookbehind && !RExC_in_lookahead) {
13387                 RExC_seen_zerolen++;
13388                 ret = reg_node(pRExC_state, KEEPS);
13389                 *flagp |= SIMPLE;
13390                 /* XXX:dmq : disabling in-place substitution seems to
13391                  * be necessary here to avoid cases of memory corruption, as
13392                  * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13393                  */
13394                 RExC_seen |= REG_LOOKBEHIND_SEEN;
13395                 goto finish_meta_pat;
13396             }
13397             else {
13398                 ++RExC_parse; /* advance past the 'K' */
13399                 vFAIL("\\K not permitted in lookahead/lookbehind");
13400             }
13401         case 'Z':
13402             ret = reg_node(pRExC_state, SEOL);
13403             *flagp |= SIMPLE;
13404             RExC_seen_zerolen++;                /* Do not optimize RE away */
13405             goto finish_meta_pat;
13406         case 'z':
13407             ret = reg_node(pRExC_state, EOS);
13408             *flagp |= SIMPLE;
13409             RExC_seen_zerolen++;                /* Do not optimize RE away */
13410             goto finish_meta_pat;
13411         case 'C':
13412             vFAIL("\\C no longer supported");
13413         case 'X':
13414             ret = reg_node(pRExC_state, CLUMP);
13415             *flagp |= HASWIDTH;
13416             goto finish_meta_pat;
13417
13418         case 'B':
13419             invert = 1;
13420             /* FALLTHROUGH */
13421         case 'b':
13422           {
13423             U8 flags = 0;
13424             regex_charset charset = get_regex_charset(RExC_flags);
13425
13426             RExC_seen_zerolen++;
13427             RExC_seen |= REG_LOOKBEHIND_SEEN;
13428             op = BOUND + charset;
13429
13430             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13431                 flags = TRADITIONAL_BOUND;
13432                 if (op > BOUNDA) {  /* /aa is same as /a */
13433                     op = BOUNDA;
13434                 }
13435             }
13436             else {
13437                 STRLEN length;
13438                 char name = *RExC_parse;
13439                 char * endbrace = NULL;
13440                 RExC_parse += 2;
13441                 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13442
13443                 if (! endbrace) {
13444                     vFAIL2("Missing right brace on \\%c{}", name);
13445                 }
13446                 /* XXX Need to decide whether to take spaces or not.  Should be
13447                  * consistent with \p{}, but that currently is SPACE, which
13448                  * means vertical too, which seems wrong
13449                  * while (isBLANK(*RExC_parse)) {
13450                     RExC_parse++;
13451                 }*/
13452                 if (endbrace == RExC_parse) {
13453                     RExC_parse++;  /* After the '}' */
13454                     vFAIL2("Empty \\%c{}", name);
13455                 }
13456                 length = endbrace - RExC_parse;
13457                 /*while (isBLANK(*(RExC_parse + length - 1))) {
13458                     length--;
13459                 }*/
13460                 switch (*RExC_parse) {
13461                     case 'g':
13462                         if (    length != 1
13463                             && (memNEs(RExC_parse + 1, length - 1, "cb")))
13464                         {
13465                             goto bad_bound_type;
13466                         }
13467                         flags = GCB_BOUND;
13468                         break;
13469                     case 'l':
13470                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13471                             goto bad_bound_type;
13472                         }
13473                         flags = LB_BOUND;
13474                         break;
13475                     case 's':
13476                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13477                             goto bad_bound_type;
13478                         }
13479                         flags = SB_BOUND;
13480                         break;
13481                     case 'w':
13482                         if (length != 2 || *(RExC_parse + 1) != 'b') {
13483                             goto bad_bound_type;
13484                         }
13485                         flags = WB_BOUND;
13486                         break;
13487                     default:
13488                       bad_bound_type:
13489                         RExC_parse = endbrace;
13490                         vFAIL2utf8f(
13491                             "'%" UTF8f "' is an unknown bound type",
13492                             UTF8fARG(UTF, length, endbrace - length));
13493                         NOT_REACHED; /*NOTREACHED*/
13494                 }
13495                 RExC_parse = endbrace;
13496                 REQUIRE_UNI_RULES(flagp, 0);
13497
13498                 if (op == BOUND) {
13499                     op = BOUNDU;
13500                 }
13501                 else if (op >= BOUNDA) {  /* /aa is same as /a */
13502                     op = BOUNDU;
13503                     length += 4;
13504
13505                     /* Don't have to worry about UTF-8, in this message because
13506                      * to get here the contents of the \b must be ASCII */
13507                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
13508                               "Using /u for '%.*s' instead of /%s",
13509                               (unsigned) length,
13510                               endbrace - length + 1,
13511                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13512                               ? ASCII_RESTRICT_PAT_MODS
13513                               : ASCII_MORE_RESTRICT_PAT_MODS);
13514                 }
13515             }
13516
13517             if (op == BOUND) {
13518                 RExC_seen_d_op = TRUE;
13519             }
13520             else if (op == BOUNDL) {
13521                 RExC_contains_locale = 1;
13522             }
13523
13524             if (invert) {
13525                 op += NBOUND - BOUND;
13526             }
13527
13528             ret = reg_node(pRExC_state, op);
13529             FLAGS(REGNODE_p(ret)) = flags;
13530
13531             *flagp |= SIMPLE;
13532
13533             goto finish_meta_pat;
13534           }
13535
13536         case 'R':
13537             ret = reg_node(pRExC_state, LNBREAK);
13538             *flagp |= HASWIDTH|SIMPLE;
13539             goto finish_meta_pat;
13540
13541         case 'd':
13542         case 'D':
13543         case 'h':
13544         case 'H':
13545         case 'p':
13546         case 'P':
13547         case 's':
13548         case 'S':
13549         case 'v':
13550         case 'V':
13551         case 'w':
13552         case 'W':
13553             /* These all have the same meaning inside [brackets], and it knows
13554              * how to do the best optimizations for them.  So, pretend we found
13555              * these within brackets, and let it do the work */
13556             RExC_parse--;
13557
13558             ret = regclass(pRExC_state, flagp, depth+1,
13559                            TRUE, /* means just parse this element */
13560                            FALSE, /* don't allow multi-char folds */
13561                            FALSE, /* don't silence non-portable warnings.  It
13562                                      would be a bug if these returned
13563                                      non-portables */
13564                            (bool) RExC_strict,
13565                            TRUE, /* Allow an optimized regnode result */
13566                            NULL);
13567             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13568             /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13569              * multi-char folds are allowed.  */
13570             if (!ret)
13571                 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13572                       (UV) *flagp);
13573
13574             RExC_parse--;   /* regclass() leaves this one too far ahead */
13575
13576           finish_meta_pat:
13577                    /* The escapes above that don't take a parameter can't be
13578                     * followed by a '{'.  But 'pX', 'p{foo}' and
13579                     * correspondingly 'P' can be */
13580             if (   RExC_parse - parse_start == 1
13581                 && UCHARAT(RExC_parse + 1) == '{'
13582                 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13583             {
13584                 RExC_parse += 2;
13585                 vFAIL("Unescaped left brace in regex is illegal here");
13586             }
13587             Set_Node_Offset(REGNODE_p(ret), parse_start);
13588             Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13589             nextchar(pRExC_state);
13590             break;
13591         case 'N':
13592             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13593              * \N{...} evaluates to a sequence of more than one code points).
13594              * The function call below returns a regnode, which is our result.
13595              * The parameters cause it to fail if the \N{} evaluates to a
13596              * single code point; we handle those like any other literal.  The
13597              * reason that the multicharacter case is handled here and not as
13598              * part of the EXACtish code is because of quantifiers.  In
13599              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13600              * this way makes that Just Happen. dmq.
13601              * join_exact() will join this up with adjacent EXACTish nodes
13602              * later on, if appropriate. */
13603             ++RExC_parse;
13604             if (grok_bslash_N(pRExC_state,
13605                               &ret,     /* Want a regnode returned */
13606                               NULL,     /* Fail if evaluates to a single code
13607                                            point */
13608                               NULL,     /* Don't need a count of how many code
13609                                            points */
13610                               flagp,
13611                               RExC_strict,
13612                               depth)
13613             ) {
13614                 break;
13615             }
13616
13617             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13618
13619             /* Here, evaluates to a single code point.  Go get that */
13620             RExC_parse = parse_start;
13621             goto defchar;
13622
13623         case 'k':    /* Handle \k<NAME> and \k'NAME' */
13624       parse_named_seq:
13625         {
13626             char ch;
13627             if (   RExC_parse >= RExC_end - 1
13628                 || ((   ch = RExC_parse[1]) != '<'
13629                                       && ch != '\''
13630                                       && ch != '{'))
13631             {
13632                 RExC_parse++;
13633                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13634                 vFAIL2("Sequence %.2s... not terminated", parse_start);
13635             } else {
13636                 RExC_parse += 2;
13637                 ret = handle_named_backref(pRExC_state,
13638                                            flagp,
13639                                            parse_start,
13640                                            (ch == '<')
13641                                            ? '>'
13642                                            : (ch == '{')
13643                                              ? '}'
13644                                              : '\'');
13645             }
13646             break;
13647         }
13648         case 'g':
13649         case '1': case '2': case '3': case '4':
13650         case '5': case '6': case '7': case '8': case '9':
13651             {
13652                 I32 num;
13653                 bool hasbrace = 0;
13654
13655                 if (*RExC_parse == 'g') {
13656                     bool isrel = 0;
13657
13658                     RExC_parse++;
13659                     if (*RExC_parse == '{') {
13660                         RExC_parse++;
13661                         hasbrace = 1;
13662                     }
13663                     if (*RExC_parse == '-') {
13664                         RExC_parse++;
13665                         isrel = 1;
13666                     }
13667                     if (hasbrace && !isDIGIT(*RExC_parse)) {
13668                         if (isrel) RExC_parse--;
13669                         RExC_parse -= 2;
13670                         goto parse_named_seq;
13671                     }
13672
13673                     if (RExC_parse >= RExC_end) {
13674                         goto unterminated_g;
13675                     }
13676                     num = S_backref_value(RExC_parse, RExC_end);
13677                     if (num == 0)
13678                         vFAIL("Reference to invalid group 0");
13679                     else if (num == I32_MAX) {
13680                          if (isDIGIT(*RExC_parse))
13681                             vFAIL("Reference to nonexistent group");
13682                         else
13683                           unterminated_g:
13684                             vFAIL("Unterminated \\g... pattern");
13685                     }
13686
13687                     if (isrel) {
13688                         num = RExC_npar - num;
13689                         if (num < 1)
13690                             vFAIL("Reference to nonexistent or unclosed group");
13691                     }
13692                 }
13693                 else {
13694                     num = S_backref_value(RExC_parse, RExC_end);
13695                     /* bare \NNN might be backref or octal - if it is larger
13696                      * than or equal RExC_npar then it is assumed to be an
13697                      * octal escape. Note RExC_npar is +1 from the actual
13698                      * number of parens. */
13699                     /* Note we do NOT check if num == I32_MAX here, as that is
13700                      * handled by the RExC_npar check */
13701
13702                     if (
13703                         /* any numeric escape < 10 is always a backref */
13704                         num > 9
13705                         /* any numeric escape < RExC_npar is a backref */
13706                         && num >= RExC_npar
13707                         /* cannot be an octal escape if it starts with 8 */
13708                         && *RExC_parse != '8'
13709                         /* cannot be an octal escape if it starts with 9 */
13710                         && *RExC_parse != '9'
13711                     ) {
13712                         /* Probably not meant to be a backref, instead likely
13713                          * to be an octal character escape, e.g. \35 or \777.
13714                          * The above logic should make it obvious why using
13715                          * octal escapes in patterns is problematic. - Yves */
13716                         RExC_parse = parse_start;
13717                         goto defchar;
13718                     }
13719                 }
13720
13721                 /* At this point RExC_parse points at a numeric escape like
13722                  * \12 or \88 or something similar, which we should NOT treat
13723                  * as an octal escape. It may or may not be a valid backref
13724                  * escape. For instance \88888888 is unlikely to be a valid
13725                  * backref. */
13726                 while (isDIGIT(*RExC_parse))
13727                     RExC_parse++;
13728                 if (hasbrace) {
13729                     if (*RExC_parse != '}')
13730                         vFAIL("Unterminated \\g{...} pattern");
13731                     RExC_parse++;
13732                 }
13733                 if (num >= (I32)RExC_npar) {
13734
13735                     /* It might be a forward reference; we can't fail until we
13736                      * know, by completing the parse to get all the groups, and
13737                      * then reparsing */
13738                     if (ALL_PARENS_COUNTED)  {
13739                         if (num >= RExC_total_parens)  {
13740                             vFAIL("Reference to nonexistent group");
13741                         }
13742                     }
13743                     else {
13744                         REQUIRE_PARENS_PASS;
13745                     }
13746                 }
13747                 RExC_sawback = 1;
13748                 ret = reganode(pRExC_state,
13749                                ((! FOLD)
13750                                  ? REF
13751                                  : (ASCII_FOLD_RESTRICTED)
13752                                    ? REFFA
13753                                    : (AT_LEAST_UNI_SEMANTICS)
13754                                      ? REFFU
13755                                      : (LOC)
13756                                        ? REFFL
13757                                        : REFF),
13758                                 num);
13759                 if (OP(REGNODE_p(ret)) == REFF) {
13760                     RExC_seen_d_op = TRUE;
13761                 }
13762                 *flagp |= HASWIDTH;
13763
13764                 /* override incorrect value set in reganode MJD */
13765                 Set_Node_Offset(REGNODE_p(ret), parse_start);
13766                 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13767                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13768                                         FALSE /* Don't force to /x */ );
13769             }
13770             break;
13771         case '\0':
13772             if (RExC_parse >= RExC_end)
13773                 FAIL("Trailing \\");
13774             /* FALLTHROUGH */
13775         default:
13776             /* Do not generate "unrecognized" warnings here, we fall
13777                back into the quick-grab loop below */
13778             RExC_parse = parse_start;
13779             goto defchar;
13780         } /* end of switch on a \foo sequence */
13781         break;
13782
13783     case '#':
13784
13785         /* '#' comments should have been spaced over before this function was
13786          * called */
13787         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13788         /*
13789         if (RExC_flags & RXf_PMf_EXTENDED) {
13790             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13791             if (RExC_parse < RExC_end)
13792                 goto tryagain;
13793         }
13794         */
13795
13796         /* FALLTHROUGH */
13797
13798     default:
13799           defchar: {
13800
13801             /* Here, we have determined that the next thing is probably a
13802              * literal character.  RExC_parse points to the first byte of its
13803              * definition.  (It still may be an escape sequence that evaluates
13804              * to a single character) */
13805
13806             STRLEN len = 0;
13807             UV ender = 0;
13808             char *p;
13809             char *s, *old_s = NULL, *old_old_s = NULL;
13810             char *s0;
13811             U32 max_string_len = 255;
13812
13813             /* We may have to reparse the node, artificially stopping filling
13814              * it early, based on info gleaned in the first parse.  This
13815              * variable gives where we stop.  Make it above the normal stopping
13816              * place first time through; otherwise it would stop too early */
13817             U32 upper_fill = max_string_len + 1;
13818
13819             /* We start out as an EXACT node, even if under /i, until we find a
13820              * character which is in a fold.  The algorithm now segregates into
13821              * separate nodes, characters that fold from those that don't under
13822              * /i.  (This hopefully will create nodes that are fixed strings
13823              * even under /i, giving the optimizer something to grab on to.)
13824              * So, if a node has something in it and the next character is in
13825              * the opposite category, that node is closed up, and the function
13826              * returns.  Then regatom is called again, and a new node is
13827              * created for the new category. */
13828             U8 node_type = EXACT;
13829
13830             /* Assume the node will be fully used; the excess is given back at
13831              * the end.  Under /i, we may need to temporarily add the fold of
13832              * an extra character or two at the end to check for splitting
13833              * multi-char folds, so allocate extra space for that.   We can't
13834              * make any other length assumptions, as a byte input sequence
13835              * could shrink down. */
13836             Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
13837                                                  + ((! FOLD)
13838                                                     ? 0
13839                                                     : 2 * ((UTF)
13840                                                            ? UTF8_MAXBYTES_CASE
13841                         /* Max non-UTF-8 expansion is 2 */ : 2)));
13842
13843             bool next_is_quantifier;
13844             char * oldp = NULL;
13845
13846             /* We can convert EXACTF nodes to EXACTFU if they contain only
13847              * characters that match identically regardless of the target
13848              * string's UTF8ness.  The reason to do this is that EXACTF is not
13849              * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
13850              * runtime.
13851              *
13852              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13853              * contain only above-Latin1 characters (hence must be in UTF8),
13854              * which don't participate in folds with Latin1-range characters,
13855              * as the latter's folds aren't known until runtime. */
13856             bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
13857
13858             /* Single-character EXACTish nodes are almost always SIMPLE.  This
13859              * allows us to override this as encountered */
13860             U8 maybe_SIMPLE = SIMPLE;
13861
13862             /* Does this node contain something that can't match unless the
13863              * target string is (also) in UTF-8 */
13864             bool requires_utf8_target = FALSE;
13865
13866             /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
13867             bool has_ss = FALSE;
13868
13869             /* So is the MICRO SIGN */
13870             bool has_micro_sign = FALSE;
13871
13872             /* Set when we fill up the current node and there is still more
13873              * text to process */
13874             bool overflowed;
13875
13876             /* Allocate an EXACT node.  The node_type may change below to
13877              * another EXACTish node, but since the size of the node doesn't
13878              * change, it works */
13879             ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
13880                                                                     "exact");
13881             FILL_NODE(ret, node_type);
13882             RExC_emit++;
13883
13884             s = STRING(REGNODE_p(ret));
13885
13886             s0 = s;
13887
13888           reparse:
13889
13890             p = RExC_parse;
13891             len = 0;
13892             s = s0;
13893             node_type = EXACT;
13894             oldp = NULL;
13895             maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
13896             maybe_SIMPLE = SIMPLE;
13897             requires_utf8_target = FALSE;
13898             has_ss = FALSE;
13899             has_micro_sign = FALSE;
13900
13901           continue_parse:
13902
13903             /* This breaks under rare circumstances.  If folding, we do not
13904              * want to split a node at a character that is a non-final in a
13905              * multi-char fold, as an input string could just happen to want to
13906              * match across the node boundary.  The code at the end of the loop
13907              * looks for this, and backs off until it finds not such a
13908              * character, but it is possible (though extremely, extremely
13909              * unlikely) for all characters in the node to be non-final fold
13910              * ones, in which case we just leave the node fully filled, and
13911              * hope that it doesn't match the string in just the wrong place */
13912
13913             assert( ! UTF     /* Is at the beginning of a character */
13914                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13915                    || UTF8_IS_START(UCHARAT(RExC_parse)));
13916
13917             overflowed = FALSE;
13918
13919             /* Here, we have a literal character.  Find the maximal string of
13920              * them in the input that we can fit into a single EXACTish node.
13921              * We quit at the first non-literal or when the node gets full, or
13922              * under /i the categorization of folding/non-folding character
13923              * changes */
13924             while (p < RExC_end && len < upper_fill) {
13925
13926                 /* In most cases each iteration adds one byte to the output.
13927                  * The exceptions override this */
13928                 Size_t added_len = 1;
13929
13930                 oldp = p;
13931                 old_old_s = old_s;
13932                 old_s = s;
13933
13934                 /* White space has already been ignored */
13935                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
13936                        || ! is_PATWS_safe((p), RExC_end, UTF));
13937
13938                 switch ((U8)*p) {
13939                 case '^':
13940                 case '$':
13941                 case '.':
13942                 case '[':
13943                 case '(':
13944                 case ')':
13945                 case '|':
13946                     goto loopdone;
13947                 case '\\':
13948                     /* Literal Escapes Switch
13949
13950                        This switch is meant to handle escape sequences that
13951                        resolve to a literal character.
13952
13953                        Every escape sequence that represents something
13954                        else, like an assertion or a char class, is handled
13955                        in the switch marked 'Special Escapes' above in this
13956                        routine, but also has an entry here as anything that
13957                        isn't explicitly mentioned here will be treated as
13958                        an unescaped equivalent literal.
13959                     */
13960
13961                     switch ((U8)*++p) {
13962
13963                     /* These are all the special escapes. */
13964                     case 'A':             /* Start assertion */
13965                     case 'b': case 'B':   /* Word-boundary assertion*/
13966                     case 'C':             /* Single char !DANGEROUS! */
13967                     case 'd': case 'D':   /* digit class */
13968                     case 'g': case 'G':   /* generic-backref, pos assertion */
13969                     case 'h': case 'H':   /* HORIZWS */
13970                     case 'k': case 'K':   /* named backref, keep marker */
13971                     case 'p': case 'P':   /* Unicode property */
13972                               case 'R':   /* LNBREAK */
13973                     case 's': case 'S':   /* space class */
13974                     case 'v': case 'V':   /* VERTWS */
13975                     case 'w': case 'W':   /* word class */
13976                     case 'X':             /* eXtended Unicode "combining
13977                                              character sequence" */
13978                     case 'z': case 'Z':   /* End of line/string assertion */
13979                         --p;
13980                         goto loopdone;
13981
13982                     /* Anything after here is an escape that resolves to a
13983                        literal. (Except digits, which may or may not)
13984                      */
13985                     case 'n':
13986                         ender = '\n';
13987                         p++;
13988                         break;
13989                     case 'N': /* Handle a single-code point named character. */
13990                         RExC_parse = p + 1;
13991                         if (! grok_bslash_N(pRExC_state,
13992                                             NULL,   /* Fail if evaluates to
13993                                                        anything other than a
13994                                                        single code point */
13995                                             &ender, /* The returned single code
13996                                                        point */
13997                                             NULL,   /* Don't need a count of
13998                                                        how many code points */
13999                                             flagp,
14000                                             RExC_strict,
14001                                             depth)
14002                         ) {
14003                             if (*flagp & NEED_UTF8)
14004                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
14005                             RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14006
14007                             /* Here, it wasn't a single code point.  Go close
14008                              * up this EXACTish node.  The switch() prior to
14009                              * this switch handles the other cases */
14010                             RExC_parse = p = oldp;
14011                             goto loopdone;
14012                         }
14013                         p = RExC_parse;
14014                         RExC_parse = parse_start;
14015
14016                         /* The \N{} means the pattern, if previously /d,
14017                          * becomes /u.  That means it can't be an EXACTF node,
14018                          * but an EXACTFU */
14019                         if (node_type == EXACTF) {
14020                             node_type = EXACTFU;
14021
14022                             /* If the node already contains something that
14023                              * differs between EXACTF and EXACTFU, reparse it
14024                              * as EXACTFU */
14025                             if (! maybe_exactfu) {
14026                                 len = 0;
14027                                 s = s0;
14028                                 goto reparse;
14029                             }
14030                         }
14031
14032                         break;
14033                     case 'r':
14034                         ender = '\r';
14035                         p++;
14036                         break;
14037                     case 't':
14038                         ender = '\t';
14039                         p++;
14040                         break;
14041                     case 'f':
14042                         ender = '\f';
14043                         p++;
14044                         break;
14045                     case 'e':
14046                         ender = ESC_NATIVE;
14047                         p++;
14048                         break;
14049                     case 'a':
14050                         ender = '\a';
14051                         p++;
14052                         break;
14053                     case 'o':
14054                         {
14055                             UV result;
14056                             const char* error_msg;
14057
14058                             bool valid = grok_bslash_o(&p,
14059                                                        RExC_end,
14060                                                        &result,
14061                                                        &error_msg,
14062                                                        TO_OUTPUT_WARNINGS(p),
14063                                                        (bool) RExC_strict,
14064                                                        TRUE, /* Output warnings
14065                                                                 for non-
14066                                                                 portables */
14067                                                        UTF);
14068                             if (! valid) {
14069                                 RExC_parse = p; /* going to die anyway; point
14070                                                    to exact spot of failure */
14071                                 vFAIL(error_msg);
14072                             }
14073                             UPDATE_WARNINGS_LOC(p - 1);
14074                             ender = result;
14075                             break;
14076                         }
14077                     case 'x':
14078                         {
14079                             UV result = UV_MAX; /* initialize to erroneous
14080                                                    value */
14081                             const char* error_msg;
14082
14083                             bool valid = grok_bslash_x(&p,
14084                                                        RExC_end,
14085                                                        &result,
14086                                                        &error_msg,
14087                                                        TO_OUTPUT_WARNINGS(p),
14088                                                        (bool) RExC_strict,
14089                                                        TRUE, /* Silence warnings
14090                                                                 for non-
14091                                                                 portables */
14092                                                        UTF);
14093                             if (! valid) {
14094                                 RExC_parse = p; /* going to die anyway; point
14095                                                    to exact spot of failure */
14096                                 vFAIL(error_msg);
14097                             }
14098                             UPDATE_WARNINGS_LOC(p - 1);
14099                             ender = result;
14100
14101 #ifdef EBCDIC
14102                             if (ender < 0x100) {
14103                                 if (RExC_recode_x_to_native) {
14104                                     ender = LATIN1_TO_NATIVE(ender);
14105                                 }
14106                             }
14107 #endif
14108                             break;
14109                         }
14110                     case 'c':
14111                         p++;
14112                         ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
14113                         UPDATE_WARNINGS_LOC(p);
14114                         p++;
14115                         break;
14116                     case '8': case '9': /* must be a backreference */
14117                         --p;
14118                         /* we have an escape like \8 which cannot be an octal escape
14119                          * so we exit the loop, and let the outer loop handle this
14120                          * escape which may or may not be a legitimate backref. */
14121                         goto loopdone;
14122                     case '1': case '2': case '3':case '4':
14123                     case '5': case '6': case '7':
14124                         /* When we parse backslash escapes there is ambiguity
14125                          * between backreferences and octal escapes. Any escape
14126                          * from \1 - \9 is a backreference, any multi-digit
14127                          * escape which does not start with 0 and which when
14128                          * evaluated as decimal could refer to an already
14129                          * parsed capture buffer is a back reference. Anything
14130                          * else is octal.
14131                          *
14132                          * Note this implies that \118 could be interpreted as
14133                          * 118 OR as "\11" . "8" depending on whether there
14134                          * were 118 capture buffers defined already in the
14135                          * pattern.  */
14136
14137                         /* NOTE, RExC_npar is 1 more than the actual number of
14138                          * parens we have seen so far, hence the "<" as opposed
14139                          * to "<=" */
14140                         if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14141                         {  /* Not to be treated as an octal constant, go
14142                                    find backref */
14143                             --p;
14144                             goto loopdone;
14145                         }
14146                         /* FALLTHROUGH */
14147                     case '0':
14148                         {
14149                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14150                             STRLEN numlen = 3;
14151                             ender = grok_oct(p, &numlen, &flags, NULL);
14152                             p += numlen;
14153                             if (   isDIGIT(*p)  /* like \08, \178 */
14154                                 && ckWARN(WARN_REGEXP)
14155                                 && numlen < 3)
14156                             {
14157                                 reg_warn_non_literal_string(
14158                                          p + 1,
14159                                          form_short_octal_warning(p, numlen));
14160                             }
14161                         }
14162                         break;
14163                     case '\0':
14164                         if (p >= RExC_end)
14165                             FAIL("Trailing \\");
14166                         /* FALLTHROUGH */
14167                     default:
14168                         if (isALPHANUMERIC(*p)) {
14169                             /* An alpha followed by '{' is going to fail next
14170                              * iteration, so don't output this warning in that
14171                              * case */
14172                             if (! isALPHA(*p) || *(p + 1) != '{') {
14173                                 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14174                                                   " passed through", p);
14175                             }
14176                         }
14177                         goto normal_default;
14178                     } /* End of switch on '\' */
14179                     break;
14180                 case '{':
14181                     /* Trying to gain new uses for '{' without breaking too
14182                      * much existing code is hard.  The solution currently
14183                      * adopted is:
14184                      *  1)  If there is no ambiguity that a '{' should always
14185                      *      be taken literally, at the start of a construct, we
14186                      *      just do so.
14187                      *  2)  If the literal '{' conflicts with our desired use
14188                      *      of it as a metacharacter, we die.  The deprecation
14189                      *      cycles for this have come and gone.
14190                      *  3)  If there is ambiguity, we raise a simple warning.
14191                      *      This could happen, for example, if the user
14192                      *      intended it to introduce a quantifier, but slightly
14193                      *      misspelled the quantifier.  Without this warning,
14194                      *      the quantifier would silently be taken as a literal
14195                      *      string of characters instead of a meta construct */
14196                     if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14197                         if (      RExC_strict
14198                             || (  p > parse_start + 1
14199                                 && isALPHA_A(*(p - 1))
14200                                 && *(p - 2) == '\\')
14201                             || new_regcurly(p, RExC_end))
14202                         {
14203                             RExC_parse = p + 1;
14204                             vFAIL("Unescaped left brace in regex is "
14205                                   "illegal here");
14206                         }
14207                         ckWARNreg(p + 1, "Unescaped left brace in regex is"
14208                                          " passed through");
14209                     }
14210                     goto normal_default;
14211                 case '}':
14212                 case ']':
14213                     if (p > RExC_parse && RExC_strict) {
14214                         ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14215                     }
14216                     /*FALLTHROUGH*/
14217                 default:    /* A literal character */
14218                   normal_default:
14219                     if (! UTF8_IS_INVARIANT(*p) && UTF) {
14220                         STRLEN numlen;
14221                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14222                                                &numlen, UTF8_ALLOW_DEFAULT);
14223                         p += numlen;
14224                     }
14225                     else
14226                         ender = (U8) *p++;
14227                     break;
14228                 } /* End of switch on the literal */
14229
14230                 /* Here, have looked at the literal character, and <ender>
14231                  * contains its ordinal; <p> points to the character after it.
14232                  * */
14233
14234                 if (ender > 255) {
14235                     REQUIRE_UTF8(flagp);
14236                 }
14237
14238                 /* We need to check if the next non-ignored thing is a
14239                  * quantifier.  Move <p> to after anything that should be
14240                  * ignored, which, as a side effect, positions <p> for the next
14241                  * loop iteration */
14242                 skip_to_be_ignored_text(pRExC_state, &p,
14243                                         FALSE /* Don't force to /x */ );
14244
14245                 /* If the next thing is a quantifier, it applies to this
14246                  * character only, which means that this character has to be in
14247                  * its own node and can't just be appended to the string in an
14248                  * existing node, so if there are already other characters in
14249                  * the node, close the node with just them, and set up to do
14250                  * this character again next time through, when it will be the
14251                  * only thing in its new node */
14252
14253                 next_is_quantifier =    LIKELY(p < RExC_end)
14254                                      && UNLIKELY(ISMULT2(p));
14255
14256                 if (next_is_quantifier && LIKELY(len)) {
14257                     p = oldp;
14258                     goto loopdone;
14259                 }
14260
14261                 /* Ready to add 'ender' to the node */
14262
14263                 if (! FOLD) {  /* The simple case, just append the literal */
14264                   not_fold_common:
14265
14266                     /* Don't output if it would overflow */
14267                     if (UNLIKELY(len > max_string_len - ((UTF)
14268                                                       ? UVCHR_SKIP(ender)
14269                                                       : 1)))
14270                     {
14271                         overflowed = TRUE;
14272                         break;
14273                     }
14274
14275                     if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14276                         *(s++) = (char) ender;
14277                     }
14278                     else {
14279                         U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14280                         added_len = (char *) new_s - s;
14281                         s = (char *) new_s;
14282
14283                         if (ender > 255)  {
14284                             requires_utf8_target = TRUE;
14285                         }
14286                     }
14287                 }
14288                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14289
14290                     /* Here are folding under /l, and the code point is
14291                      * problematic.  If this is the first character in the
14292                      * node, change the node type to folding.   Otherwise, if
14293                      * this is the first problematic character, close up the
14294                      * existing node, so can start a new node with this one */
14295                     if (! len) {
14296                         node_type = EXACTFL;
14297                         RExC_contains_locale = 1;
14298                     }
14299                     else if (node_type == EXACT) {
14300                         p = oldp;
14301                         goto loopdone;
14302                     }
14303
14304                     /* This problematic code point means we can't simplify
14305                      * things */
14306                     maybe_exactfu = FALSE;
14307
14308                     /* Here, we are adding a problematic fold character.
14309                      * "Problematic" in this context means that its fold isn't
14310                      * known until runtime.  (The non-problematic code points
14311                      * are the above-Latin1 ones that fold to also all
14312                      * above-Latin1.  Their folds don't vary no matter what the
14313                      * locale is.) But here we have characters whose fold
14314                      * depends on the locale.  We just add in the unfolded
14315                      * character, and wait until runtime to fold it */
14316                     goto not_fold_common;
14317                 }
14318                 else /* regular fold; see if actually is in a fold */
14319                      if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14320                          || (ender > 255
14321                             && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14322                 {
14323                     /* Here, folding, but the character isn't in a fold.
14324                      *
14325                      * Start a new node if previous characters in the node were
14326                      * folded */
14327                     if (len && node_type != EXACT) {
14328                         p = oldp;
14329                         goto loopdone;
14330                     }
14331
14332                     /* Here, continuing a node with non-folded characters.  Add
14333                      * this one */
14334                     goto not_fold_common;
14335                 }
14336                 else {  /* Here, does participate in some fold */
14337
14338                     /* If this is the first character in the node, change its
14339                      * type to folding.  Otherwise, if this is the first
14340                      * folding character in the node, close up the existing
14341                      * node, so can start a new node with this one.  */
14342                     if (! len) {
14343                         node_type = compute_EXACTish(pRExC_state);
14344                     }
14345                     else if (node_type == EXACT) {
14346                         p = oldp;
14347                         goto loopdone;
14348                     }
14349
14350                     if (UTF) {  /* Alway use the folded value for UTF-8
14351                                    patterns */
14352                         if (UVCHR_IS_INVARIANT(ender)) {
14353                             if (UNLIKELY(len + 1 > max_string_len)) {
14354                                 overflowed = TRUE;
14355                                 break;
14356                             }
14357
14358                             *(s)++ = (U8) toFOLD(ender);
14359                         }
14360                         else {
14361                             UV folded = _to_uni_fold_flags(
14362                                     ender,
14363                                     (U8 *) s,  /* We have allocated extra space
14364                                                   in 's' so can't run off the
14365                                                   end */
14366                                     &added_len,
14367                                     FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14368                                                     ? FOLD_FLAGS_NOMIX_ASCII
14369                                                     : 0));
14370                             if (UNLIKELY(len + added_len > max_string_len)) {
14371                                 overflowed = TRUE;
14372                                 break;
14373                             }
14374
14375                             s += added_len;
14376
14377                             if (   folded > 255
14378                                 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14379                             {
14380                                 /* U+B5 folds to the MU, so its possible for a
14381                                  * non-UTF-8 target to match it */
14382                                 requires_utf8_target = TRUE;
14383                             }
14384                         }
14385                     }
14386                     else { /* Here is non-UTF8. */
14387
14388                         /* The fold will be one or (rarely) two characters.
14389                          * Check that there's room for at least a single one
14390                          * before setting any flags, etc.  Because otherwise an
14391                          * overflowing character could cause a flag to be set
14392                          * even though it doesn't end up in this node.  (For
14393                          * the two character fold, we check again, before
14394                          * setting any flags) */
14395                         if (UNLIKELY(len + 1 > max_string_len)) {
14396                             overflowed = TRUE;
14397                             break;
14398                         }
14399
14400 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
14401    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
14402                                       || UNICODE_DOT_DOT_VERSION > 0)
14403
14404                         /* On non-ancient Unicodes, check for the only possible
14405                          * multi-char fold  */
14406                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14407
14408                             /* This potential multi-char fold means the node
14409                              * can't be simple (because it could match more
14410                              * than a single char).  And in some cases it will
14411                              * match 'ss', so set that flag */
14412                             maybe_SIMPLE = 0;
14413                             has_ss = TRUE;
14414
14415                             /* It can't change to be an EXACTFU (unless already
14416                              * is one).  We fold it iff under /u rules. */
14417                             if (node_type != EXACTFU) {
14418                                 maybe_exactfu = FALSE;
14419                             }
14420                             else {
14421                                 if (UNLIKELY(len + 2 > max_string_len)) {
14422                                     overflowed = TRUE;
14423                                     break;
14424                                 }
14425
14426                                 *(s++) = 's';
14427                                 *(s++) = 's';
14428                                 added_len = 2;
14429
14430                                 goto done_with_this_char;
14431                             }
14432                         }
14433                         else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14434                                  && LIKELY(len > 0)
14435                                  && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14436                         {
14437                             /* Also, the sequence 'ss' is special when not
14438                              * under /u.  If the target string is UTF-8, it
14439                              * should match SHARP S; otherwise it won't.  So,
14440                              * here we have to exclude the possibility of this
14441                              * node moving to /u.*/
14442                             has_ss = TRUE;
14443                             maybe_exactfu = FALSE;
14444                         }
14445 #endif
14446                         /* Here, the fold will be a single character */
14447
14448                         if (UNLIKELY(ender == MICRO_SIGN)) {
14449                             has_micro_sign = TRUE;
14450                         }
14451                         else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14452
14453                             /* If the character's fold differs between /d and
14454                              * /u, this can't change to be an EXACTFU node */
14455                             maybe_exactfu = FALSE;
14456                         }
14457
14458                         *(s++) = (DEPENDS_SEMANTICS)
14459                                  ? (char) toFOLD(ender)
14460
14461                                    /* Under /u, the fold of any character in
14462                                     * the 0-255 range happens to be its
14463                                     * lowercase equivalent, except for LATIN
14464                                     * SMALL LETTER SHARP S, which was handled
14465                                     * above, and the MICRO SIGN, whose fold
14466                                     * requires UTF-8 to represent.  */
14467                                  : (char) toLOWER_L1(ender);
14468                     }
14469                 } /* End of adding current character to the node */
14470
14471               done_with_this_char:
14472
14473                 len += added_len;
14474
14475                 if (next_is_quantifier) {
14476
14477                     /* Here, the next input is a quantifier, and to get here,
14478                      * the current character is the only one in the node. */
14479                     goto loopdone;
14480                 }
14481
14482             } /* End of loop through literal characters */
14483
14484             /* Here we have either exhausted the input or run out of room in
14485              * the node.  If the former, we are done.  (If we encountered a
14486              * character that can't be in the node, transfer is made directly
14487              * to <loopdone>, and so we wouldn't have fallen off the end of the
14488              * loop.)  */
14489             if (LIKELY(! overflowed)) {
14490                 goto loopdone;
14491             }
14492
14493             /* Here we have run out of room.  We can grow plain EXACT and
14494              * LEXACT nodes.  If the pattern is gigantic enough, though,
14495              * eventually we'll have to artificially chunk the pattern into
14496              * multiple nodes. */
14497             if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14498                 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14499                 Size_t overhead_expansion = 0;
14500                 char temp[256];
14501                 Size_t max_nodes_for_string;
14502                 Size_t achievable;
14503                 SSize_t delta;
14504
14505                 /* Here we couldn't fit the final character in the current
14506                  * node, so it will have to be reparsed, no matter what else we
14507                  * do */
14508                 p = oldp;
14509
14510                 /* If would have overflowed a regular EXACT node, switch
14511                  * instead to an LEXACT.  The code below is structured so that
14512                  * the actual growing code is common to changing from an EXACT
14513                  * or just increasing the LEXACT size.  This means that we have
14514                  * to save the string in the EXACT case before growing, and
14515                  * then copy it afterwards to its new location */
14516                 if (node_type == EXACT) {
14517                     overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14518                     RExC_emit += overhead_expansion;
14519                     Copy(s0, temp, len, char);
14520                 }
14521
14522                 /* Ready to grow.  If it was a plain EXACT, the string was
14523                  * saved, and the first few bytes of it overwritten by adding
14524                  * an argument field.  We assume, as we do elsewhere in this
14525                  * file, that one byte of remaining input will translate into
14526                  * one byte of output, and if that's too small, we grow again,
14527                  * if too large the excess memory is freed at the end */
14528
14529                 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14530                 achievable = MIN(max_nodes_for_string,
14531                                  current_string_nodes + STR_SZ(RExC_end - p));
14532                 delta = achievable - current_string_nodes;
14533
14534                 /* If there is just no more room, go finish up this chunk of
14535                  * the pattern. */
14536                 if (delta <= 0) {
14537                     goto loopdone;
14538                 }
14539
14540                 change_engine_size(pRExC_state, delta + overhead_expansion);
14541                 current_string_nodes += delta;
14542                 max_string_len
14543                            = sizeof(struct regnode) * current_string_nodes;
14544                 upper_fill = max_string_len + 1;
14545
14546                 /* If the length was small, we know this was originally an
14547                  * EXACT node now converted to LEXACT, and the string has to be
14548                  * restored.  Otherwise the string was untouched.  260 is just
14549                  * a number safely above 255 so don't have to worry about
14550                  * getting it precise */
14551                 if (len < 260) {
14552                     node_type = LEXACT;
14553                     FILL_NODE(ret, node_type);
14554                     s0 = STRING(REGNODE_p(ret));
14555                     Copy(temp, s0, len, char);
14556                     s = s0 + len;
14557                 }
14558
14559                 goto continue_parse;
14560             }
14561             else if (FOLD) {
14562                 bool splittable = FALSE;
14563                 bool backed_up = FALSE;
14564                 char * e;
14565                 char * s_start;
14566
14567                 /* Here is /i.  Running out of room creates a problem if we are
14568                  * folding, and the split happens in the middle of a
14569                  * multi-character fold, as a match that should have occurred,
14570                  * won't, due to the way nodes are matched, and our artificial
14571                  * boundary.  So back off until we aren't splitting such a
14572                  * fold.  If there is no such place to back off to, we end up
14573                  * taking the entire node as-is.  This can happen if the node
14574                  * consists entirely of 'f' or entirely of 's' characters (or
14575                  * things that fold to them) as 'ff' and 'ss' are
14576                  * multi-character folds.
14577                  *
14578                  * The Unicode standard says that multi character folds consist
14579                  * of either two or three characters.  That means we would be
14580                  * splitting one if the final character in the node is at the
14581                  * beginning of either type, or is the second of a three
14582                  * character fold.
14583                  *
14584                  * At this point:
14585                  *  ender     is the code point of the character that won't fit
14586                  *            in the node
14587                  *  s         points to just beyond the final byte in the node.
14588                  *            It's where we would place ender if there were
14589                  *            room, and where in fact we do place ender's fold
14590                  *            in the code below, as we've over-allocated space
14591                  *            for s0 (hence s) to allow for this
14592                  *  e         starts at 's' and advances as we append things.
14593                  *  old_s     is the same as 's'.  (If ender had fit, 's' would
14594                  *            have been advanced to beyond it).
14595                  *  old_old_s points to the beginning byte of the final
14596                  *            character in the node
14597                  *  p         points to the beginning byte in the input of the
14598                  *            character beyond 'ender'.
14599                  *  oldp      points to the beginning byte in the input of
14600                  *            'ender'.
14601                  *
14602                  * In the case of /il, we haven't folded anything that could be
14603                  * affected by the locale.  That means only above-Latin1
14604                  * characters that fold to other above-latin1 characters get
14605                  * folded at compile time.  To check where a good place to
14606                  * split nodes is, everything in it will have to be folded.
14607                  * The boolean 'maybe_exactfu' keeps track in /il if there are
14608                  * any unfolded characters in the node. */
14609                 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14610
14611                 /* If we do need to fold the node, we need a place to store the
14612                  * folded copy, and a way to map back to the unfolded original
14613                  * */
14614                 char * locfold_buf = NULL;
14615                 Size_t * loc_correspondence = NULL;
14616
14617                 if (! need_to_fold_loc) {   /* The normal case.  Just
14618                                                initialize to the actual node */
14619                     e = s;
14620                     s_start = s0;
14621                     s = old_old_s;  /* Point to the beginning of the final char
14622                                        that fits in the node */
14623                 }
14624                 else {
14625
14626                     /* Here, we have filled a /il node, and there are unfolded
14627                      * characters in it.  If the runtime locale turns out to be
14628                      * UTF-8, there are possible multi-character folds, just
14629                      * like when not under /l.  The node hence can't terminate
14630                      * in the middle of such a fold.  To determine this, we
14631                      * have to create a folded copy of this node.  That means
14632                      * reparsing the node, folding everything assuming a UTF-8
14633                      * locale.  (If at runtime it isn't such a locale, the
14634                      * actions here wouldn't have been necessary, but we have
14635                      * to assume the worst case.)  If we find we need to back
14636                      * off the folded string, we do so, and then map that
14637                      * position back to the original unfolded node, which then
14638                      * gets output, truncated at that spot */
14639
14640                     char * redo_p = RExC_parse;
14641                     char * redo_e;
14642                     char * old_redo_e;
14643
14644                     /* Allow enough space assuming a single byte input folds to
14645                      * a single byte output, plus assume that the two unparsed
14646                      * characters (that we may need) fold to the largest number
14647                      * of bytes possible, plus extra for one more worst case
14648                      * scenario.  In the loop below, if we start eating into
14649                      * that final spare space, we enlarge this initial space */
14650                     Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14651
14652                     Newxz(locfold_buf, size, char);
14653                     Newxz(loc_correspondence, size, Size_t);
14654
14655                     /* Redo this node's parse, folding into 'locfold_buf' */
14656                     redo_p = RExC_parse;
14657                     old_redo_e = redo_e = locfold_buf;
14658                     while (redo_p <= oldp) {
14659
14660                         old_redo_e = redo_e;
14661                         loc_correspondence[redo_e - locfold_buf]
14662                                                         = redo_p - RExC_parse;
14663
14664                         if (UTF) {
14665                             Size_t added_len;
14666
14667                             (void) _to_utf8_fold_flags((U8 *) redo_p,
14668                                                        (U8 *) RExC_end,
14669                                                        (U8 *) redo_e,
14670                                                        &added_len,
14671                                                        FOLD_FLAGS_FULL);
14672                             redo_e += added_len;
14673                             redo_p += UTF8SKIP(redo_p);
14674                         }
14675                         else {
14676
14677                             /* Note that if this code is run on some ancient
14678                              * Unicode versions, SHARP S doesn't fold to 'ss',
14679                              * but rather than clutter the code with #ifdef's,
14680                              * as is done above, we ignore that possibility.
14681                              * This is ok because this code doesn't affect what
14682                              * gets matched, but merely where the node gets
14683                              * split */
14684                             if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14685                                 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14686                             }
14687                             else {
14688                                 *redo_e++ = 's';
14689                                 *redo_e++ = 's';
14690                             }
14691                             redo_p++;
14692                         }
14693
14694
14695                         /* If we're getting so close to the end that a
14696                          * worst-case fold in the next character would cause us
14697                          * to overflow, increase, assuming one byte output byte
14698                          * per one byte input one, plus room for another worst
14699                          * case fold */
14700                         if (   redo_p <= oldp
14701                             && redo_e > locfold_buf + size
14702                                                     - (UTF8_MAXBYTES_CASE + 1))
14703                         {
14704                             Size_t new_size = size
14705                                             + (oldp - redo_p)
14706                                             + UTF8_MAXBYTES_CASE + 1;
14707                             Ptrdiff_t e_offset = redo_e - locfold_buf;
14708
14709                             Renew(locfold_buf, new_size, char);
14710                             Renew(loc_correspondence, new_size, Size_t);
14711                             size = new_size;
14712
14713                             redo_e = locfold_buf + e_offset;
14714                         }
14715                     }
14716
14717                     /* Set so that things are in terms of the folded, temporary
14718                      * string */
14719                     s = old_redo_e;
14720                     s_start = locfold_buf;
14721                     e = redo_e;
14722
14723                 }
14724
14725                 /* Here, we have 's', 's_start' and 'e' set up to point to the
14726                  * input that goes into the node, folded.
14727                  *
14728                  * If the final character of the node and the fold of ender
14729                  * form the first two characters of a three character fold, we
14730                  * need to peek ahead at the next (unparsed) character in the
14731                  * input to determine if the three actually do form such a
14732                  * fold.  Just looking at that character is not generally
14733                  * sufficient, as it could be, for example, an escape sequence
14734                  * that evaluates to something else, and it needs to be folded.
14735                  *
14736                  * khw originally thought to just go through the parse loop one
14737                  * extra time, but that doesn't work easily as that iteration
14738                  * could cause things to think that the parse is over and to
14739                  * goto loopdone.  The character could be a '$' for example, or
14740                  * the character beyond could be a quantifier, and other
14741                  * glitches as well.
14742                  *
14743                  * The solution used here for peeking ahead is to look at that
14744                  * next character.  If it isn't ASCII punctuation, then it will
14745                  * be something that continues in an EXACTish node if there
14746                  * were space.  We append the fold of it to s, having reserved
14747                  * enough room in s0 for the purpose.  If we can't reasonably
14748                  * peek ahead, we instead assume the worst case: that it is
14749                  * something that would form the completion of a multi-char
14750                  * fold.
14751                  *
14752                  * If we can't split between s and ender, we work backwards
14753                  * character-by-character down to s0.  At each current point
14754                  * see if we are at the beginning of a multi-char fold.  If so,
14755                  * that means we would be splitting the fold across nodes, and
14756                  * so we back up one and try again.
14757                  *
14758                  * If we're not at the beginning, we still could be at the
14759                  * final two characters of a (rare) three character fold.  We
14760                  * check if the sequence starting at the character before the
14761                  * current position (and including the current and next
14762                  * characters) is a three character fold.  If not, the node can
14763                  * be split here.  If it is, we have to backup two characters
14764                  * and try again.
14765                  *
14766                  * Otherwise, the node can be split at the current position.
14767                  *
14768                  * The same logic is used for UTF-8 patterns and not */
14769                 if (UTF) {
14770                     Size_t added_len;
14771
14772                     /* Append the fold of ender */
14773                     (void) _to_uni_fold_flags(
14774                         ender,
14775                         (U8 *) e,
14776                         &added_len,
14777                         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14778                                         ? FOLD_FLAGS_NOMIX_ASCII
14779                                         : 0));
14780                     e += added_len;
14781
14782                     /* 's' and the character folded to by ender may be the
14783                      * first two of a three-character fold, in which case the
14784                      * node should not be split here.  That may mean examining
14785                      * the so-far unparsed character starting at 'p'.  But if
14786                      * ender folded to more than one character, we already have
14787                      * three characters to look at.  Also, we first check if
14788                      * the sequence consisting of s and the next character form
14789                      * the first two of some three character fold.  If not,
14790                      * there's no need to peek ahead. */
14791                     if (   added_len <= UTF8SKIP(e - added_len)
14792                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
14793                     {
14794                         /* Here, the two do form the beginning of a potential
14795                          * three character fold.  The unexamined character may
14796                          * or may not complete it.  Peek at it.  It might be
14797                          * something that ends the node or an escape sequence,
14798                          * in which case we don't know without a lot of work
14799                          * what it evaluates to, so we have to assume the worst
14800                          * case: that it does complete the fold, and so we
14801                          * can't split here.  All such instances  will have
14802                          * that character be an ASCII punctuation character,
14803                          * like a backslash.  So, for that case, backup one and
14804                          * drop down to try at that position */
14805                         if (isPUNCT(*p)) {
14806                             s = (char *) utf8_hop_back((U8 *) s, -1,
14807                                        (U8 *) s_start);
14808                             backed_up = TRUE;
14809                         }
14810                         else {
14811                             /* Here, since it's not punctuation, it must be a
14812                              * real character, and we can append its fold to
14813                              * 'e' (having deliberately reserved enough space
14814                              * for this eventuality) and drop down to check if
14815                              * the three actually do form a folded sequence */
14816                             (void) _to_utf8_fold_flags(
14817                                 (U8 *) p, (U8 *) RExC_end,
14818                                 (U8 *) e,
14819                                 &added_len,
14820                                 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14821                                                 ? FOLD_FLAGS_NOMIX_ASCII
14822                                                 : 0));
14823                             e += added_len;
14824                         }
14825                     }
14826
14827                     /* Here, we either have three characters available in
14828                      * sequence starting at 's', or we have two characters and
14829                      * know that the following one can't possibly be part of a
14830                      * three character fold.  We go through the node backwards
14831                      * until we find a place where we can split it without
14832                      * breaking apart a multi-character fold.  At any given
14833                      * point we have to worry about if such a fold begins at
14834                      * the current 's', and also if a three-character fold
14835                      * begins at s-1, (containing s and s+1).  Splitting in
14836                      * either case would break apart a fold */
14837                     do {
14838                         char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
14839                                                             (U8 *) s_start);
14840
14841                         /* If is a multi-char fold, can't split here.  Backup
14842                          * one char and try again */
14843                         if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
14844                             s = prev_s;
14845                             backed_up = TRUE;
14846                             continue;
14847                         }
14848
14849                         /* If the two characters beginning at 's' are part of a
14850                          * three character fold starting at the character
14851                          * before s, we can't split either before or after s.
14852                          * Backup two chars and try again */
14853                         if (   LIKELY(s > s_start)
14854                             && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
14855                         {
14856                             s = prev_s;
14857                             s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
14858                             backed_up = TRUE;
14859                             continue;
14860                         }
14861
14862                         /* Here there's no multi-char fold between s and the
14863                          * next character following it.  We can split */
14864                         splittable = TRUE;
14865                         break;
14866
14867                     } while (s > s_start); /* End of loops backing up through the node */
14868
14869                     /* Here we either couldn't find a place to split the node,
14870                      * or else we broke out of the loop setting 'splittable' to
14871                      * true.  In the latter case, the place to split is between
14872                      * the first and second characters in the sequence starting
14873                      * at 's' */
14874                     if (splittable) {
14875                         s += UTF8SKIP(s);
14876                     }
14877                 }
14878                 else {  /* Pattern not UTF-8 */
14879                     if (   ender != LATIN_SMALL_LETTER_SHARP_S
14880                         || ASCII_FOLD_RESTRICTED)
14881                     {
14882                         *e++ = toLOWER_L1(ender);
14883                     }
14884                     else {
14885                         *e++ = 's';
14886                         *e++ = 's';
14887                     }
14888
14889                     if (   e - s  <= 1
14890                         && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
14891                     {
14892                         if (isPUNCT(*p)) {
14893                             s--;
14894                             backed_up = TRUE;
14895                         }
14896                         else {
14897                             if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
14898                                 || ASCII_FOLD_RESTRICTED)
14899                             {
14900                                 *e++ = toLOWER_L1(ender);
14901                             }
14902                             else {
14903                                 *e++ = 's';
14904                                 *e++ = 's';
14905                             }
14906                         }
14907                     }
14908
14909                     do {
14910                         if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
14911                             s--;
14912                             backed_up = TRUE;
14913                             continue;
14914                         }
14915
14916                         if (   LIKELY(s > s_start)
14917                             && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
14918                         {
14919                             s -= 2;
14920                             backed_up = TRUE;
14921                             continue;
14922                         }
14923
14924                         splittable = TRUE;
14925                         break;
14926
14927                     } while (s > s_start);
14928
14929                     if (splittable) {
14930                         s++;
14931                     }
14932                 }
14933
14934                 /* Here, we are done backing up.  If we didn't backup at all
14935                  * (the likely case), just proceed */
14936                 if (backed_up) {
14937
14938                    /* If we did find a place to split, reparse the entire node
14939                     * stopping where we have calculated. */
14940                     if (splittable) {
14941
14942                        /* If we created a temporary folded string under /l, we
14943                         * have to map that back to the original */
14944                         if (need_to_fold_loc) {
14945                             upper_fill = loc_correspondence[s - s_start];
14946                             Safefree(locfold_buf);
14947                             Safefree(loc_correspondence);
14948
14949                             if (upper_fill == 0) {
14950                                 FAIL2("panic: loc_correspondence[%d] is 0",
14951                                       (int) (s - s_start));
14952                             }
14953                         }
14954                         else {
14955                             upper_fill = s - s0;
14956                         }
14957                         goto reparse;
14958                     }
14959                     else if (need_to_fold_loc) {
14960                         Safefree(locfold_buf);
14961                         Safefree(loc_correspondence);
14962                     }
14963
14964                     /* Here the node consists entirely of non-final multi-char
14965                      * folds.  (Likely it is all 'f's or all 's's.)  There's no
14966                      * decent place to split it, so give up and just take the
14967                      * whole thing */
14968                     len = old_s - s0;
14969                 }
14970             }   /* End of verifying node ends with an appropriate char */
14971
14972             /* We need to start the next node at the character that didn't fit
14973              * in this one */
14974             p = oldp;
14975
14976           loopdone:   /* Jumped to when encounters something that shouldn't be
14977                          in the node */
14978
14979             /* Free up any over-allocated space; cast is to silence bogus
14980              * warning in MS VC */
14981             change_engine_size(pRExC_state,
14982                         - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
14983
14984             /* I (khw) don't know if you can get here with zero length, but the
14985              * old code handled this situation by creating a zero-length EXACT
14986              * node.  Might as well be NOTHING instead */
14987             if (len == 0) {
14988                 OP(REGNODE_p(ret)) = NOTHING;
14989             }
14990             else {
14991
14992                 /* If the node type is EXACT here, check to see if it
14993                  * should be EXACTL, or EXACT_REQ8. */
14994                 if (node_type == EXACT) {
14995                     if (LOC) {
14996                         node_type = EXACTL;
14997                     }
14998                     else if (requires_utf8_target) {
14999                         node_type = EXACT_REQ8;
15000                     }
15001                 }
15002                 else if (node_type == LEXACT) {
15003                     if (requires_utf8_target) {
15004                         node_type = LEXACT_REQ8;
15005                     }
15006                 }
15007                 else if (FOLD) {
15008                     if (    UNLIKELY(has_micro_sign || has_ss)
15009                         && (node_type == EXACTFU || (   node_type == EXACTF
15010                                                      && maybe_exactfu)))
15011                     {   /* These two conditions are problematic in non-UTF-8
15012                            EXACTFU nodes. */
15013                         assert(! UTF);
15014                         node_type = EXACTFUP;
15015                     }
15016                     else if (node_type == EXACTFL) {
15017
15018                         /* 'maybe_exactfu' is deliberately set above to
15019                          * indicate this node type, where all code points in it
15020                          * are above 255 */
15021                         if (maybe_exactfu) {
15022                             node_type = EXACTFLU8;
15023                         }
15024                         else if (UNLIKELY(
15025                              _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15026                         {
15027                             /* A character that folds to more than one will
15028                              * match multiple characters, so can't be SIMPLE.
15029                              * We don't have to worry about this with EXACTFLU8
15030                              * nodes just above, as they have already been
15031                              * folded (since the fold doesn't vary at run
15032                              * time).  Here, if the final character in the node
15033                              * folds to multiple, it can't be simple.  (This
15034                              * only has an effect if the node has only a single
15035                              * character, hence the final one, as elsewhere we
15036                              * turn off simple for nodes whose length > 1 */
15037                             maybe_SIMPLE = 0;
15038                         }
15039                     }
15040                     else if (node_type == EXACTF) {  /* Means is /di */
15041
15042                         /* This intermediate variable is needed solely because
15043                          * the asserts in the macro where used exceed Win32's
15044                          * literal string capacity */
15045                         char first_char = * STRING(REGNODE_p(ret));
15046
15047                         /* If 'maybe_exactfu' is clear, then we need to stay
15048                          * /di.  If it is set, it means there are no code
15049                          * points that match differently depending on UTF8ness
15050                          * of the target string, so it can become an EXACTFU
15051                          * node */
15052                         if (! maybe_exactfu) {
15053                             RExC_seen_d_op = TRUE;
15054                         }
15055                         else if (   isALPHA_FOLD_EQ(first_char, 's')
15056                                  || isALPHA_FOLD_EQ(ender, 's'))
15057                         {
15058                             /* But, if the node begins or ends in an 's' we
15059                              * have to defer changing it into an EXACTFU, as
15060                              * the node could later get joined with another one
15061                              * that ends or begins with 's' creating an 'ss'
15062                              * sequence which would then wrongly match the
15063                              * sharp s without the target being UTF-8.  We
15064                              * create a special node that we resolve later when
15065                              * we join nodes together */
15066
15067                             node_type = EXACTFU_S_EDGE;
15068                         }
15069                         else {
15070                             node_type = EXACTFU;
15071                         }
15072                     }
15073
15074                     if (requires_utf8_target && node_type == EXACTFU) {
15075                         node_type = EXACTFU_REQ8;
15076                     }
15077                 }
15078
15079                 OP(REGNODE_p(ret)) = node_type;
15080                 setSTR_LEN(REGNODE_p(ret), len);
15081                 RExC_emit += STR_SZ(len);
15082
15083                 /* If the node isn't a single character, it can't be SIMPLE */
15084                 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15085                     maybe_SIMPLE = 0;
15086                 }
15087
15088                 *flagp |= HASWIDTH | maybe_SIMPLE;
15089             }
15090
15091             Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15092             RExC_parse = p;
15093
15094             {
15095                 /* len is STRLEN which is unsigned, need to copy to signed */
15096                 IV iv = len;
15097                 if (iv < 0)
15098                     vFAIL("Internal disaster");
15099             }
15100
15101         } /* End of label 'defchar:' */
15102         break;
15103     } /* End of giant switch on input character */
15104
15105     /* Position parse to next real character */
15106     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15107                                             FALSE /* Don't force to /x */ );
15108     if (   *RExC_parse == '{'
15109         && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15110     {
15111         if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15112             RExC_parse++;
15113             vFAIL("Unescaped left brace in regex is illegal here");
15114         }
15115         ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15116                                   " passed through");
15117     }
15118
15119     return(ret);
15120 }
15121
15122
15123 STATIC void
15124 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15125 {
15126     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
15127      * sets up the bitmap and any flags, removing those code points from the
15128      * inversion list, setting it to NULL should it become completely empty */
15129
15130     dVAR;
15131
15132     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15133     assert(PL_regkind[OP(node)] == ANYOF);
15134
15135     /* There is no bitmap for this node type */
15136     if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15137         return;
15138     }
15139
15140     ANYOF_BITMAP_ZERO(node);
15141     if (*invlist_ptr) {
15142
15143         /* This gets set if we actually need to modify things */
15144         bool change_invlist = FALSE;
15145
15146         UV start, end;
15147
15148         /* Start looking through *invlist_ptr */
15149         invlist_iterinit(*invlist_ptr);
15150         while (invlist_iternext(*invlist_ptr, &start, &end)) {
15151             UV high;
15152             int i;
15153
15154             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15155                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15156             }
15157
15158             /* Quit if are above what we should change */
15159             if (start >= NUM_ANYOF_CODE_POINTS) {
15160                 break;
15161             }
15162
15163             change_invlist = TRUE;
15164
15165             /* Set all the bits in the range, up to the max that we are doing */
15166             high = (end < NUM_ANYOF_CODE_POINTS - 1)
15167                    ? end
15168                    : NUM_ANYOF_CODE_POINTS - 1;
15169             for (i = start; i <= (int) high; i++) {
15170                 if (! ANYOF_BITMAP_TEST(node, i)) {
15171                     ANYOF_BITMAP_SET(node, i);
15172                 }
15173             }
15174         }
15175         invlist_iterfinish(*invlist_ptr);
15176
15177         /* Done with loop; remove any code points that are in the bitmap from
15178          * *invlist_ptr; similarly for code points above the bitmap if we have
15179          * a flag to match all of them anyways */
15180         if (change_invlist) {
15181             _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15182         }
15183         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15184             _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15185         }
15186
15187         /* If have completely emptied it, remove it completely */
15188         if (_invlist_len(*invlist_ptr) == 0) {
15189             SvREFCNT_dec_NN(*invlist_ptr);
15190             *invlist_ptr = NULL;
15191         }
15192     }
15193 }
15194
15195 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15196    Character classes ([:foo:]) can also be negated ([:^foo:]).
15197    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15198    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15199    but trigger failures because they are currently unimplemented. */
15200
15201 #define POSIXCC_DONE(c)   ((c) == ':')
15202 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15203 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15204 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15205
15206 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
15207 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
15208 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
15209
15210 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15211
15212 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15213  * routine. q.v. */
15214 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
15215         if (posix_warnings) {                                               \
15216             if (! RExC_warn_text ) RExC_warn_text =                         \
15217                                          (AV *) sv_2mortal((SV *) newAV()); \
15218             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
15219                                              WARNING_PREFIX                 \
15220                                              text                           \
15221                                              REPORT_LOCATION,               \
15222                                              REPORT_LOCATION_ARGS(p)));     \
15223         }                                                                   \
15224     } STMT_END
15225 #define CLEAR_POSIX_WARNINGS()                                              \
15226     STMT_START {                                                            \
15227         if (posix_warnings && RExC_warn_text)                               \
15228             av_clear(RExC_warn_text);                                       \
15229     } STMT_END
15230
15231 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
15232     STMT_START {                                                            \
15233         CLEAR_POSIX_WARNINGS();                                             \
15234         return ret;                                                         \
15235     } STMT_END
15236
15237 STATIC int
15238 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15239
15240     const char * const s,      /* Where the putative posix class begins.
15241                                   Normally, this is one past the '['.  This
15242                                   parameter exists so it can be somewhere
15243                                   besides RExC_parse. */
15244     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15245                                   NULL */
15246     AV ** posix_warnings,      /* Where to place any generated warnings, or
15247                                   NULL */
15248     const bool check_only      /* Don't die if error */
15249 )
15250 {
15251     /* This parses what the caller thinks may be one of the three POSIX
15252      * constructs:
15253      *  1) a character class, like [:blank:]
15254      *  2) a collating symbol, like [. .]
15255      *  3) an equivalence class, like [= =]
15256      * In the latter two cases, it croaks if it finds a syntactically legal
15257      * one, as these are not handled by Perl.
15258      *
15259      * The main purpose is to look for a POSIX character class.  It returns:
15260      *  a) the class number
15261      *      if it is a completely syntactically and semantically legal class.
15262      *      'updated_parse_ptr', if not NULL, is set to point to just after the
15263      *      closing ']' of the class
15264      *  b) OOB_NAMEDCLASS
15265      *      if it appears that one of the three POSIX constructs was meant, but
15266      *      its specification was somehow defective.  'updated_parse_ptr', if
15267      *      not NULL, is set to point to the character just after the end
15268      *      character of the class.  See below for handling of warnings.
15269      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15270      *      if it  doesn't appear that a POSIX construct was intended.
15271      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
15272      *      raised.
15273      *
15274      * In b) there may be errors or warnings generated.  If 'check_only' is
15275      * TRUE, then any errors are discarded.  Warnings are returned to the
15276      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
15277      * instead it is NULL, warnings are suppressed.
15278      *
15279      * The reason for this function, and its complexity is that a bracketed
15280      * character class can contain just about anything.  But it's easy to
15281      * mistype the very specific posix class syntax but yielding a valid
15282      * regular bracketed class, so it silently gets compiled into something
15283      * quite unintended.
15284      *
15285      * The solution adopted here maintains backward compatibility except that
15286      * it adds a warning if it looks like a posix class was intended but
15287      * improperly specified.  The warning is not raised unless what is input
15288      * very closely resembles one of the 14 legal posix classes.  To do this,
15289      * it uses fuzzy parsing.  It calculates how many single-character edits it
15290      * would take to transform what was input into a legal posix class.  Only
15291      * if that number is quite small does it think that the intention was a
15292      * posix class.  Obviously these are heuristics, and there will be cases
15293      * where it errs on one side or another, and they can be tweaked as
15294      * experience informs.
15295      *
15296      * The syntax for a legal posix class is:
15297      *
15298      * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15299      *
15300      * What this routine considers syntactically to be an intended posix class
15301      * is this (the comments indicate some restrictions that the pattern
15302      * doesn't show):
15303      *
15304      *  qr/(?x: \[?                         # The left bracket, possibly
15305      *                                      # omitted
15306      *          \h*                         # possibly followed by blanks
15307      *          (?: \^ \h* )?               # possibly a misplaced caret
15308      *          [:;]?                       # The opening class character,
15309      *                                      # possibly omitted.  A typo
15310      *                                      # semi-colon can also be used.
15311      *          \h*
15312      *          \^?                         # possibly a correctly placed
15313      *                                      # caret, but not if there was also
15314      *                                      # a misplaced one
15315      *          \h*
15316      *          .{3,15}                     # The class name.  If there are
15317      *                                      # deviations from the legal syntax,
15318      *                                      # its edit distance must be close
15319      *                                      # to a real class name in order
15320      *                                      # for it to be considered to be
15321      *                                      # an intended posix class.
15322      *          \h*
15323      *          [[:punct:]]?                # The closing class character,
15324      *                                      # possibly omitted.  If not a colon
15325      *                                      # nor semi colon, the class name
15326      *                                      # must be even closer to a valid
15327      *                                      # one
15328      *          \h*
15329      *          \]?                         # The right bracket, possibly
15330      *                                      # omitted.
15331      *     )/
15332      *
15333      * In the above, \h must be ASCII-only.
15334      *
15335      * These are heuristics, and can be tweaked as field experience dictates.
15336      * There will be cases when someone didn't intend to specify a posix class
15337      * that this warns as being so.  The goal is to minimize these, while
15338      * maximizing the catching of things intended to be a posix class that
15339      * aren't parsed as such.
15340      */
15341
15342     const char* p             = s;
15343     const char * const e      = RExC_end;
15344     unsigned complement       = 0;      /* If to complement the class */
15345     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
15346     bool has_opening_bracket  = FALSE;
15347     bool has_opening_colon    = FALSE;
15348     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
15349                                                    valid class */
15350     const char * possible_end = NULL;   /* used for a 2nd parse pass */
15351     const char* name_start;             /* ptr to class name first char */
15352
15353     /* If the number of single-character typos the input name is away from a
15354      * legal name is no more than this number, it is considered to have meant
15355      * the legal name */
15356     int max_distance          = 2;
15357
15358     /* to store the name.  The size determines the maximum length before we
15359      * decide that no posix class was intended.  Should be at least
15360      * sizeof("alphanumeric") */
15361     UV input_text[15];
15362     STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15363
15364     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15365
15366     CLEAR_POSIX_WARNINGS();
15367
15368     if (p >= e) {
15369         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15370     }
15371
15372     if (*(p - 1) != '[') {
15373         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15374         found_problem = TRUE;
15375     }
15376     else {
15377         has_opening_bracket = TRUE;
15378     }
15379
15380     /* They could be confused and think you can put spaces between the
15381      * components */
15382     if (isBLANK(*p)) {
15383         found_problem = TRUE;
15384
15385         do {
15386             p++;
15387         } while (p < e && isBLANK(*p));
15388
15389         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15390     }
15391
15392     /* For [. .] and [= =].  These are quite different internally from [: :],
15393      * so they are handled separately.  */
15394     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15395                                             and 1 for at least one char in it
15396                                           */
15397     {
15398         const char open_char  = *p;
15399         const char * temp_ptr = p + 1;
15400
15401         /* These two constructs are not handled by perl, and if we find a
15402          * syntactically valid one, we croak.  khw, who wrote this code, finds
15403          * this explanation of them very unclear:
15404          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15405          * And searching the rest of the internet wasn't very helpful either.
15406          * It looks like just about any byte can be in these constructs,
15407          * depending on the locale.  But unless the pattern is being compiled
15408          * under /l, which is very rare, Perl runs under the C or POSIX locale.
15409          * In that case, it looks like [= =] isn't allowed at all, and that
15410          * [. .] could be any single code point, but for longer strings the
15411          * constituent characters would have to be the ASCII alphabetics plus
15412          * the minus-hyphen.  Any sensible locale definition would limit itself
15413          * to these.  And any portable one definitely should.  Trying to parse
15414          * the general case is a nightmare (see [perl #127604]).  So, this code
15415          * looks only for interiors of these constructs that match:
15416          *      qr/.|[-\w]{2,}/
15417          * Using \w relaxes the apparent rules a little, without adding much
15418          * danger of mistaking something else for one of these constructs.
15419          *
15420          * [. .] in some implementations described on the internet is usable to
15421          * escape a character that otherwise is special in bracketed character
15422          * classes.  For example [.].] means a literal right bracket instead of
15423          * the ending of the class
15424          *
15425          * [= =] can legitimately contain a [. .] construct, but we don't
15426          * handle this case, as that [. .] construct will later get parsed
15427          * itself and croak then.  And [= =] is checked for even when not under
15428          * /l, as Perl has long done so.
15429          *
15430          * The code below relies on there being a trailing NUL, so it doesn't
15431          * have to keep checking if the parse ptr < e.
15432          */
15433         if (temp_ptr[1] == open_char) {
15434             temp_ptr++;
15435         }
15436         else while (    temp_ptr < e
15437                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15438         {
15439             temp_ptr++;
15440         }
15441
15442         if (*temp_ptr == open_char) {
15443             temp_ptr++;
15444             if (*temp_ptr == ']') {
15445                 temp_ptr++;
15446                 if (! found_problem && ! check_only) {
15447                     RExC_parse = (char *) temp_ptr;
15448                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
15449                             "extensions", open_char, open_char);
15450                 }
15451
15452                 /* Here, the syntax wasn't completely valid, or else the call
15453                  * is to check-only */
15454                 if (updated_parse_ptr) {
15455                     *updated_parse_ptr = (char *) temp_ptr;
15456                 }
15457
15458                 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15459             }
15460         }
15461
15462         /* If we find something that started out to look like one of these
15463          * constructs, but isn't, we continue below so that it can be checked
15464          * for being a class name with a typo of '.' or '=' instead of a colon.
15465          * */
15466     }
15467
15468     /* Here, we think there is a possibility that a [: :] class was meant, and
15469      * we have the first real character.  It could be they think the '^' comes
15470      * first */
15471     if (*p == '^') {
15472         found_problem = TRUE;
15473         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15474         complement = 1;
15475         p++;
15476
15477         if (isBLANK(*p)) {
15478             found_problem = TRUE;
15479
15480             do {
15481                 p++;
15482             } while (p < e && isBLANK(*p));
15483
15484             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15485         }
15486     }
15487
15488     /* But the first character should be a colon, which they could have easily
15489      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15490      * distinguish from a colon, so treat that as a colon).  */
15491     if (*p == ':') {
15492         p++;
15493         has_opening_colon = TRUE;
15494     }
15495     else if (*p == ';') {
15496         found_problem = TRUE;
15497         p++;
15498         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15499         has_opening_colon = TRUE;
15500     }
15501     else {
15502         found_problem = TRUE;
15503         ADD_POSIX_WARNING(p, "there must be a starting ':'");
15504
15505         /* Consider an initial punctuation (not one of the recognized ones) to
15506          * be a left terminator */
15507         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15508             p++;
15509         }
15510     }
15511
15512     /* They may think that you can put spaces between the components */
15513     if (isBLANK(*p)) {
15514         found_problem = TRUE;
15515
15516         do {
15517             p++;
15518         } while (p < e && isBLANK(*p));
15519
15520         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15521     }
15522
15523     if (*p == '^') {
15524
15525         /* We consider something like [^:^alnum:]] to not have been intended to
15526          * be a posix class, but XXX maybe we should */
15527         if (complement) {
15528             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15529         }
15530
15531         complement = 1;
15532         p++;
15533     }
15534
15535     /* Again, they may think that you can put spaces between the components */
15536     if (isBLANK(*p)) {
15537         found_problem = TRUE;
15538
15539         do {
15540             p++;
15541         } while (p < e && isBLANK(*p));
15542
15543         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15544     }
15545
15546     if (*p == ']') {
15547
15548         /* XXX This ']' may be a typo, and something else was meant.  But
15549          * treating it as such creates enough complications, that that
15550          * possibility isn't currently considered here.  So we assume that the
15551          * ']' is what is intended, and if we've already found an initial '[',
15552          * this leaves this construct looking like [:] or [:^], which almost
15553          * certainly weren't intended to be posix classes */
15554         if (has_opening_bracket) {
15555             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15556         }
15557
15558         /* But this function can be called when we parse the colon for
15559          * something like qr/[alpha:]]/, so we back up to look for the
15560          * beginning */
15561         p--;
15562
15563         if (*p == ';') {
15564             found_problem = TRUE;
15565             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15566         }
15567         else if (*p != ':') {
15568
15569             /* XXX We are currently very restrictive here, so this code doesn't
15570              * consider the possibility that, say, /[alpha.]]/ was intended to
15571              * be a posix class. */
15572             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15573         }
15574
15575         /* Here we have something like 'foo:]'.  There was no initial colon,
15576          * and we back up over 'foo.  XXX Unlike the going forward case, we
15577          * don't handle typos of non-word chars in the middle */
15578         has_opening_colon = FALSE;
15579         p--;
15580
15581         while (p > RExC_start && isWORDCHAR(*p)) {
15582             p--;
15583         }
15584         p++;
15585
15586         /* Here, we have positioned ourselves to where we think the first
15587          * character in the potential class is */
15588     }
15589
15590     /* Now the interior really starts.  There are certain key characters that
15591      * can end the interior, or these could just be typos.  To catch both
15592      * cases, we may have to do two passes.  In the first pass, we keep on
15593      * going unless we come to a sequence that matches
15594      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
15595      * This means it takes a sequence to end the pass, so two typos in a row if
15596      * that wasn't what was intended.  If the class is perfectly formed, just
15597      * this one pass is needed.  We also stop if there are too many characters
15598      * being accumulated, but this number is deliberately set higher than any
15599      * real class.  It is set high enough so that someone who thinks that
15600      * 'alphanumeric' is a correct name would get warned that it wasn't.
15601      * While doing the pass, we keep track of where the key characters were in
15602      * it.  If we don't find an end to the class, and one of the key characters
15603      * was found, we redo the pass, but stop when we get to that character.
15604      * Thus the key character was considered a typo in the first pass, but a
15605      * terminator in the second.  If two key characters are found, we stop at
15606      * the second one in the first pass.  Again this can miss two typos, but
15607      * catches a single one
15608      *
15609      * In the first pass, 'possible_end' starts as NULL, and then gets set to
15610      * point to the first key character.  For the second pass, it starts as -1.
15611      * */
15612
15613     name_start = p;
15614   parse_name:
15615     {
15616         bool has_blank               = FALSE;
15617         bool has_upper               = FALSE;
15618         bool has_terminating_colon   = FALSE;
15619         bool has_terminating_bracket = FALSE;
15620         bool has_semi_colon          = FALSE;
15621         unsigned int name_len        = 0;
15622         int punct_count              = 0;
15623
15624         while (p < e) {
15625
15626             /* Squeeze out blanks when looking up the class name below */
15627             if (isBLANK(*p) ) {
15628                 has_blank = TRUE;
15629                 found_problem = TRUE;
15630                 p++;
15631                 continue;
15632             }
15633
15634             /* The name will end with a punctuation */
15635             if (isPUNCT(*p)) {
15636                 const char * peek = p + 1;
15637
15638                 /* Treat any non-']' punctuation followed by a ']' (possibly
15639                  * with intervening blanks) as trying to terminate the class.
15640                  * ']]' is very likely to mean a class was intended (but
15641                  * missing the colon), but the warning message that gets
15642                  * generated shows the error position better if we exit the
15643                  * loop at the bottom (eventually), so skip it here. */
15644                 if (*p != ']') {
15645                     if (peek < e && isBLANK(*peek)) {
15646                         has_blank = TRUE;
15647                         found_problem = TRUE;
15648                         do {
15649                             peek++;
15650                         } while (peek < e && isBLANK(*peek));
15651                     }
15652
15653                     if (peek < e && *peek == ']') {
15654                         has_terminating_bracket = TRUE;
15655                         if (*p == ':') {
15656                             has_terminating_colon = TRUE;
15657                         }
15658                         else if (*p == ';') {
15659                             has_semi_colon = TRUE;
15660                             has_terminating_colon = TRUE;
15661                         }
15662                         else {
15663                             found_problem = TRUE;
15664                         }
15665                         p = peek + 1;
15666                         goto try_posix;
15667                     }
15668                 }
15669
15670                 /* Here we have punctuation we thought didn't end the class.
15671                  * Keep track of the position of the key characters that are
15672                  * more likely to have been class-enders */
15673                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15674
15675                     /* Allow just one such possible class-ender not actually
15676                      * ending the class. */
15677                     if (possible_end) {
15678                         break;
15679                     }
15680                     possible_end = p;
15681                 }
15682
15683                 /* If we have too many punctuation characters, no use in
15684                  * keeping going */
15685                 if (++punct_count > max_distance) {
15686                     break;
15687                 }
15688
15689                 /* Treat the punctuation as a typo. */
15690                 input_text[name_len++] = *p;
15691                 p++;
15692             }
15693             else if (isUPPER(*p)) { /* Use lowercase for lookup */
15694                 input_text[name_len++] = toLOWER(*p);
15695                 has_upper = TRUE;
15696                 found_problem = TRUE;
15697                 p++;
15698             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15699                 input_text[name_len++] = *p;
15700                 p++;
15701             }
15702             else {
15703                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15704                 p+= UTF8SKIP(p);
15705             }
15706
15707             /* The declaration of 'input_text' is how long we allow a potential
15708              * class name to be, before saying they didn't mean a class name at
15709              * all */
15710             if (name_len >= C_ARRAY_LENGTH(input_text)) {
15711                 break;
15712             }
15713         }
15714
15715         /* We get to here when the possible class name hasn't been properly
15716          * terminated before:
15717          *   1) we ran off the end of the pattern; or
15718          *   2) found two characters, each of which might have been intended to
15719          *      be the name's terminator
15720          *   3) found so many punctuation characters in the purported name,
15721          *      that the edit distance to a valid one is exceeded
15722          *   4) we decided it was more characters than anyone could have
15723          *      intended to be one. */
15724
15725         found_problem = TRUE;
15726
15727         /* In the final two cases, we know that looking up what we've
15728          * accumulated won't lead to a match, even a fuzzy one. */
15729         if (   name_len >= C_ARRAY_LENGTH(input_text)
15730             || punct_count > max_distance)
15731         {
15732             /* If there was an intermediate key character that could have been
15733              * an intended end, redo the parse, but stop there */
15734             if (possible_end && possible_end != (char *) -1) {
15735                 possible_end = (char *) -1; /* Special signal value to say
15736                                                we've done a first pass */
15737                 p = name_start;
15738                 goto parse_name;
15739             }
15740
15741             /* Otherwise, it can't have meant to have been a class */
15742             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15743         }
15744
15745         /* If we ran off the end, and the final character was a punctuation
15746          * one, back up one, to look at that final one just below.  Later, we
15747          * will restore the parse pointer if appropriate */
15748         if (name_len && p == e && isPUNCT(*(p-1))) {
15749             p--;
15750             name_len--;
15751         }
15752
15753         if (p < e && isPUNCT(*p)) {
15754             if (*p == ']') {
15755                 has_terminating_bracket = TRUE;
15756
15757                 /* If this is a 2nd ']', and the first one is just below this
15758                  * one, consider that to be the real terminator.  This gives a
15759                  * uniform and better positioning for the warning message  */
15760                 if (   possible_end
15761                     && possible_end != (char *) -1
15762                     && *possible_end == ']'
15763                     && name_len && input_text[name_len - 1] == ']')
15764                 {
15765                     name_len--;
15766                     p = possible_end;
15767
15768                     /* And this is actually equivalent to having done the 2nd
15769                      * pass now, so set it to not try again */
15770                     possible_end = (char *) -1;
15771                 }
15772             }
15773             else {
15774                 if (*p == ':') {
15775                     has_terminating_colon = TRUE;
15776                 }
15777                 else if (*p == ';') {
15778                     has_semi_colon = TRUE;
15779                     has_terminating_colon = TRUE;
15780                 }
15781                 p++;
15782             }
15783         }
15784
15785     try_posix:
15786
15787         /* Here, we have a class name to look up.  We can short circuit the
15788          * stuff below for short names that can't possibly be meant to be a
15789          * class name.  (We can do this on the first pass, as any second pass
15790          * will yield an even shorter name) */
15791         if (name_len < 3) {
15792             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15793         }
15794
15795         /* Find which class it is.  Initially switch on the length of the name.
15796          * */
15797         switch (name_len) {
15798             case 4:
15799                 if (memEQs(name_start, 4, "word")) {
15800                     /* this is not POSIX, this is the Perl \w */
15801                     class_number = ANYOF_WORDCHAR;
15802                 }
15803                 break;
15804             case 5:
15805                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
15806                  *                        graph lower print punct space upper
15807                  * Offset 4 gives the best switch position.  */
15808                 switch (name_start[4]) {
15809                     case 'a':
15810                         if (memBEGINs(name_start, 5, "alph")) /* alpha */
15811                             class_number = ANYOF_ALPHA;
15812                         break;
15813                     case 'e':
15814                         if (memBEGINs(name_start, 5, "spac")) /* space */
15815                             class_number = ANYOF_SPACE;
15816                         break;
15817                     case 'h':
15818                         if (memBEGINs(name_start, 5, "grap")) /* graph */
15819                             class_number = ANYOF_GRAPH;
15820                         break;
15821                     case 'i':
15822                         if (memBEGINs(name_start, 5, "asci")) /* ascii */
15823                             class_number = ANYOF_ASCII;
15824                         break;
15825                     case 'k':
15826                         if (memBEGINs(name_start, 5, "blan")) /* blank */
15827                             class_number = ANYOF_BLANK;
15828                         break;
15829                     case 'l':
15830                         if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
15831                             class_number = ANYOF_CNTRL;
15832                         break;
15833                     case 'm':
15834                         if (memBEGINs(name_start, 5, "alnu")) /* alnum */
15835                             class_number = ANYOF_ALPHANUMERIC;
15836                         break;
15837                     case 'r':
15838                         if (memBEGINs(name_start, 5, "lowe")) /* lower */
15839                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
15840                         else if (memBEGINs(name_start, 5, "uppe")) /* upper */
15841                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
15842                         break;
15843                     case 't':
15844                         if (memBEGINs(name_start, 5, "digi")) /* digit */
15845                             class_number = ANYOF_DIGIT;
15846                         else if (memBEGINs(name_start, 5, "prin")) /* print */
15847                             class_number = ANYOF_PRINT;
15848                         else if (memBEGINs(name_start, 5, "punc")) /* punct */
15849                             class_number = ANYOF_PUNCT;
15850                         break;
15851                 }
15852                 break;
15853             case 6:
15854                 if (memEQs(name_start, 6, "xdigit"))
15855                     class_number = ANYOF_XDIGIT;
15856                 break;
15857         }
15858
15859         /* If the name exactly matches a posix class name the class number will
15860          * here be set to it, and the input almost certainly was meant to be a
15861          * posix class, so we can skip further checking.  If instead the syntax
15862          * is exactly correct, but the name isn't one of the legal ones, we
15863          * will return that as an error below.  But if neither of these apply,
15864          * it could be that no posix class was intended at all, or that one
15865          * was, but there was a typo.  We tease these apart by doing fuzzy
15866          * matching on the name */
15867         if (class_number == OOB_NAMEDCLASS && found_problem) {
15868             const UV posix_names[][6] = {
15869                                                 { 'a', 'l', 'n', 'u', 'm' },
15870                                                 { 'a', 'l', 'p', 'h', 'a' },
15871                                                 { 'a', 's', 'c', 'i', 'i' },
15872                                                 { 'b', 'l', 'a', 'n', 'k' },
15873                                                 { 'c', 'n', 't', 'r', 'l' },
15874                                                 { 'd', 'i', 'g', 'i', 't' },
15875                                                 { 'g', 'r', 'a', 'p', 'h' },
15876                                                 { 'l', 'o', 'w', 'e', 'r' },
15877                                                 { 'p', 'r', 'i', 'n', 't' },
15878                                                 { 'p', 'u', 'n', 'c', 't' },
15879                                                 { 's', 'p', 'a', 'c', 'e' },
15880                                                 { 'u', 'p', 'p', 'e', 'r' },
15881                                                 { 'w', 'o', 'r', 'd' },
15882                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
15883                                             };
15884             /* The names of the above all have added NULs to make them the same
15885              * size, so we need to also have the real lengths */
15886             const UV posix_name_lengths[] = {
15887                                                 sizeof("alnum") - 1,
15888                                                 sizeof("alpha") - 1,
15889                                                 sizeof("ascii") - 1,
15890                                                 sizeof("blank") - 1,
15891                                                 sizeof("cntrl") - 1,
15892                                                 sizeof("digit") - 1,
15893                                                 sizeof("graph") - 1,
15894                                                 sizeof("lower") - 1,
15895                                                 sizeof("print") - 1,
15896                                                 sizeof("punct") - 1,
15897                                                 sizeof("space") - 1,
15898                                                 sizeof("upper") - 1,
15899                                                 sizeof("word")  - 1,
15900                                                 sizeof("xdigit")- 1
15901                                             };
15902             unsigned int i;
15903             int temp_max = max_distance;    /* Use a temporary, so if we
15904                                                reparse, we haven't changed the
15905                                                outer one */
15906
15907             /* Use a smaller max edit distance if we are missing one of the
15908              * delimiters */
15909             if (   has_opening_bracket + has_opening_colon < 2
15910                 || has_terminating_bracket + has_terminating_colon < 2)
15911             {
15912                 temp_max--;
15913             }
15914
15915             /* See if the input name is close to a legal one */
15916             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
15917
15918                 /* Short circuit call if the lengths are too far apart to be
15919                  * able to match */
15920                 if (abs( (int) (name_len - posix_name_lengths[i]))
15921                     > temp_max)
15922                 {
15923                     continue;
15924                 }
15925
15926                 if (edit_distance(input_text,
15927                                   posix_names[i],
15928                                   name_len,
15929                                   posix_name_lengths[i],
15930                                   temp_max
15931                                  )
15932                     > -1)
15933                 { /* If it is close, it probably was intended to be a class */
15934                     goto probably_meant_to_be;
15935                 }
15936             }
15937
15938             /* Here the input name is not close enough to a valid class name
15939              * for us to consider it to be intended to be a posix class.  If
15940              * we haven't already done so, and the parse found a character that
15941              * could have been terminators for the name, but which we absorbed
15942              * as typos during the first pass, repeat the parse, signalling it
15943              * to stop at that character */
15944             if (possible_end && possible_end != (char *) -1) {
15945                 possible_end = (char *) -1;
15946                 p = name_start;
15947                 goto parse_name;
15948             }
15949
15950             /* Here neither pass found a close-enough class name */
15951             CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15952         }
15953
15954     probably_meant_to_be:
15955
15956         /* Here we think that a posix specification was intended.  Update any
15957          * parse pointer */
15958         if (updated_parse_ptr) {
15959             *updated_parse_ptr = (char *) p;
15960         }
15961
15962         /* If a posix class name was intended but incorrectly specified, we
15963          * output or return the warnings */
15964         if (found_problem) {
15965
15966             /* We set flags for these issues in the parse loop above instead of
15967              * adding them to the list of warnings, because we can parse it
15968              * twice, and we only want one warning instance */
15969             if (has_upper) {
15970                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
15971             }
15972             if (has_blank) {
15973                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15974             }
15975             if (has_semi_colon) {
15976                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15977             }
15978             else if (! has_terminating_colon) {
15979                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
15980             }
15981             if (! has_terminating_bracket) {
15982                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
15983             }
15984
15985             if (   posix_warnings
15986                 && RExC_warn_text
15987                 && av_top_index(RExC_warn_text) > -1)
15988             {
15989                 *posix_warnings = RExC_warn_text;
15990             }
15991         }
15992         else if (class_number != OOB_NAMEDCLASS) {
15993             /* If it is a known class, return the class.  The class number
15994              * #defines are structured so each complement is +1 to the normal
15995              * one */
15996             CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
15997         }
15998         else if (! check_only) {
15999
16000             /* Here, it is an unrecognized class.  This is an error (unless the
16001             * call is to check only, which we've already handled above) */
16002             const char * const complement_string = (complement)
16003                                                    ? "^"
16004                                                    : "";
16005             RExC_parse = (char *) p;
16006             vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16007                         complement_string,
16008                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16009         }
16010     }
16011
16012     return OOB_NAMEDCLASS;
16013 }
16014 #undef ADD_POSIX_WARNING
16015
16016 STATIC unsigned  int
16017 S_regex_set_precedence(const U8 my_operator) {
16018
16019     /* Returns the precedence in the (?[...]) construct of the input operator,
16020      * specified by its character representation.  The precedence follows
16021      * general Perl rules, but it extends this so that ')' and ']' have (low)
16022      * precedence even though they aren't really operators */
16023
16024     switch (my_operator) {
16025         case '!':
16026             return 5;
16027         case '&':
16028             return 4;
16029         case '^':
16030         case '|':
16031         case '+':
16032         case '-':
16033             return 3;
16034         case ')':
16035             return 2;
16036         case ']':
16037             return 1;
16038     }
16039
16040     NOT_REACHED; /* NOTREACHED */
16041     return 0;   /* Silence compiler warning */
16042 }
16043
16044 STATIC regnode_offset
16045 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16046                     I32 *flagp, U32 depth,
16047                     char * const oregcomp_parse)
16048 {
16049     /* Handle the (?[...]) construct to do set operations */
16050
16051     U8 curchar;                     /* Current character being parsed */
16052     UV start, end;                  /* End points of code point ranges */
16053     SV* final = NULL;               /* The end result inversion list */
16054     SV* result_string;              /* 'final' stringified */
16055     AV* stack;                      /* stack of operators and operands not yet
16056                                        resolved */
16057     AV* fence_stack = NULL;         /* A stack containing the positions in
16058                                        'stack' of where the undealt-with left
16059                                        parens would be if they were actually
16060                                        put there */
16061     /* The 'volatile' is a workaround for an optimiser bug
16062      * in Solaris Studio 12.3. See RT #127455 */
16063     volatile IV fence = 0;          /* Position of where most recent undealt-
16064                                        with left paren in stack is; -1 if none.
16065                                      */
16066     STRLEN len;                     /* Temporary */
16067     regnode_offset node;                  /* Temporary, and final regnode returned by
16068                                        this function */
16069     const bool save_fold = FOLD;    /* Temporary */
16070     char *save_end, *save_parse;    /* Temporaries */
16071     const bool in_locale = LOC;     /* we turn off /l during processing */
16072
16073     GET_RE_DEBUG_FLAGS_DECL;
16074
16075     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16076
16077     DEBUG_PARSE("xcls");
16078
16079     if (in_locale) {
16080         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16081     }
16082
16083     /* The use of this operator implies /u.  This is required so that the
16084      * compile time values are valid in all runtime cases */
16085     REQUIRE_UNI_RULES(flagp, 0);
16086
16087     ckWARNexperimental(RExC_parse,
16088                        WARN_EXPERIMENTAL__REGEX_SETS,
16089                        "The regex_sets feature is experimental");
16090
16091     /* Everything in this construct is a metacharacter.  Operands begin with
16092      * either a '\' (for an escape sequence), or a '[' for a bracketed
16093      * character class.  Any other character should be an operator, or
16094      * parenthesis for grouping.  Both types of operands are handled by calling
16095      * regclass() to parse them.  It is called with a parameter to indicate to
16096      * return the computed inversion list.  The parsing here is implemented via
16097      * a stack.  Each entry on the stack is a single character representing one
16098      * of the operators; or else a pointer to an operand inversion list. */
16099
16100 #define IS_OPERATOR(a) SvIOK(a)
16101 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
16102
16103     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
16104      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16105      * with pronouncing it called it Reverse Polish instead, but now that YOU
16106      * know how to pronounce it you can use the correct term, thus giving due
16107      * credit to the person who invented it, and impressing your geek friends.
16108      * Wikipedia says that the pronounciation of "Ł" has been changing so that
16109      * it is now more like an English initial W (as in wonk) than an L.)
16110      *
16111      * This means that, for example, 'a | b & c' is stored on the stack as
16112      *
16113      * c  [4]
16114      * b  [3]
16115      * &  [2]
16116      * a  [1]
16117      * |  [0]
16118      *
16119      * where the numbers in brackets give the stack [array] element number.
16120      * In this implementation, parentheses are not stored on the stack.
16121      * Instead a '(' creates a "fence" so that the part of the stack below the
16122      * fence is invisible except to the corresponding ')' (this allows us to
16123      * replace testing for parens, by using instead subtraction of the fence
16124      * position).  As new operands are processed they are pushed onto the stack
16125      * (except as noted in the next paragraph).  New operators of higher
16126      * precedence than the current final one are inserted on the stack before
16127      * the lhs operand (so that when the rhs is pushed next, everything will be
16128      * in the correct positions shown above.  When an operator of equal or
16129      * lower precedence is encountered in parsing, all the stacked operations
16130      * of equal or higher precedence are evaluated, leaving the result as the
16131      * top entry on the stack.  This makes higher precedence operations
16132      * evaluate before lower precedence ones, and causes operations of equal
16133      * precedence to left associate.
16134      *
16135      * The only unary operator '!' is immediately pushed onto the stack when
16136      * encountered.  When an operand is encountered, if the top of the stack is
16137      * a '!", the complement is immediately performed, and the '!' popped.  The
16138      * resulting value is treated as a new operand, and the logic in the
16139      * previous paragraph is executed.  Thus in the expression
16140      *      [a] + ! [b]
16141      * the stack looks like
16142      *
16143      * !
16144      * a
16145      * +
16146      *
16147      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16148      * becomes
16149      *
16150      * !b
16151      * a
16152      * +
16153      *
16154      * A ')' is treated as an operator with lower precedence than all the
16155      * aforementioned ones, which causes all operations on the stack above the
16156      * corresponding '(' to be evaluated down to a single resultant operand.
16157      * Then the fence for the '(' is removed, and the operand goes through the
16158      * algorithm above, without the fence.
16159      *
16160      * A separate stack is kept of the fence positions, so that the position of
16161      * the latest so-far unbalanced '(' is at the top of it.
16162      *
16163      * The ']' ending the construct is treated as the lowest operator of all,
16164      * so that everything gets evaluated down to a single operand, which is the
16165      * result */
16166
16167     sv_2mortal((SV *)(stack = newAV()));
16168     sv_2mortal((SV *)(fence_stack = newAV()));
16169
16170     while (RExC_parse < RExC_end) {
16171         I32 top_index;              /* Index of top-most element in 'stack' */
16172         SV** top_ptr;               /* Pointer to top 'stack' element */
16173         SV* current = NULL;         /* To contain the current inversion list
16174                                        operand */
16175         SV* only_to_avoid_leaks;
16176
16177         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16178                                 TRUE /* Force /x */ );
16179         if (RExC_parse >= RExC_end) {   /* Fail */
16180             break;
16181         }
16182
16183         curchar = UCHARAT(RExC_parse);
16184
16185 redo_curchar:
16186
16187 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16188                     /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16189         DEBUG_U(dump_regex_sets_structures(pRExC_state,
16190                                            stack, fence, fence_stack));
16191 #endif
16192
16193         top_index = av_tindex_skip_len_mg(stack);
16194
16195         switch (curchar) {
16196             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
16197             char stacked_operator;  /* The topmost operator on the 'stack'. */
16198             SV* lhs;                /* Operand to the left of the operator */
16199             SV* rhs;                /* Operand to the right of the operator */
16200             SV* fence_ptr;          /* Pointer to top element of the fence
16201                                        stack */
16202
16203             case '(':
16204
16205                 if (   RExC_parse < RExC_end - 2
16206                     && UCHARAT(RExC_parse + 1) == '?'
16207                     && UCHARAT(RExC_parse + 2) == '^')
16208                 {
16209                     /* If is a '(?', could be an embedded '(?^flags:(?[...])'.
16210                      * This happens when we have some thing like
16211                      *
16212                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16213                      *   ...
16214                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
16215                      *
16216                      * Here we would be handling the interpolated
16217                      * '$thai_or_lao'.  We handle this by a recursive call to
16218                      * ourselves which returns the inversion list the
16219                      * interpolated expression evaluates to.  We use the flags
16220                      * from the interpolated pattern. */
16221                     U32 save_flags = RExC_flags;
16222                     const char * save_parse;
16223
16224                     RExC_parse += 2;        /* Skip past the '(?' */
16225                     save_parse = RExC_parse;
16226
16227                     /* Parse the flags for the '(?'.  We already know the first
16228                      * flag to parse is a '^' */
16229                     parse_lparen_question_flags(pRExC_state);
16230
16231                     if (   RExC_parse >= RExC_end - 4
16232                         || UCHARAT(RExC_parse) != ':'
16233                         || UCHARAT(++RExC_parse) != '('
16234                         || UCHARAT(++RExC_parse) != '?'
16235                         || UCHARAT(++RExC_parse) != '[')
16236                     {
16237
16238                         /* In combination with the above, this moves the
16239                          * pointer to the point just after the first erroneous
16240                          * character. */
16241                         if (RExC_parse >= RExC_end - 4) {
16242                             RExC_parse = RExC_end;
16243                         }
16244                         else if (RExC_parse != save_parse) {
16245                             RExC_parse += (UTF)
16246                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
16247                                           : 1;
16248                         }
16249                         vFAIL("Expecting '(?flags:(?[...'");
16250                     }
16251
16252                     /* Recurse, with the meat of the embedded expression */
16253                     RExC_parse++;
16254                     if (! handle_regex_sets(pRExC_state, &current, flagp,
16255                                                     depth+1, oregcomp_parse))
16256                     {
16257                         RETURN_FAIL_ON_RESTART(*flagp, flagp);
16258                     }
16259
16260                     /* Here, 'current' contains the embedded expression's
16261                      * inversion list, and RExC_parse points to the trailing
16262                      * ']'; the next character should be the ')' */
16263                     RExC_parse++;
16264                     if (UCHARAT(RExC_parse) != ')')
16265                         vFAIL("Expecting close paren for nested extended charclass");
16266
16267                     /* Then the ')' matching the original '(' handled by this
16268                      * case: statement */
16269                     RExC_parse++;
16270                     if (UCHARAT(RExC_parse) != ')')
16271                         vFAIL("Expecting close paren for wrapper for nested extended charclass");
16272
16273                     RExC_flags = save_flags;
16274                     goto handle_operand;
16275                 }
16276
16277                 /* A regular '('.  Look behind for illegal syntax */
16278                 if (top_index - fence >= 0) {
16279                     /* If the top entry on the stack is an operator, it had
16280                      * better be a '!', otherwise the entry below the top
16281                      * operand should be an operator */
16282                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
16283                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16284                         || (   IS_OPERAND(*top_ptr)
16285                             && (   top_index - fence < 1
16286                                 || ! (stacked_ptr = av_fetch(stack,
16287                                                              top_index - 1,
16288                                                              FALSE))
16289                                 || ! IS_OPERATOR(*stacked_ptr))))
16290                     {
16291                         RExC_parse++;
16292                         vFAIL("Unexpected '(' with no preceding operator");
16293                     }
16294                 }
16295
16296                 /* Stack the position of this undealt-with left paren */
16297                 av_push(fence_stack, newSViv(fence));
16298                 fence = top_index + 1;
16299                 break;
16300
16301             case '\\':
16302                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16303                  * multi-char folds are allowed.  */
16304                 if (!regclass(pRExC_state, flagp, depth+1,
16305                               TRUE, /* means parse just the next thing */
16306                               FALSE, /* don't allow multi-char folds */
16307                               FALSE, /* don't silence non-portable warnings.  */
16308                               TRUE,  /* strict */
16309                               FALSE, /* Require return to be an ANYOF */
16310                               &current))
16311                 {
16312                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16313                     goto regclass_failed;
16314                 }
16315
16316                 /* regclass() will return with parsing just the \ sequence,
16317                  * leaving the parse pointer at the next thing to parse */
16318                 RExC_parse--;
16319                 goto handle_operand;
16320
16321             case '[':   /* Is a bracketed character class */
16322             {
16323                 /* See if this is a [:posix:] class. */
16324                 bool is_posix_class = (OOB_NAMEDCLASS
16325                             < handle_possible_posix(pRExC_state,
16326                                                 RExC_parse + 1,
16327                                                 NULL,
16328                                                 NULL,
16329                                                 TRUE /* checking only */));
16330                 /* If it is a posix class, leave the parse pointer at the '['
16331                  * to fool regclass() into thinking it is part of a
16332                  * '[[:posix:]]'. */
16333                 if (! is_posix_class) {
16334                     RExC_parse++;
16335                 }
16336
16337                 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16338                  * multi-char folds are allowed.  */
16339                 if (!regclass(pRExC_state, flagp, depth+1,
16340                                 is_posix_class, /* parse the whole char
16341                                                     class only if not a
16342                                                     posix class */
16343                                 FALSE, /* don't allow multi-char folds */
16344                                 TRUE, /* silence non-portable warnings. */
16345                                 TRUE, /* strict */
16346                                 FALSE, /* Require return to be an ANYOF */
16347                                 &current))
16348                 {
16349                     RETURN_FAIL_ON_RESTART(*flagp, flagp);
16350                     goto regclass_failed;
16351                 }
16352
16353                 if (! current) {
16354                     break;
16355                 }
16356
16357                 /* function call leaves parse pointing to the ']', except if we
16358                  * faked it */
16359                 if (is_posix_class) {
16360                     RExC_parse--;
16361                 }
16362
16363                 goto handle_operand;
16364             }
16365
16366             case ']':
16367                 if (top_index >= 1) {
16368                     goto join_operators;
16369                 }
16370
16371                 /* Only a single operand on the stack: are done */
16372                 goto done;
16373
16374             case ')':
16375                 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16376                     if (UCHARAT(RExC_parse - 1) == ']')  {
16377                         break;
16378                     }
16379                     RExC_parse++;
16380                     vFAIL("Unexpected ')'");
16381                 }
16382
16383                 /* If nothing after the fence, is missing an operand */
16384                 if (top_index - fence < 0) {
16385                     RExC_parse++;
16386                     goto bad_syntax;
16387                 }
16388                 /* If at least two things on the stack, treat this as an
16389                   * operator */
16390                 if (top_index - fence >= 1) {
16391                     goto join_operators;
16392                 }
16393
16394                 /* Here only a single thing on the fenced stack, and there is a
16395                  * fence.  Get rid of it */
16396                 fence_ptr = av_pop(fence_stack);
16397                 assert(fence_ptr);
16398                 fence = SvIV(fence_ptr);
16399                 SvREFCNT_dec_NN(fence_ptr);
16400                 fence_ptr = NULL;
16401
16402                 if (fence < 0) {
16403                     fence = 0;
16404                 }
16405
16406                 /* Having gotten rid of the fence, we pop the operand at the
16407                  * stack top and process it as a newly encountered operand */
16408                 current = av_pop(stack);
16409                 if (IS_OPERAND(current)) {
16410                     goto handle_operand;
16411                 }
16412
16413                 RExC_parse++;
16414                 goto bad_syntax;
16415
16416             case '&':
16417             case '|':
16418             case '+':
16419             case '-':
16420             case '^':
16421
16422                 /* These binary operators should have a left operand already
16423                  * parsed */
16424                 if (   top_index - fence < 0
16425                     || top_index - fence == 1
16426                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16427                     || ! IS_OPERAND(*top_ptr))
16428                 {
16429                     goto unexpected_binary;
16430                 }
16431
16432                 /* If only the one operand is on the part of the stack visible
16433                  * to us, we just place this operator in the proper position */
16434                 if (top_index - fence < 2) {
16435
16436                     /* Place the operator before the operand */
16437
16438                     SV* lhs = av_pop(stack);
16439                     av_push(stack, newSVuv(curchar));
16440                     av_push(stack, lhs);
16441                     break;
16442                 }
16443
16444                 /* But if there is something else on the stack, we need to
16445                  * process it before this new operator if and only if the
16446                  * stacked operation has equal or higher precedence than the
16447                  * new one */
16448
16449              join_operators:
16450
16451                 /* The operator on the stack is supposed to be below both its
16452                  * operands */
16453                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16454                     || IS_OPERAND(*stacked_ptr))
16455                 {
16456                     /* But if not, it's legal and indicates we are completely
16457                      * done if and only if we're currently processing a ']',
16458                      * which should be the final thing in the expression */
16459                     if (curchar == ']') {
16460                         goto done;
16461                     }
16462
16463                   unexpected_binary:
16464                     RExC_parse++;
16465                     vFAIL2("Unexpected binary operator '%c' with no "
16466                            "preceding operand", curchar);
16467                 }
16468                 stacked_operator = (char) SvUV(*stacked_ptr);
16469
16470                 if (regex_set_precedence(curchar)
16471                     > regex_set_precedence(stacked_operator))
16472                 {
16473                     /* Here, the new operator has higher precedence than the
16474                      * stacked one.  This means we need to add the new one to
16475                      * the stack to await its rhs operand (and maybe more
16476                      * stuff).  We put it before the lhs operand, leaving
16477                      * untouched the stacked operator and everything below it
16478                      * */
16479                     lhs = av_pop(stack);
16480                     assert(IS_OPERAND(lhs));
16481
16482                     av_push(stack, newSVuv(curchar));
16483                     av_push(stack, lhs);
16484                     break;
16485                 }
16486
16487                 /* Here, the new operator has equal or lower precedence than
16488                  * what's already there.  This means the operation already
16489                  * there should be performed now, before the new one. */
16490
16491                 rhs = av_pop(stack);
16492                 if (! IS_OPERAND(rhs)) {
16493
16494                     /* This can happen when a ! is not followed by an operand,
16495                      * like in /(?[\t &!])/ */
16496                     goto bad_syntax;
16497                 }
16498
16499                 lhs = av_pop(stack);
16500
16501                 if (! IS_OPERAND(lhs)) {
16502
16503                     /* This can happen when there is an empty (), like in
16504                      * /(?[[0]+()+])/ */
16505                     goto bad_syntax;
16506                 }
16507
16508                 switch (stacked_operator) {
16509                     case '&':
16510                         _invlist_intersection(lhs, rhs, &rhs);
16511                         break;
16512
16513                     case '|':
16514                     case '+':
16515                         _invlist_union(lhs, rhs, &rhs);
16516                         break;
16517
16518                     case '-':
16519                         _invlist_subtract(lhs, rhs, &rhs);
16520                         break;
16521
16522                     case '^':   /* The union minus the intersection */
16523                     {
16524                         SV* i = NULL;
16525                         SV* u = NULL;
16526
16527                         _invlist_union(lhs, rhs, &u);
16528                         _invlist_intersection(lhs, rhs, &i);
16529                         _invlist_subtract(u, i, &rhs);
16530                         SvREFCNT_dec_NN(i);
16531                         SvREFCNT_dec_NN(u);
16532                         break;
16533                     }
16534                 }
16535                 SvREFCNT_dec(lhs);
16536
16537                 /* Here, the higher precedence operation has been done, and the
16538                  * result is in 'rhs'.  We overwrite the stacked operator with
16539                  * the result.  Then we redo this code to either push the new
16540                  * operator onto the stack or perform any higher precedence
16541                  * stacked operation */
16542                 only_to_avoid_leaks = av_pop(stack);
16543                 SvREFCNT_dec(only_to_avoid_leaks);
16544                 av_push(stack, rhs);
16545                 goto redo_curchar;
16546
16547             case '!':   /* Highest priority, right associative */
16548
16549                 /* If what's already at the top of the stack is another '!",
16550                  * they just cancel each other out */
16551                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
16552                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16553                 {
16554                     only_to_avoid_leaks = av_pop(stack);
16555                     SvREFCNT_dec(only_to_avoid_leaks);
16556                 }
16557                 else { /* Otherwise, since it's right associative, just push
16558                           onto the stack */
16559                     av_push(stack, newSVuv(curchar));
16560                 }
16561                 break;
16562
16563             default:
16564                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16565                 if (RExC_parse >= RExC_end) {
16566                     break;
16567                 }
16568                 vFAIL("Unexpected character");
16569
16570           handle_operand:
16571
16572             /* Here 'current' is the operand.  If something is already on the
16573              * stack, we have to check if it is a !.  But first, the code above
16574              * may have altered the stack in the time since we earlier set
16575              * 'top_index'.  */
16576
16577             top_index = av_tindex_skip_len_mg(stack);
16578             if (top_index - fence >= 0) {
16579                 /* If the top entry on the stack is an operator, it had better
16580                  * be a '!', otherwise the entry below the top operand should
16581                  * be an operator */
16582                 top_ptr = av_fetch(stack, top_index, FALSE);
16583                 assert(top_ptr);
16584                 if (IS_OPERATOR(*top_ptr)) {
16585
16586                     /* The only permissible operator at the top of the stack is
16587                      * '!', which is applied immediately to this operand. */
16588                     curchar = (char) SvUV(*top_ptr);
16589                     if (curchar != '!') {
16590                         SvREFCNT_dec(current);
16591                         vFAIL2("Unexpected binary operator '%c' with no "
16592                                 "preceding operand", curchar);
16593                     }
16594
16595                     _invlist_invert(current);
16596
16597                     only_to_avoid_leaks = av_pop(stack);
16598                     SvREFCNT_dec(only_to_avoid_leaks);
16599
16600                     /* And we redo with the inverted operand.  This allows
16601                      * handling multiple ! in a row */
16602                     goto handle_operand;
16603                 }
16604                           /* Single operand is ok only for the non-binary ')'
16605                            * operator */
16606                 else if ((top_index - fence == 0 && curchar != ')')
16607                          || (top_index - fence > 0
16608                              && (! (stacked_ptr = av_fetch(stack,
16609                                                            top_index - 1,
16610                                                            FALSE))
16611                                  || IS_OPERAND(*stacked_ptr))))
16612                 {
16613                     SvREFCNT_dec(current);
16614                     vFAIL("Operand with no preceding operator");
16615                 }
16616             }
16617
16618             /* Here there was nothing on the stack or the top element was
16619              * another operand.  Just add this new one */
16620             av_push(stack, current);
16621
16622         } /* End of switch on next parse token */
16623
16624         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16625     } /* End of loop parsing through the construct */
16626
16627     vFAIL("Syntax error in (?[...])");
16628
16629   done:
16630
16631     if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16632         if (RExC_parse < RExC_end) {
16633             RExC_parse++;
16634         }
16635
16636         vFAIL("Unexpected ']' with no following ')' in (?[...");
16637     }
16638
16639     if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16640         vFAIL("Unmatched (");
16641     }
16642
16643     if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
16644         || ((final = av_pop(stack)) == NULL)
16645         || ! IS_OPERAND(final)
16646         || ! is_invlist(final)
16647         || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
16648     {
16649       bad_syntax:
16650         SvREFCNT_dec(final);
16651         vFAIL("Incomplete expression within '(?[ ])'");
16652     }
16653
16654     /* Here, 'final' is the resultant inversion list from evaluating the
16655      * expression.  Return it if so requested */
16656     if (return_invlist) {
16657         *return_invlist = final;
16658         return END;
16659     }
16660
16661     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
16662      * expecting a string of ranges and individual code points */
16663     invlist_iterinit(final);
16664     result_string = newSVpvs("");
16665     while (invlist_iternext(final, &start, &end)) {
16666         if (start == end) {
16667             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16668         }
16669         else {
16670             Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
16671                                                      start,          end);
16672         }
16673     }
16674
16675     /* About to generate an ANYOF (or similar) node from the inversion list we
16676      * have calculated */
16677     save_parse = RExC_parse;
16678     RExC_parse = SvPV(result_string, len);
16679     save_end = RExC_end;
16680     RExC_end = RExC_parse + len;
16681     TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16682
16683     /* We turn off folding around the call, as the class we have constructed
16684      * already has all folding taken into consideration, and we don't want
16685      * regclass() to add to that */
16686     RExC_flags &= ~RXf_PMf_FOLD;
16687     /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16688      * folds are allowed.  */
16689     node = regclass(pRExC_state, flagp, depth+1,
16690                     FALSE, /* means parse the whole char class */
16691                     FALSE, /* don't allow multi-char folds */
16692                     TRUE, /* silence non-portable warnings.  The above may very
16693                              well have generated non-portable code points, but
16694                              they're valid on this machine */
16695                     FALSE, /* similarly, no need for strict */
16696
16697                     /* We can optimize into something besides an ANYOF, except
16698                      * under /l, which needs to be ANYOF because of runtime
16699                      * checks for locale sanity, etc */
16700                   ! in_locale,
16701                     NULL
16702                 );
16703
16704     RESTORE_WARNINGS;
16705     RExC_parse = save_parse + 1;
16706     RExC_end = save_end;
16707     SvREFCNT_dec_NN(final);
16708     SvREFCNT_dec_NN(result_string);
16709
16710     if (save_fold) {
16711         RExC_flags |= RXf_PMf_FOLD;
16712     }
16713
16714     if (!node) {
16715         RETURN_FAIL_ON_RESTART(*flagp, flagp);
16716         goto regclass_failed;
16717     }
16718
16719     /* Fix up the node type if we are in locale.  (We have pretended we are
16720      * under /u for the purposes of regclass(), as this construct will only
16721      * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
16722      * as to cause any warnings about bad locales to be output in regexec.c),
16723      * and add the flag that indicates to check if not in a UTF-8 locale.  The
16724      * reason we above forbid optimization into something other than an ANYOF
16725      * node is simply to minimize the number of code changes in regexec.c.
16726      * Otherwise we would have to create new EXACTish node types and deal with
16727      * them.  This decision could be revisited should this construct become
16728      * popular.
16729      *
16730      * (One might think we could look at the resulting ANYOF node and suppress
16731      * the flag if everything is above 255, as those would be UTF-8 only,
16732      * but this isn't true, as the components that led to that result could
16733      * have been locale-affected, and just happen to cancel each other out
16734      * under UTF-8 locales.) */
16735     if (in_locale) {
16736         set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16737
16738         assert(OP(REGNODE_p(node)) == ANYOF);
16739
16740         OP(REGNODE_p(node)) = ANYOFL;
16741         ANYOF_FLAGS(REGNODE_p(node))
16742                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16743     }
16744
16745     nextchar(pRExC_state);
16746     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16747     return node;
16748
16749   regclass_failed:
16750     FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16751                                                                 (UV) *flagp);
16752 }
16753
16754 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16755
16756 STATIC void
16757 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16758                              AV * stack, const IV fence, AV * fence_stack)
16759 {   /* Dumps the stacks in handle_regex_sets() */
16760
16761     const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16762     const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16763     SSize_t i;
16764
16765     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16766
16767     PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16768
16769     if (stack_top < 0) {
16770         PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16771     }
16772     else {
16773         PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16774         for (i = stack_top; i >= 0; i--) {
16775             SV ** element_ptr = av_fetch(stack, i, FALSE);
16776             if (! element_ptr) {
16777             }
16778
16779             if (IS_OPERATOR(*element_ptr)) {
16780                 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16781                                             (int) i, (int) SvIV(*element_ptr));
16782             }
16783             else {
16784                 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16785                 sv_dump(*element_ptr);
16786             }
16787         }
16788     }
16789
16790     if (fence_stack_top < 0) {
16791         PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
16792     }
16793     else {
16794         PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
16795         for (i = fence_stack_top; i >= 0; i--) {
16796             SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
16797             if (! element_ptr) {
16798             }
16799
16800             PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
16801                                             (int) i, (int) SvIV(*element_ptr));
16802         }
16803     }
16804 }
16805
16806 #endif
16807
16808 #undef IS_OPERATOR
16809 #undef IS_OPERAND
16810
16811 STATIC void
16812 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
16813 {
16814     /* This adds the Latin1/above-Latin1 folding rules.
16815      *
16816      * This should be called only for a Latin1-range code points, cp, which is
16817      * known to be involved in a simple fold with other code points above
16818      * Latin1.  It would give false results if /aa has been specified.
16819      * Multi-char folds are outside the scope of this, and must be handled
16820      * specially. */
16821
16822     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
16823
16824     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
16825
16826     /* The rules that are valid for all Unicode versions are hard-coded in */
16827     switch (cp) {
16828         case 'k':
16829         case 'K':
16830           *invlist =
16831              add_cp_to_invlist(*invlist, KELVIN_SIGN);
16832             break;
16833         case 's':
16834         case 'S':
16835           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
16836             break;
16837         case MICRO_SIGN:
16838           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
16839           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
16840             break;
16841         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
16842         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
16843           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
16844             break;
16845         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
16846           *invlist = add_cp_to_invlist(*invlist,
16847                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
16848             break;
16849
16850         default:    /* Other code points are checked against the data for the
16851                        current Unicode version */
16852           {
16853             Size_t folds_count;
16854             unsigned int first_fold;
16855             const unsigned int * remaining_folds;
16856             UV folded_cp;
16857
16858             if (isASCII(cp)) {
16859                 folded_cp = toFOLD(cp);
16860             }
16861             else {
16862                 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
16863                 Size_t dummy_len;
16864                 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
16865             }
16866
16867             if (folded_cp > 255) {
16868                 *invlist = add_cp_to_invlist(*invlist, folded_cp);
16869             }
16870
16871             folds_count = _inverse_folds(folded_cp, &first_fold,
16872                                                     &remaining_folds);
16873             if (folds_count == 0) {
16874
16875                 /* Use deprecated warning to increase the chances of this being
16876                  * output */
16877                 ckWARN2reg_d(RExC_parse,
16878                         "Perl folding rules are not up-to-date for 0x%02X;"
16879                         " please use the perlbug utility to report;", cp);
16880             }
16881             else {
16882                 unsigned int i;
16883
16884                 if (first_fold > 255) {
16885                     *invlist = add_cp_to_invlist(*invlist, first_fold);
16886                 }
16887                 for (i = 0; i < folds_count - 1; i++) {
16888                     if (remaining_folds[i] > 255) {
16889                         *invlist = add_cp_to_invlist(*invlist,
16890                                                     remaining_folds[i]);
16891                     }
16892                 }
16893             }
16894             break;
16895          }
16896     }
16897 }
16898
16899 STATIC void
16900 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
16901 {
16902     /* Output the elements of the array given by '*posix_warnings' as REGEXP
16903      * warnings. */
16904
16905     SV * msg;
16906     const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
16907
16908     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
16909
16910     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
16911         return;
16912     }
16913
16914     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
16915         if (first_is_fatal) {           /* Avoid leaking this */
16916             av_undef(posix_warnings);   /* This isn't necessary if the
16917                                             array is mortal, but is a
16918                                             fail-safe */
16919             (void) sv_2mortal(msg);
16920             PREPARE_TO_DIE;
16921         }
16922         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
16923         SvREFCNT_dec_NN(msg);
16924     }
16925
16926     UPDATE_WARNINGS_LOC(RExC_parse);
16927 }
16928
16929 PERL_STATIC_INLINE Size_t
16930 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
16931 {
16932     const U8 * const start = s1;
16933     const U8 * const send = start + max;
16934
16935     PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
16936
16937     while (s1 < send && *s1  == *s2) {
16938         s1++; s2++;
16939     }
16940
16941     return s1 - start;
16942 }
16943
16944
16945 STATIC AV *
16946 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
16947 {
16948     /* This adds the string scalar <multi_string> to the array
16949      * <multi_char_matches>.  <multi_string> is known to have exactly
16950      * <cp_count> code points in it.  This is used when constructing a
16951      * bracketed character class and we find something that needs to match more
16952      * than a single character.
16953      *
16954      * <multi_char_matches> is actually an array of arrays.  Each top-level
16955      * element is an array that contains all the strings known so far that are
16956      * the same length.  And that length (in number of code points) is the same
16957      * as the index of the top-level array.  Hence, the [2] element is an
16958      * array, each element thereof is a string containing TWO code points;
16959      * while element [3] is for strings of THREE characters, and so on.  Since
16960      * this is for multi-char strings there can never be a [0] nor [1] element.
16961      *
16962      * When we rewrite the character class below, we will do so such that the
16963      * longest strings are written first, so that it prefers the longest
16964      * matching strings first.  This is done even if it turns out that any
16965      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
16966      * Christiansen has agreed that this is ok.  This makes the test for the
16967      * ligature 'ffi' come before the test for 'ff', for example */
16968
16969     AV* this_array;
16970     AV** this_array_ptr;
16971
16972     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
16973
16974     if (! multi_char_matches) {
16975         multi_char_matches = newAV();
16976     }
16977
16978     if (av_exists(multi_char_matches, cp_count)) {
16979         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
16980         this_array = *this_array_ptr;
16981     }
16982     else {
16983         this_array = newAV();
16984         av_store(multi_char_matches, cp_count,
16985                  (SV*) this_array);
16986     }
16987     av_push(this_array, multi_string);
16988
16989     return multi_char_matches;
16990 }
16991
16992 /* The names of properties whose definitions are not known at compile time are
16993  * stored in this SV, after a constant heading.  So if the length has been
16994  * changed since initialization, then there is a run-time definition. */
16995 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
16996                                         (SvCUR(listsv) != initial_listsv_len)
16997
16998 /* There is a restricted set of white space characters that are legal when
16999  * ignoring white space in a bracketed character class.  This generates the
17000  * code to skip them.
17001  *
17002  * There is a line below that uses the same white space criteria but is outside
17003  * this macro.  Both here and there must use the same definition */
17004 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
17005     STMT_START {                                                        \
17006         if (do_skip) {                                                  \
17007             while (isBLANK_A(UCHARAT(p)))                               \
17008             {                                                           \
17009                 p++;                                                    \
17010             }                                                           \
17011         }                                                               \
17012     } STMT_END
17013
17014 STATIC regnode_offset
17015 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17016                  const bool stop_at_1,  /* Just parse the next thing, don't
17017                                            look for a full character class */
17018                  bool allow_mutiple_chars,
17019                  const bool silence_non_portable,   /* Don't output warnings
17020                                                        about too large
17021                                                        characters */
17022                  const bool strict,
17023                  bool optimizable,                  /* ? Allow a non-ANYOF return
17024                                                        node */
17025                  SV** ret_invlist  /* Return an inversion list, not a node */
17026           )
17027 {
17028     /* parse a bracketed class specification.  Most of these will produce an
17029      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17030      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
17031      * under /i with multi-character folds: it will be rewritten following the
17032      * paradigm of this example, where the <multi-fold>s are characters which
17033      * fold to multiple character sequences:
17034      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17035      * gets effectively rewritten as:
17036      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17037      * reg() gets called (recursively) on the rewritten version, and this
17038      * function will return what it constructs.  (Actually the <multi-fold>s
17039      * aren't physically removed from the [abcdefghi], it's just that they are
17040      * ignored in the recursion by means of a flag:
17041      * <RExC_in_multi_char_class>.)
17042      *
17043      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17044      * characters, with the corresponding bit set if that character is in the
17045      * list.  For characters above this, an inversion list is used.  There
17046      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17047      * determinable at compile time
17048      *
17049      * On success, returns the offset at which any next node should be placed
17050      * into the regex engine program being compiled.
17051      *
17052      * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17053      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17054      * UTF-8
17055      */
17056
17057     dVAR;
17058     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17059     IV range = 0;
17060     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17061     regnode_offset ret = -1;    /* Initialized to an illegal value */
17062     STRLEN numlen;
17063     int namedclass = OOB_NAMEDCLASS;
17064     char *rangebegin = NULL;
17065     SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
17066                                aren't available at the time this was called */
17067     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17068                                       than just initialized.  */
17069     SV* properties = NULL;    /* Code points that match \p{} \P{} */
17070     SV* posixes = NULL;     /* Code points that match classes like [:word:],
17071                                extended beyond the Latin1 range.  These have to
17072                                be kept separate from other code points for much
17073                                of this function because their handling  is
17074                                different under /i, and for most classes under
17075                                /d as well */
17076     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
17077                                separate for a while from the non-complemented
17078                                versions because of complications with /d
17079                                matching */
17080     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17081                                   treated more simply than the general case,
17082                                   leading to less compilation and execution
17083                                   work */
17084     UV element_count = 0;   /* Number of distinct elements in the class.
17085                                Optimizations may be possible if this is tiny */
17086     AV * multi_char_matches = NULL; /* Code points that fold to more than one
17087                                        character; used under /i */
17088     UV n;
17089     char * stop_ptr = RExC_end;    /* where to stop parsing */
17090
17091     /* ignore unescaped whitespace? */
17092     const bool skip_white = cBOOL(   ret_invlist
17093                                   || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17094
17095     /* inversion list of code points this node matches only when the target
17096      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
17097      * /d) */
17098     SV* upper_latin1_only_utf8_matches = NULL;
17099
17100     /* Inversion list of code points this node matches regardless of things
17101      * like locale, folding, utf8ness of the target string */
17102     SV* cp_list = NULL;
17103
17104     /* Like cp_list, but code points on this list need to be checked for things
17105      * that fold to/from them under /i */
17106     SV* cp_foldable_list = NULL;
17107
17108     /* Like cp_list, but code points on this list are valid only when the
17109      * runtime locale is UTF-8 */
17110     SV* only_utf8_locale_list = NULL;
17111
17112     /* In a range, if one of the endpoints is non-character-set portable,
17113      * meaning that it hard-codes a code point that may mean a different
17114      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17115      * mnemonic '\t' which each mean the same character no matter which
17116      * character set the platform is on. */
17117     unsigned int non_portable_endpoint = 0;
17118
17119     /* Is the range unicode? which means on a platform that isn't 1-1 native
17120      * to Unicode (i.e. non-ASCII), each code point in it should be considered
17121      * to be a Unicode value.  */
17122     bool unicode_range = FALSE;
17123     bool invert = FALSE;    /* Is this class to be complemented */
17124
17125     bool warn_super = ALWAYS_WARN_SUPER;
17126
17127     const char * orig_parse = RExC_parse;
17128
17129     /* This variable is used to mark where the end in the input is of something
17130      * that looks like a POSIX construct but isn't.  During the parse, when
17131      * something looks like it could be such a construct is encountered, it is
17132      * checked for being one, but not if we've already checked this area of the
17133      * input.  Only after this position is reached do we check again */
17134     char *not_posix_region_end = RExC_parse - 1;
17135
17136     AV* posix_warnings = NULL;
17137     const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17138     U8 op = END;    /* The returned node-type, initialized to an impossible
17139                        one.  */
17140     U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
17141     U32 posixl = 0;       /* bit field of posix classes matched under /l */
17142
17143
17144 /* Flags as to what things aren't knowable until runtime.  (Note that these are
17145  * mutually exclusive.) */
17146 #define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
17147                                             haven't been defined as of yet */
17148 #define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
17149                                             UTF-8 or not */
17150 #define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
17151                                             what gets folded */
17152     U32 has_runtime_dependency = 0;     /* OR of the above flags */
17153
17154     GET_RE_DEBUG_FLAGS_DECL;
17155
17156     PERL_ARGS_ASSERT_REGCLASS;
17157 #ifndef DEBUGGING
17158     PERL_UNUSED_ARG(depth);
17159 #endif
17160
17161
17162     /* If wants an inversion list returned, we can't optimize to something
17163      * else. */
17164     if (ret_invlist) {
17165         optimizable = FALSE;
17166     }
17167
17168     DEBUG_PARSE("clas");
17169
17170 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
17171     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
17172                                    && UNICODE_DOT_DOT_VERSION == 0)
17173     allow_mutiple_chars = FALSE;
17174 #endif
17175
17176     /* We include the /i status at the beginning of this so that we can
17177      * know it at runtime */
17178     listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17179     initial_listsv_len = SvCUR(listsv);
17180     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
17181
17182     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17183
17184     assert(RExC_parse <= RExC_end);
17185
17186     if (UCHARAT(RExC_parse) == '^') {   /* Complement the class */
17187         RExC_parse++;
17188         invert = TRUE;
17189         allow_mutiple_chars = FALSE;
17190         MARK_NAUGHTY(1);
17191         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17192     }
17193
17194     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17195     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17196         int maybe_class = handle_possible_posix(pRExC_state,
17197                                                 RExC_parse,
17198                                                 &not_posix_region_end,
17199                                                 NULL,
17200                                                 TRUE /* checking only */);
17201         if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17202             ckWARN4reg(not_posix_region_end,
17203                     "POSIX syntax [%c %c] belongs inside character classes%s",
17204                     *RExC_parse, *RExC_parse,
17205                     (maybe_class == OOB_NAMEDCLASS)
17206                     ? ((POSIXCC_NOTYET(*RExC_parse))
17207                         ? " (but this one isn't implemented)"
17208                         : " (but this one isn't fully valid)")
17209                     : ""
17210                     );
17211         }
17212     }
17213
17214     /* If the caller wants us to just parse a single element, accomplish this
17215      * by faking the loop ending condition */
17216     if (stop_at_1 && RExC_end > RExC_parse) {
17217         stop_ptr = RExC_parse + 1;
17218     }
17219
17220     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17221     if (UCHARAT(RExC_parse) == ']')
17222         goto charclassloop;
17223
17224     while (1) {
17225
17226         if (   posix_warnings
17227             && av_tindex_skip_len_mg(posix_warnings) >= 0
17228             && RExC_parse > not_posix_region_end)
17229         {
17230             /* Warnings about posix class issues are considered tentative until
17231              * we are far enough along in the parse that we can no longer
17232              * change our mind, at which point we output them.  This is done
17233              * each time through the loop so that a later class won't zap them
17234              * before they have been dealt with. */
17235             output_posix_warnings(pRExC_state, posix_warnings);
17236         }
17237
17238         if  (RExC_parse >= stop_ptr) {
17239             break;
17240         }
17241
17242         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17243
17244         if  (UCHARAT(RExC_parse) == ']') {
17245             break;
17246         }
17247
17248       charclassloop:
17249
17250         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17251         save_value = value;
17252         save_prevvalue = prevvalue;
17253
17254         if (!range) {
17255             rangebegin = RExC_parse;
17256             element_count++;
17257             non_portable_endpoint = 0;
17258         }
17259         if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17260             value = utf8n_to_uvchr((U8*)RExC_parse,
17261                                    RExC_end - RExC_parse,
17262                                    &numlen, UTF8_ALLOW_DEFAULT);
17263             RExC_parse += numlen;
17264         }
17265         else
17266             value = UCHARAT(RExC_parse++);
17267
17268         if (value == '[') {
17269             char * posix_class_end;
17270             namedclass = handle_possible_posix(pRExC_state,
17271                                                RExC_parse,
17272                                                &posix_class_end,
17273                                                do_posix_warnings ? &posix_warnings : NULL,
17274                                                FALSE    /* die if error */);
17275             if (namedclass > OOB_NAMEDCLASS) {
17276
17277                 /* If there was an earlier attempt to parse this particular
17278                  * posix class, and it failed, it was a false alarm, as this
17279                  * successful one proves */
17280                 if (   posix_warnings
17281                     && av_tindex_skip_len_mg(posix_warnings) >= 0
17282                     && not_posix_region_end >= RExC_parse
17283                     && not_posix_region_end <= posix_class_end)
17284                 {
17285                     av_undef(posix_warnings);
17286                 }
17287
17288                 RExC_parse = posix_class_end;
17289             }
17290             else if (namedclass == OOB_NAMEDCLASS) {
17291                 not_posix_region_end = posix_class_end;
17292             }
17293             else {
17294                 namedclass = OOB_NAMEDCLASS;
17295             }
17296         }
17297         else if (   RExC_parse - 1 > not_posix_region_end
17298                  && MAYBE_POSIXCC(value))
17299         {
17300             (void) handle_possible_posix(
17301                         pRExC_state,
17302                         RExC_parse - 1,  /* -1 because parse has already been
17303                                             advanced */
17304                         &not_posix_region_end,
17305                         do_posix_warnings ? &posix_warnings : NULL,
17306                         TRUE /* checking only */);
17307         }
17308         else if (  strict && ! skip_white
17309                  && (   _generic_isCC(value, _CC_VERTSPACE)
17310                      || is_VERTWS_cp_high(value)))
17311         {
17312             vFAIL("Literal vertical space in [] is illegal except under /x");
17313         }
17314         else if (value == '\\') {
17315             /* Is a backslash; get the code point of the char after it */
17316
17317             if (RExC_parse >= RExC_end) {
17318                 vFAIL("Unmatched [");
17319             }
17320
17321             if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17322                 value = utf8n_to_uvchr((U8*)RExC_parse,
17323                                    RExC_end - RExC_parse,
17324                                    &numlen, UTF8_ALLOW_DEFAULT);
17325                 RExC_parse += numlen;
17326             }
17327             else
17328                 value = UCHARAT(RExC_parse++);
17329
17330             /* Some compilers cannot handle switching on 64-bit integer
17331              * values, therefore value cannot be an UV.  Yes, this will
17332              * be a problem later if we want switch on Unicode.
17333              * A similar issue a little bit later when switching on
17334              * namedclass. --jhi */
17335
17336             /* If the \ is escaping white space when white space is being
17337              * skipped, it means that that white space is wanted literally, and
17338              * is already in 'value'.  Otherwise, need to translate the escape
17339              * into what it signifies. */
17340             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17341
17342             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
17343             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
17344             case 's':   namedclass = ANYOF_SPACE;       break;
17345             case 'S':   namedclass = ANYOF_NSPACE;      break;
17346             case 'd':   namedclass = ANYOF_DIGIT;       break;
17347             case 'D':   namedclass = ANYOF_NDIGIT;      break;
17348             case 'v':   namedclass = ANYOF_VERTWS;      break;
17349             case 'V':   namedclass = ANYOF_NVERTWS;     break;
17350             case 'h':   namedclass = ANYOF_HORIZWS;     break;
17351             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
17352             case 'N':  /* Handle \N{NAME} in class */
17353                 {
17354                     const char * const backslash_N_beg = RExC_parse - 2;
17355                     int cp_count;
17356
17357                     if (! grok_bslash_N(pRExC_state,
17358                                         NULL,      /* No regnode */
17359                                         &value,    /* Yes single value */
17360                                         &cp_count, /* Multiple code pt count */
17361                                         flagp,
17362                                         strict,
17363                                         depth)
17364                     ) {
17365
17366                         if (*flagp & NEED_UTF8)
17367                             FAIL("panic: grok_bslash_N set NEED_UTF8");
17368
17369                         RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17370
17371                         if (cp_count < 0) {
17372                             vFAIL("\\N in a character class must be a named character: \\N{...}");
17373                         }
17374                         else if (cp_count == 0) {
17375                             ckWARNreg(RExC_parse,
17376                               "Ignoring zero length \\N{} in character class");
17377                         }
17378                         else { /* cp_count > 1 */
17379                             assert(cp_count > 1);
17380                             if (! RExC_in_multi_char_class) {
17381                                 if ( ! allow_mutiple_chars
17382                                     || invert
17383                                     || range
17384                                     || *RExC_parse == '-')
17385                                 {
17386                                     if (strict) {
17387                                         RExC_parse--;
17388                                         vFAIL("\\N{} here is restricted to one character");
17389                                     }
17390                                     ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17391                                     break; /* <value> contains the first code
17392                                               point. Drop out of the switch to
17393                                               process it */
17394                                 }
17395                                 else {
17396                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
17397                                                  RExC_parse - backslash_N_beg);
17398                                     multi_char_matches
17399                                         = add_multi_match(multi_char_matches,
17400                                                           multi_char_N,
17401                                                           cp_count);
17402                                 }
17403                             }
17404                         } /* End of cp_count != 1 */
17405
17406                         /* This element should not be processed further in this
17407                          * class */
17408                         element_count--;
17409                         value = save_value;
17410                         prevvalue = save_prevvalue;
17411                         continue;   /* Back to top of loop to get next char */
17412                     }
17413
17414                     /* Here, is a single code point, and <value> contains it */
17415                     unicode_range = TRUE;   /* \N{} are Unicode */
17416                 }
17417                 break;
17418             case 'p':
17419             case 'P':
17420                 {
17421                 char *e;
17422
17423                 /* \p means they want Unicode semantics */
17424                 REQUIRE_UNI_RULES(flagp, 0);
17425
17426                 if (RExC_parse >= RExC_end)
17427                     vFAIL2("Empty \\%c", (U8)value);
17428                 if (*RExC_parse == '{') {
17429                     const U8 c = (U8)value;
17430                     e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17431                     if (!e) {
17432                         RExC_parse++;
17433                         vFAIL2("Missing right brace on \\%c{}", c);
17434                     }
17435
17436                     RExC_parse++;
17437
17438                     /* White space is allowed adjacent to the braces and after
17439                      * any '^', even when not under /x */
17440                     while (isSPACE(*RExC_parse)) {
17441                          RExC_parse++;
17442                     }
17443
17444                     if (UCHARAT(RExC_parse) == '^') {
17445
17446                         /* toggle.  (The rhs xor gets the single bit that
17447                          * differs between P and p; the other xor inverts just
17448                          * that bit) */
17449                         value ^= 'P' ^ 'p';
17450
17451                         RExC_parse++;
17452                         while (isSPACE(*RExC_parse)) {
17453                             RExC_parse++;
17454                         }
17455                     }
17456
17457                     if (e == RExC_parse)
17458                         vFAIL2("Empty \\%c{}", c);
17459
17460                     n = e - RExC_parse;
17461                     while (isSPACE(*(RExC_parse + n - 1)))
17462                         n--;
17463
17464                 }   /* The \p isn't immediately followed by a '{' */
17465                 else if (! isALPHA(*RExC_parse)) {
17466                     RExC_parse += (UTF)
17467                                   ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17468                                   : 1;
17469                     vFAIL2("Character following \\%c must be '{' or a "
17470                            "single-character Unicode property name",
17471                            (U8) value);
17472                 }
17473                 else {
17474                     e = RExC_parse;
17475                     n = 1;
17476                 }
17477                 {
17478                     char* name = RExC_parse;
17479
17480                     /* Any message returned about expanding the definition */
17481                     SV* msg = newSVpvs_flags("", SVs_TEMP);
17482
17483                     /* If set TRUE, the property is user-defined as opposed to
17484                      * official Unicode */
17485                     bool user_defined = FALSE;
17486
17487                     SV * prop_definition = parse_uniprop_string(
17488                                             name, n, UTF, FOLD,
17489                                             FALSE, /* This is compile-time */
17490
17491                                             /* We can't defer this defn when
17492                                              * the full result is required in
17493                                              * this call */
17494                                             ! cBOOL(ret_invlist),
17495
17496                                             &user_defined,
17497                                             msg,
17498                                             0 /* Base level */
17499                                            );
17500                     if (SvCUR(msg)) {   /* Assumes any error causes a msg */
17501                         assert(prop_definition == NULL);
17502                         RExC_parse = e + 1;
17503                         if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
17504                                                thing so, or else the display is
17505                                                mojibake */
17506                             RExC_utf8 = TRUE;
17507                         }
17508                         /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17509                         vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17510                                     SvCUR(msg), SvPVX(msg)));
17511                     }
17512
17513                     if (! is_invlist(prop_definition)) {
17514
17515                         /* Here, the definition isn't known, so we have gotten
17516                          * returned a string that will be evaluated if and when
17517                          * encountered at runtime.  We add it to the list of
17518                          * such properties, along with whether it should be
17519                          * complemented or not */
17520                         if (value == 'P') {
17521                             sv_catpvs(listsv, "!");
17522                         }
17523                         else {
17524                             sv_catpvs(listsv, "+");
17525                         }
17526                         sv_catsv(listsv, prop_definition);
17527
17528                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17529
17530                         /* We don't know yet what this matches, so have to flag
17531                          * it */
17532                         anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17533                     }
17534                     else {
17535                         assert (prop_definition && is_invlist(prop_definition));
17536
17537                         /* Here we do have the complete property definition
17538                          *
17539                          * Temporary workaround for [perl #133136].  For this
17540                          * precise input that is in the .t that is failing,
17541                          * load utf8.pm, which is what the test wants, so that
17542                          * that .t passes */
17543                         if (     memEQs(RExC_start, e + 1 - RExC_start,
17544                                         "foo\\p{Alnum}")
17545                             && ! hv_common(GvHVn(PL_incgv),
17546                                            NULL,
17547                                            "utf8.pm", sizeof("utf8.pm") - 1,
17548                                            0, HV_FETCH_ISEXISTS, NULL, 0))
17549                         {
17550                             require_pv("utf8.pm");
17551                         }
17552
17553                         if (! user_defined &&
17554                             /* We warn on matching an above-Unicode code point
17555                              * if the match would return true, except don't
17556                              * warn for \p{All}, which has exactly one element
17557                              * = 0 */
17558                             (_invlist_contains_cp(prop_definition, 0x110000)
17559                                 && (! (_invlist_len(prop_definition) == 1
17560                                        && *invlist_array(prop_definition) == 0))))
17561                         {
17562                             warn_super = TRUE;
17563                         }
17564
17565                         /* Invert if asking for the complement */
17566                         if (value == 'P') {
17567                             _invlist_union_complement_2nd(properties,
17568                                                           prop_definition,
17569                                                           &properties);
17570                         }
17571                         else {
17572                             _invlist_union(properties, prop_definition, &properties);
17573                         }
17574                     }
17575                 }
17576
17577                 RExC_parse = e + 1;
17578                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
17579                                                 named */
17580                 }
17581                 break;
17582             case 'n':   value = '\n';                   break;
17583             case 'r':   value = '\r';                   break;
17584             case 't':   value = '\t';                   break;
17585             case 'f':   value = '\f';                   break;
17586             case 'b':   value = '\b';                   break;
17587             case 'e':   value = ESC_NATIVE;             break;
17588             case 'a':   value = '\a';                   break;
17589             case 'o':
17590                 RExC_parse--;   /* function expects to be pointed at the 'o' */
17591                 {
17592                     const char* error_msg;
17593                     bool valid = grok_bslash_o(&RExC_parse,
17594                                                RExC_end,
17595                                                &value,
17596                                                &error_msg,
17597                                                TO_OUTPUT_WARNINGS(RExC_parse),
17598                                                strict,
17599                                                silence_non_portable,
17600                                                UTF);
17601                     if (! valid) {
17602                         vFAIL(error_msg);
17603                     }
17604                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17605                 }
17606                 non_portable_endpoint++;
17607                 break;
17608             case 'x':
17609                 RExC_parse--;   /* function expects to be pointed at the 'x' */
17610                 {
17611                     const char* error_msg;
17612                     bool valid = grok_bslash_x(&RExC_parse,
17613                                                RExC_end,
17614                                                &value,
17615                                                &error_msg,
17616                                                TO_OUTPUT_WARNINGS(RExC_parse),
17617                                                strict,
17618                                                silence_non_portable,
17619                                                UTF);
17620                     if (! valid) {
17621                         vFAIL(error_msg);
17622                     }
17623                     UPDATE_WARNINGS_LOC(RExC_parse - 1);
17624                 }
17625                 non_portable_endpoint++;
17626                 break;
17627             case 'c':
17628                 value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
17629                 UPDATE_WARNINGS_LOC(RExC_parse);
17630                 RExC_parse++;
17631                 non_portable_endpoint++;
17632                 break;
17633             case '0': case '1': case '2': case '3': case '4':
17634             case '5': case '6': case '7':
17635                 {
17636                     /* Take 1-3 octal digits */
17637                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
17638                     numlen = (strict) ? 4 : 3;
17639                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17640                     RExC_parse += numlen;
17641                     if (numlen != 3) {
17642                         if (strict) {
17643                             RExC_parse += (UTF)
17644                                           ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17645                                           : 1;
17646                             vFAIL("Need exactly 3 octal digits");
17647                         }
17648                         else if (   numlen < 3 /* like \08, \178 */
17649                                  && RExC_parse < RExC_end
17650                                  && isDIGIT(*RExC_parse)
17651                                  && ckWARN(WARN_REGEXP))
17652                         {
17653                             reg_warn_non_literal_string(
17654                                  RExC_parse + 1,
17655                                  form_short_octal_warning(RExC_parse, numlen));
17656                         }
17657                     }
17658                     non_portable_endpoint++;
17659                     break;
17660                 }
17661             default:
17662                 /* Allow \_ to not give an error */
17663                 if (isWORDCHAR(value) && value != '_') {
17664                     if (strict) {
17665                         vFAIL2("Unrecognized escape \\%c in character class",
17666                                (int)value);
17667                     }
17668                     else {
17669                         ckWARN2reg(RExC_parse,
17670                             "Unrecognized escape \\%c in character class passed through",
17671                             (int)value);
17672                     }
17673                 }
17674                 break;
17675             }   /* End of switch on char following backslash */
17676         } /* end of handling backslash escape sequences */
17677
17678         /* Here, we have the current token in 'value' */
17679
17680         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17681             U8 classnum;
17682
17683             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
17684              * literal, as is the character that began the false range, i.e.
17685              * the 'a' in the examples */
17686             if (range) {
17687                 const int w = (RExC_parse >= rangebegin)
17688                                 ? RExC_parse - rangebegin
17689                                 : 0;
17690                 if (strict) {
17691                     vFAIL2utf8f(
17692                         "False [] range \"%" UTF8f "\"",
17693                         UTF8fARG(UTF, w, rangebegin));
17694                 }
17695                 else {
17696                     ckWARN2reg(RExC_parse,
17697                         "False [] range \"%" UTF8f "\"",
17698                         UTF8fARG(UTF, w, rangebegin));
17699                     cp_list = add_cp_to_invlist(cp_list, '-');
17700                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
17701                                                             prevvalue);
17702                 }
17703
17704                 range = 0; /* this was not a true range */
17705                 element_count += 2; /* So counts for three values */
17706             }
17707
17708             classnum = namedclass_to_classnum(namedclass);
17709
17710             if (LOC && namedclass < ANYOF_POSIXL_MAX
17711 #ifndef HAS_ISASCII
17712                 && classnum != _CC_ASCII
17713 #endif
17714             ) {
17715                 SV* scratch_list = NULL;
17716
17717                 /* What the Posix classes (like \w, [:space:]) match isn't
17718                  * generally knowable under locale until actual match time.  A
17719                  * special node is used for these which has extra space for a
17720                  * bitmap, with a bit reserved for each named class that is to
17721                  * be matched against.  (This isn't needed for \p{} and
17722                  * pseudo-classes, as they are not affected by locale, and
17723                  * hence are dealt with separately.)  However, if a named class
17724                  * and its complement are both present, then it matches
17725                  * everything, and there is no runtime dependency.  Odd numbers
17726                  * are the complements of the next lower number, so xor works.
17727                  * (Note that something like [\w\D] should match everything,
17728                  * because \d should be a proper subset of \w.  But rather than
17729                  * trust that the locale is well behaved, we leave this to
17730                  * runtime to sort out) */
17731                 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
17732                     cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
17733                     POSIXL_ZERO(posixl);
17734                     has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
17735                     anyof_flags &= ~ANYOF_MATCHES_POSIXL;
17736                     continue;   /* We could ignore the rest of the class, but
17737                                    best to parse it for any errors */
17738                 }
17739                 else { /* Here, isn't the complement of any already parsed
17740                           class */
17741                     POSIXL_SET(posixl, namedclass);
17742                     has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
17743                     anyof_flags |= ANYOF_MATCHES_POSIXL;
17744
17745                     /* The above-Latin1 characters are not subject to locale
17746                      * rules.  Just add them to the unconditionally-matched
17747                      * list */
17748
17749                     /* Get the list of the above-Latin1 code points this
17750                      * matches */
17751                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
17752                                             PL_XPosix_ptrs[classnum],
17753
17754                                             /* Odd numbers are complements,
17755                                              * like NDIGIT, NASCII, ... */
17756                                             namedclass % 2 != 0,
17757                                             &scratch_list);
17758                     /* Checking if 'cp_list' is NULL first saves an extra
17759                      * clone.  Its reference count will be decremented at the
17760                      * next union, etc, or if this is the only instance, at the
17761                      * end of the routine */
17762                     if (! cp_list) {
17763                         cp_list = scratch_list;
17764                     }
17765                     else {
17766                         _invlist_union(cp_list, scratch_list, &cp_list);
17767                         SvREFCNT_dec_NN(scratch_list);
17768                     }
17769                     continue;   /* Go get next character */
17770                 }
17771             }
17772             else {
17773
17774                 /* Here, is not /l, or is a POSIX class for which /l doesn't
17775                  * matter (or is a Unicode property, which is skipped here). */
17776                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
17777                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
17778
17779                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
17780                          * nor /l make a difference in what these match,
17781                          * therefore we just add what they match to cp_list. */
17782                         if (classnum != _CC_VERTSPACE) {
17783                             assert(   namedclass == ANYOF_HORIZWS
17784                                    || namedclass == ANYOF_NHORIZWS);
17785
17786                             /* It turns out that \h is just a synonym for
17787                              * XPosixBlank */
17788                             classnum = _CC_BLANK;
17789                         }
17790
17791                         _invlist_union_maybe_complement_2nd(
17792                                 cp_list,
17793                                 PL_XPosix_ptrs[classnum],
17794                                 namedclass % 2 != 0,    /* Complement if odd
17795                                                           (NHORIZWS, NVERTWS)
17796                                                         */
17797                                 &cp_list);
17798                     }
17799                 }
17800                 else if (   AT_LEAST_UNI_SEMANTICS
17801                          || classnum == _CC_ASCII
17802                          || (DEPENDS_SEMANTICS && (   classnum == _CC_DIGIT
17803                                                    || classnum == _CC_XDIGIT)))
17804                 {
17805                     /* We usually have to worry about /d affecting what POSIX
17806                      * classes match, with special code needed because we won't
17807                      * know until runtime what all matches.  But there is no
17808                      * extra work needed under /u and /a; and [:ascii:] is
17809                      * unaffected by /d; and :digit: and :xdigit: don't have
17810                      * runtime differences under /d.  So we can special case
17811                      * these, and avoid some extra work below, and at runtime.
17812                      * */
17813                     _invlist_union_maybe_complement_2nd(
17814                                                      simple_posixes,
17815                                                       ((AT_LEAST_ASCII_RESTRICTED)
17816                                                        ? PL_Posix_ptrs[classnum]
17817                                                        : PL_XPosix_ptrs[classnum]),
17818                                                      namedclass % 2 != 0,
17819                                                      &simple_posixes);
17820                 }
17821                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
17822                            complement and use nposixes */
17823                     SV** posixes_ptr = namedclass % 2 == 0
17824                                        ? &posixes
17825                                        : &nposixes;
17826                     _invlist_union_maybe_complement_2nd(
17827                                                      *posixes_ptr,
17828                                                      PL_XPosix_ptrs[classnum],
17829                                                      namedclass % 2 != 0,
17830                                                      posixes_ptr);
17831                 }
17832             }
17833         } /* end of namedclass \blah */
17834
17835         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17836
17837         /* If 'range' is set, 'value' is the ending of a range--check its
17838          * validity.  (If value isn't a single code point in the case of a
17839          * range, we should have figured that out above in the code that
17840          * catches false ranges).  Later, we will handle each individual code
17841          * point in the range.  If 'range' isn't set, this could be the
17842          * beginning of a range, so check for that by looking ahead to see if
17843          * the next real character to be processed is the range indicator--the
17844          * minus sign */
17845
17846         if (range) {
17847 #ifdef EBCDIC
17848             /* For unicode ranges, we have to test that the Unicode as opposed
17849              * to the native values are not decreasing.  (Above 255, there is
17850              * no difference between native and Unicode) */
17851             if (unicode_range && prevvalue < 255 && value < 255) {
17852                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
17853                     goto backwards_range;
17854                 }
17855             }
17856             else
17857 #endif
17858             if (prevvalue > value) /* b-a */ {
17859                 int w;
17860 #ifdef EBCDIC
17861               backwards_range:
17862 #endif
17863                 w = RExC_parse - rangebegin;
17864                 vFAIL2utf8f(
17865                     "Invalid [] range \"%" UTF8f "\"",
17866                     UTF8fARG(UTF, w, rangebegin));
17867                 NOT_REACHED; /* NOTREACHED */
17868             }
17869         }
17870         else {
17871             prevvalue = value; /* save the beginning of the potential range */
17872             if (! stop_at_1     /* Can't be a range if parsing just one thing */
17873                 && *RExC_parse == '-')
17874             {
17875                 char* next_char_ptr = RExC_parse + 1;
17876
17877                 /* Get the next real char after the '-' */
17878                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
17879
17880                 /* If the '-' is at the end of the class (just before the ']',
17881                  * it is a literal minus; otherwise it is a range */
17882                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
17883                     RExC_parse = next_char_ptr;
17884
17885                     /* a bad range like \w-, [:word:]- ? */
17886                     if (namedclass > OOB_NAMEDCLASS) {
17887                         if (strict || ckWARN(WARN_REGEXP)) {
17888                             const int w = RExC_parse >= rangebegin
17889                                           ?  RExC_parse - rangebegin
17890                                           : 0;
17891                             if (strict) {
17892                                 vFAIL4("False [] range \"%*.*s\"",
17893                                     w, w, rangebegin);
17894                             }
17895                             else {
17896                                 vWARN4(RExC_parse,
17897                                     "False [] range \"%*.*s\"",
17898                                     w, w, rangebegin);
17899                             }
17900                         }
17901                         cp_list = add_cp_to_invlist(cp_list, '-');
17902                         element_count++;
17903                     } else
17904                         range = 1;      /* yeah, it's a range! */
17905                     continue;   /* but do it the next time */
17906                 }
17907             }
17908         }
17909
17910         if (namedclass > OOB_NAMEDCLASS) {
17911             continue;
17912         }
17913
17914         /* Here, we have a single value this time through the loop, and
17915          * <prevvalue> is the beginning of the range, if any; or <value> if
17916          * not. */
17917
17918         /* non-Latin1 code point implies unicode semantics. */
17919         if (value > 255) {
17920             REQUIRE_UNI_RULES(flagp, 0);
17921         }
17922
17923         /* Ready to process either the single value, or the completed range.
17924          * For single-valued non-inverted ranges, we consider the possibility
17925          * of multi-char folds.  (We made a conscious decision to not do this
17926          * for the other cases because it can often lead to non-intuitive
17927          * results.  For example, you have the peculiar case that:
17928          *  "s s" =~ /^[^\xDF]+$/i => Y
17929          *  "ss"  =~ /^[^\xDF]+$/i => N
17930          *
17931          * See [perl #89750] */
17932         if (FOLD && allow_mutiple_chars && value == prevvalue) {
17933             if (    value == LATIN_SMALL_LETTER_SHARP_S
17934                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17935                                                         value)))
17936             {
17937                 /* Here <value> is indeed a multi-char fold.  Get what it is */
17938
17939                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17940                 STRLEN foldlen;
17941
17942                 UV folded = _to_uni_fold_flags(
17943                                 value,
17944                                 foldbuf,
17945                                 &foldlen,
17946                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17947                                                    ? FOLD_FLAGS_NOMIX_ASCII
17948                                                    : 0)
17949                                 );
17950
17951                 /* Here, <folded> should be the first character of the
17952                  * multi-char fold of <value>, with <foldbuf> containing the
17953                  * whole thing.  But, if this fold is not allowed (because of
17954                  * the flags), <fold> will be the same as <value>, and should
17955                  * be processed like any other character, so skip the special
17956                  * handling */
17957                 if (folded != value) {
17958
17959                     /* Skip if we are recursed, currently parsing the class
17960                      * again.  Otherwise add this character to the list of
17961                      * multi-char folds. */
17962                     if (! RExC_in_multi_char_class) {
17963                         STRLEN cp_count = utf8_length(foldbuf,
17964                                                       foldbuf + foldlen);
17965                         SV* multi_fold = sv_2mortal(newSVpvs(""));
17966
17967                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17968
17969                         multi_char_matches
17970                                         = add_multi_match(multi_char_matches,
17971                                                           multi_fold,
17972                                                           cp_count);
17973
17974                     }
17975
17976                     /* This element should not be processed further in this
17977                      * class */
17978                     element_count--;
17979                     value = save_value;
17980                     prevvalue = save_prevvalue;
17981                     continue;
17982                 }
17983             }
17984         }
17985
17986         if (strict && ckWARN(WARN_REGEXP)) {
17987             if (range) {
17988
17989                 /* If the range starts above 255, everything is portable and
17990                  * likely to be so for any forseeable character set, so don't
17991                  * warn. */
17992                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
17993                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
17994                 }
17995                 else if (prevvalue != value) {
17996
17997                     /* Under strict, ranges that stop and/or end in an ASCII
17998                      * printable should have each end point be a portable value
17999                      * for it (preferably like 'A', but we don't warn if it is
18000                      * a (portable) Unicode name or code point), and the range
18001                      * must be be all digits or all letters of the same case.
18002                      * Otherwise, the range is non-portable and unclear as to
18003                      * what it contains */
18004                     if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
18005                         && (          non_portable_endpoint
18006                             || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18007                                   || (isLOWER_A(prevvalue) && isLOWER_A(value))
18008                                   || (isUPPER_A(prevvalue) && isUPPER_A(value))
18009                     ))) {
18010                         vWARN(RExC_parse, "Ranges of ASCII printables should"
18011                                           " be some subset of \"0-9\","
18012                                           " \"A-Z\", or \"a-z\"");
18013                     }
18014                     else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18015                         SSize_t index_start;
18016                         SSize_t index_final;
18017
18018                         /* But the nature of Unicode and languages mean we
18019                          * can't do the same checks for above-ASCII ranges,
18020                          * except in the case of digit ones.  These should
18021                          * contain only digits from the same group of 10.  The
18022                          * ASCII case is handled just above.  Hence here, the
18023                          * range could be a range of digits.  First some
18024                          * unlikely special cases.  Grandfather in that a range
18025                          * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18026                          * if its starting value is one of the 10 digits prior
18027                          * to it.  This is because it is an alternate way of
18028                          * writing 19D1, and some people may expect it to be in
18029                          * that group.  But it is bad, because it won't give
18030                          * the expected results.  In Unicode 5.2 it was
18031                          * considered to be in that group (of 11, hence), but
18032                          * this was fixed in the next version */
18033
18034                         if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18035                             goto warn_bad_digit_range;
18036                         }
18037                         else if (UNLIKELY(   prevvalue >= 0x1D7CE
18038                                           &&     value <= 0x1D7FF))
18039                         {
18040                             /* This is the only other case currently in Unicode
18041                              * where the algorithm below fails.  The code
18042                              * points just above are the end points of a single
18043                              * range containing only decimal digits.  It is 5
18044                              * different series of 0-9.  All other ranges of
18045                              * digits currently in Unicode are just a single
18046                              * series.  (And mktables will notify us if a later
18047                              * Unicode version breaks this.)
18048                              *
18049                              * If the range being checked is at most 9 long,
18050                              * and the digit values represented are in
18051                              * numerical order, they are from the same series.
18052                              * */
18053                             if (         value - prevvalue > 9
18054                                 ||    (((    value - 0x1D7CE) % 10)
18055                                      <= (prevvalue - 0x1D7CE) % 10))
18056                             {
18057                                 goto warn_bad_digit_range;
18058                             }
18059                         }
18060                         else {
18061
18062                             /* For all other ranges of digits in Unicode, the
18063                              * algorithm is just to check if both end points
18064                              * are in the same series, which is the same range.
18065                              * */
18066                             index_start = _invlist_search(
18067                                                     PL_XPosix_ptrs[_CC_DIGIT],
18068                                                     prevvalue);
18069
18070                             /* Warn if the range starts and ends with a digit,
18071                              * and they are not in the same group of 10. */
18072                             if (   index_start >= 0
18073                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18074                                 && (index_final =
18075                                     _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18076                                                     value)) != index_start
18077                                 && index_final >= 0
18078                                 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18079                             {
18080                               warn_bad_digit_range:
18081                                 vWARN(RExC_parse, "Ranges of digits should be"
18082                                                   " from the same group of"
18083                                                   " 10");
18084                             }
18085                         }
18086                     }
18087                 }
18088             }
18089             if ((! range || prevvalue == value) && non_portable_endpoint) {
18090                 if (isPRINT_A(value)) {
18091                     char literal[3];
18092                     unsigned d = 0;
18093                     if (isBACKSLASHED_PUNCT(value)) {
18094                         literal[d++] = '\\';
18095                     }
18096                     literal[d++] = (char) value;
18097                     literal[d++] = '\0';
18098
18099                     vWARN4(RExC_parse,
18100                            "\"%.*s\" is more clearly written simply as \"%s\"",
18101                            (int) (RExC_parse - rangebegin),
18102                            rangebegin,
18103                            literal
18104                         );
18105                 }
18106                 else if (isMNEMONIC_CNTRL(value)) {
18107                     vWARN4(RExC_parse,
18108                            "\"%.*s\" is more clearly written simply as \"%s\"",
18109                            (int) (RExC_parse - rangebegin),
18110                            rangebegin,
18111                            cntrl_to_mnemonic((U8) value)
18112                         );
18113                 }
18114             }
18115         }
18116
18117         /* Deal with this element of the class */
18118
18119 #ifndef EBCDIC
18120         cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18121                                                     prevvalue, value);
18122 #else
18123         /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18124          * that don't require special handling, we can just add the range like
18125          * we do for ASCII platforms */
18126         if ((UNLIKELY(prevvalue == 0) && value >= 255)
18127             || ! (prevvalue < 256
18128                     && (unicode_range
18129                         || (! non_portable_endpoint
18130                             && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18131                                 || (isUPPER_A(prevvalue)
18132                                     && isUPPER_A(value)))))))
18133         {
18134             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18135                                                         prevvalue, value);
18136         }
18137         else {
18138             /* Here, requires special handling.  This can be because it is a
18139              * range whose code points are considered to be Unicode, and so
18140              * must be individually translated into native, or because its a
18141              * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18142              * EBCDIC, but we have defined them to include only the "expected"
18143              * upper or lower case ASCII alphabetics.  Subranges above 255 are
18144              * the same in native and Unicode, so can be added as a range */
18145             U8 start = NATIVE_TO_LATIN1(prevvalue);
18146             unsigned j;
18147             U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18148             for (j = start; j <= end; j++) {
18149                 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18150             }
18151             if (value > 255) {
18152                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18153                                                             256, value);
18154             }
18155         }
18156 #endif
18157
18158         range = 0; /* this range (if it was one) is done now */
18159     } /* End of loop through all the text within the brackets */
18160
18161     if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18162         output_posix_warnings(pRExC_state, posix_warnings);
18163     }
18164
18165     /* If anything in the class expands to more than one character, we have to
18166      * deal with them by building up a substitute parse string, and recursively
18167      * calling reg() on it, instead of proceeding */
18168     if (multi_char_matches) {
18169         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18170         I32 cp_count;
18171         STRLEN len;
18172         char *save_end = RExC_end;
18173         char *save_parse = RExC_parse;
18174         char *save_start = RExC_start;
18175         Size_t constructed_prefix_len = 0; /* This gives the length of the
18176                                               constructed portion of the
18177                                               substitute parse. */
18178         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
18179                                        a "|" */
18180         I32 reg_flags;
18181
18182         assert(! invert);
18183         /* Only one level of recursion allowed */
18184         assert(RExC_copy_start_in_constructed == RExC_precomp);
18185
18186 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
18187            because too confusing */
18188         if (invert) {
18189             sv_catpvs(substitute_parse, "(?:");
18190         }
18191 #endif
18192
18193         /* Look at the longest folds first */
18194         for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18195                         cp_count > 0;
18196                         cp_count--)
18197         {
18198
18199             if (av_exists(multi_char_matches, cp_count)) {
18200                 AV** this_array_ptr;
18201                 SV* this_sequence;
18202
18203                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18204                                                  cp_count, FALSE);
18205                 while ((this_sequence = av_pop(*this_array_ptr)) !=
18206                                                                 &PL_sv_undef)
18207                 {
18208                     if (! first_time) {
18209                         sv_catpvs(substitute_parse, "|");
18210                     }
18211                     first_time = FALSE;
18212
18213                     sv_catpv(substitute_parse, SvPVX(this_sequence));
18214                 }
18215             }
18216         }
18217
18218         /* If the character class contains anything else besides these
18219          * multi-character folds, have to include it in recursive parsing */
18220         if (element_count) {
18221             sv_catpvs(substitute_parse, "|[");
18222             constructed_prefix_len = SvCUR(substitute_parse);
18223             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18224
18225             /* Put in a closing ']' only if not going off the end, as otherwise
18226              * we are adding something that really isn't there */
18227             if (RExC_parse < RExC_end) {
18228                 sv_catpvs(substitute_parse, "]");
18229             }
18230         }
18231
18232         sv_catpvs(substitute_parse, ")");
18233 #if 0
18234         if (invert) {
18235             /* This is a way to get the parse to skip forward a whole named
18236              * sequence instead of matching the 2nd character when it fails the
18237              * first */
18238             sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18239         }
18240 #endif
18241
18242         /* Set up the data structure so that any errors will be properly
18243          * reported.  See the comments at the definition of
18244          * REPORT_LOCATION_ARGS for details */
18245         RExC_copy_start_in_input = (char *) orig_parse;
18246         RExC_start = RExC_parse = SvPV(substitute_parse, len);
18247         RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18248         RExC_end = RExC_parse + len;
18249         RExC_in_multi_char_class = 1;
18250
18251         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
18252
18253         *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
18254
18255         /* And restore so can parse the rest of the pattern */
18256         RExC_parse = save_parse;
18257         RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18258         RExC_end = save_end;
18259         RExC_in_multi_char_class = 0;
18260         SvREFCNT_dec_NN(multi_char_matches);
18261         return ret;
18262     }
18263
18264     /* If folding, we calculate all characters that could fold to or from the
18265      * ones already on the list */
18266     if (cp_foldable_list) {
18267         if (FOLD) {
18268             UV start, end;      /* End points of code point ranges */
18269
18270             SV* fold_intersection = NULL;
18271             SV** use_list;
18272
18273             /* Our calculated list will be for Unicode rules.  For locale
18274              * matching, we have to keep a separate list that is consulted at
18275              * runtime only when the locale indicates Unicode rules (and we
18276              * don't include potential matches in the ASCII/Latin1 range, as
18277              * any code point could fold to any other, based on the run-time
18278              * locale).   For non-locale, we just use the general list */
18279             if (LOC) {
18280                 use_list = &only_utf8_locale_list;
18281             }
18282             else {
18283                 use_list = &cp_list;
18284             }
18285
18286             /* Only the characters in this class that participate in folds need
18287              * be checked.  Get the intersection of this class and all the
18288              * possible characters that are foldable.  This can quickly narrow
18289              * down a large class */
18290             _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18291                                   &fold_intersection);
18292
18293             /* Now look at the foldable characters in this class individually */
18294             invlist_iterinit(fold_intersection);
18295             while (invlist_iternext(fold_intersection, &start, &end)) {
18296                 UV j;
18297                 UV folded;
18298
18299                 /* Look at every character in the range */
18300                 for (j = start; j <= end; j++) {
18301                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18302                     STRLEN foldlen;
18303                     unsigned int k;
18304                     Size_t folds_count;
18305                     unsigned int first_fold;
18306                     const unsigned int * remaining_folds;
18307
18308                     if (j < 256) {
18309
18310                         /* Under /l, we don't know what code points below 256
18311                          * fold to, except we do know the MICRO SIGN folds to
18312                          * an above-255 character if the locale is UTF-8, so we
18313                          * add it to the special list (in *use_list)  Otherwise
18314                          * we know now what things can match, though some folds
18315                          * are valid under /d only if the target is UTF-8.
18316                          * Those go in a separate list */
18317                         if (      IS_IN_SOME_FOLD_L1(j)
18318                             && ! (LOC && j != MICRO_SIGN))
18319                         {
18320
18321                             /* ASCII is always matched; non-ASCII is matched
18322                              * only under Unicode rules (which could happen
18323                              * under /l if the locale is a UTF-8 one */
18324                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18325                                 *use_list = add_cp_to_invlist(*use_list,
18326                                                             PL_fold_latin1[j]);
18327                             }
18328                             else if (j != PL_fold_latin1[j]) {
18329                                 upper_latin1_only_utf8_matches
18330                                         = add_cp_to_invlist(
18331                                                 upper_latin1_only_utf8_matches,
18332                                                 PL_fold_latin1[j]);
18333                             }
18334                         }
18335
18336                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18337                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18338                         {
18339                             add_above_Latin1_folds(pRExC_state,
18340                                                    (U8) j,
18341                                                    use_list);
18342                         }
18343                         continue;
18344                     }
18345
18346                     /* Here is an above Latin1 character.  We don't have the
18347                      * rules hard-coded for it.  First, get its fold.  This is
18348                      * the simple fold, as the multi-character folds have been
18349                      * handled earlier and separated out */
18350                     folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18351                                                         (ASCII_FOLD_RESTRICTED)
18352                                                         ? FOLD_FLAGS_NOMIX_ASCII
18353                                                         : 0);
18354
18355                     /* Single character fold of above Latin1.  Add everything
18356                      * in its fold closure to the list that this node should
18357                      * match. */
18358                     folds_count = _inverse_folds(folded, &first_fold,
18359                                                     &remaining_folds);
18360                     for (k = 0; k <= folds_count; k++) {
18361                         UV c = (k == 0)     /* First time through use itself */
18362                                 ? folded
18363                                 : (k == 1)  /* 2nd time use, the first fold */
18364                                    ? first_fold
18365
18366                                      /* Then the remaining ones */
18367                                    : remaining_folds[k-2];
18368
18369                         /* /aa doesn't allow folds between ASCII and non- */
18370                         if ((   ASCII_FOLD_RESTRICTED
18371                             && (isASCII(c) != isASCII(j))))
18372                         {
18373                             continue;
18374                         }
18375
18376                         /* Folds under /l which cross the 255/256 boundary are
18377                          * added to a separate list.  (These are valid only
18378                          * when the locale is UTF-8.) */
18379                         if (c < 256 && LOC) {
18380                             *use_list = add_cp_to_invlist(*use_list, c);
18381                             continue;
18382                         }
18383
18384                         if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18385                         {
18386                             cp_list = add_cp_to_invlist(cp_list, c);
18387                         }
18388                         else {
18389                             /* Similarly folds involving non-ascii Latin1
18390                              * characters under /d are added to their list */
18391                             upper_latin1_only_utf8_matches
18392                                     = add_cp_to_invlist(
18393                                                 upper_latin1_only_utf8_matches,
18394                                                 c);
18395                         }
18396                     }
18397                 }
18398             }
18399             SvREFCNT_dec_NN(fold_intersection);
18400         }
18401
18402         /* Now that we have finished adding all the folds, there is no reason
18403          * to keep the foldable list separate */
18404         _invlist_union(cp_list, cp_foldable_list, &cp_list);
18405         SvREFCNT_dec_NN(cp_foldable_list);
18406     }
18407
18408     /* And combine the result (if any) with any inversion lists from posix
18409      * classes.  The lists are kept separate up to now because we don't want to
18410      * fold the classes */
18411     if (simple_posixes) {   /* These are the classes known to be unaffected by
18412                                /a, /aa, and /d */
18413         if (cp_list) {
18414             _invlist_union(cp_list, simple_posixes, &cp_list);
18415             SvREFCNT_dec_NN(simple_posixes);
18416         }
18417         else {
18418             cp_list = simple_posixes;
18419         }
18420     }
18421     if (posixes || nposixes) {
18422         if (! DEPENDS_SEMANTICS) {
18423
18424             /* For everything but /d, we can just add the current 'posixes' and
18425              * 'nposixes' to the main list */
18426             if (posixes) {
18427                 if (cp_list) {
18428                     _invlist_union(cp_list, posixes, &cp_list);
18429                     SvREFCNT_dec_NN(posixes);
18430                 }
18431                 else {
18432                     cp_list = posixes;
18433                 }
18434             }
18435             if (nposixes) {
18436                 if (cp_list) {
18437                     _invlist_union(cp_list, nposixes, &cp_list);
18438                     SvREFCNT_dec_NN(nposixes);
18439                 }
18440                 else {
18441                     cp_list = nposixes;
18442                 }
18443             }
18444         }
18445         else {
18446             /* Under /d, things like \w match upper Latin1 characters only if
18447              * the target string is in UTF-8.  But things like \W match all the
18448              * upper Latin1 characters if the target string is not in UTF-8.
18449              *
18450              * Handle the case with something like \W separately */
18451             if (nposixes) {
18452                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18453
18454                 /* A complemented posix class matches all upper Latin1
18455                  * characters if not in UTF-8.  And it matches just certain
18456                  * ones when in UTF-8.  That means those certain ones are
18457                  * matched regardless, so can just be added to the
18458                  * unconditional list */
18459                 if (cp_list) {
18460                     _invlist_union(cp_list, nposixes, &cp_list);
18461                     SvREFCNT_dec_NN(nposixes);
18462                     nposixes = NULL;
18463                 }
18464                 else {
18465                     cp_list = nposixes;
18466                 }
18467
18468                 /* Likewise for 'posixes' */
18469                 _invlist_union(posixes, cp_list, &cp_list);
18470                 SvREFCNT_dec(posixes);
18471
18472                 /* Likewise for anything else in the range that matched only
18473                  * under UTF-8 */
18474                 if (upper_latin1_only_utf8_matches) {
18475                     _invlist_union(cp_list,
18476                                    upper_latin1_only_utf8_matches,
18477                                    &cp_list);
18478                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18479                     upper_latin1_only_utf8_matches = NULL;
18480                 }
18481
18482                 /* If we don't match all the upper Latin1 characters regardless
18483                  * of UTF-8ness, we have to set a flag to match the rest when
18484                  * not in UTF-8 */
18485                 _invlist_subtract(only_non_utf8_list, cp_list,
18486                                   &only_non_utf8_list);
18487                 if (_invlist_len(only_non_utf8_list) != 0) {
18488                     anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18489                 }
18490                 SvREFCNT_dec_NN(only_non_utf8_list);
18491             }
18492             else {
18493                 /* Here there were no complemented posix classes.  That means
18494                  * the upper Latin1 characters in 'posixes' match only when the
18495                  * target string is in UTF-8.  So we have to add them to the
18496                  * list of those types of code points, while adding the
18497                  * remainder to the unconditional list.
18498                  *
18499                  * First calculate what they are */
18500                 SV* nonascii_but_latin1_properties = NULL;
18501                 _invlist_intersection(posixes, PL_UpperLatin1,
18502                                       &nonascii_but_latin1_properties);
18503
18504                 /* And add them to the final list of such characters. */
18505                 _invlist_union(upper_latin1_only_utf8_matches,
18506                                nonascii_but_latin1_properties,
18507                                &upper_latin1_only_utf8_matches);
18508
18509                 /* Remove them from what now becomes the unconditional list */
18510                 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18511                                   &posixes);
18512
18513                 /* And add those unconditional ones to the final list */
18514                 if (cp_list) {
18515                     _invlist_union(cp_list, posixes, &cp_list);
18516                     SvREFCNT_dec_NN(posixes);
18517                     posixes = NULL;
18518                 }
18519                 else {
18520                     cp_list = posixes;
18521                 }
18522
18523                 SvREFCNT_dec(nonascii_but_latin1_properties);
18524
18525                 /* Get rid of any characters from the conditional list that we
18526                  * now know are matched unconditionally, which may make that
18527                  * list empty */
18528                 _invlist_subtract(upper_latin1_only_utf8_matches,
18529                                   cp_list,
18530                                   &upper_latin1_only_utf8_matches);
18531                 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18532                     SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18533                     upper_latin1_only_utf8_matches = NULL;
18534                 }
18535             }
18536         }
18537     }
18538
18539     /* And combine the result (if any) with any inversion list from properties.
18540      * The lists are kept separate up to now so that we can distinguish the two
18541      * in regards to matching above-Unicode.  A run-time warning is generated
18542      * if a Unicode property is matched against a non-Unicode code point. But,
18543      * we allow user-defined properties to match anything, without any warning,
18544      * and we also suppress the warning if there is a portion of the character
18545      * class that isn't a Unicode property, and which matches above Unicode, \W
18546      * or [\x{110000}] for example.
18547      * (Note that in this case, unlike the Posix one above, there is no
18548      * <upper_latin1_only_utf8_matches>, because having a Unicode property
18549      * forces Unicode semantics */
18550     if (properties) {
18551         if (cp_list) {
18552
18553             /* If it matters to the final outcome, see if a non-property
18554              * component of the class matches above Unicode.  If so, the
18555              * warning gets suppressed.  This is true even if just a single
18556              * such code point is specified, as, though not strictly correct if
18557              * another such code point is matched against, the fact that they
18558              * are using above-Unicode code points indicates they should know
18559              * the issues involved */
18560             if (warn_super) {
18561                 warn_super = ! (invert
18562                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18563             }
18564
18565             _invlist_union(properties, cp_list, &cp_list);
18566             SvREFCNT_dec_NN(properties);
18567         }
18568         else {
18569             cp_list = properties;
18570         }
18571
18572         if (warn_super) {
18573             anyof_flags
18574              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18575
18576             /* Because an ANYOF node is the only one that warns, this node
18577              * can't be optimized into something else */
18578             optimizable = FALSE;
18579         }
18580     }
18581
18582     /* Here, we have calculated what code points should be in the character
18583      * class.
18584      *
18585      * Now we can see about various optimizations.  Fold calculation (which we
18586      * did above) needs to take place before inversion.  Otherwise /[^k]/i
18587      * would invert to include K, which under /i would match k, which it
18588      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
18589      * folded until runtime */
18590
18591     /* If we didn't do folding, it's because some information isn't available
18592      * until runtime; set the run-time fold flag for these  We know to set the
18593      * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18594      * at least one 0-255 range code point */
18595     if (LOC && FOLD) {
18596
18597         /* Some things on the list might be unconditionally included because of
18598          * other components.  Remove them, and clean up the list if it goes to
18599          * 0 elements */
18600         if (only_utf8_locale_list && cp_list) {
18601             _invlist_subtract(only_utf8_locale_list, cp_list,
18602                               &only_utf8_locale_list);
18603
18604             if (_invlist_len(only_utf8_locale_list) == 0) {
18605                 SvREFCNT_dec_NN(only_utf8_locale_list);
18606                 only_utf8_locale_list = NULL;
18607             }
18608         }
18609         if (    only_utf8_locale_list
18610             || (cp_list && (   _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18611                             || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18612         {
18613             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18614             anyof_flags
18615                  |= ANYOFL_FOLD
18616                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18617         }
18618         else if (cp_list && invlist_lowest(cp_list) < 256) {
18619             /* If nothing is below 256, has no locale dependency; otherwise it
18620              * does */
18621             anyof_flags |= ANYOFL_FOLD;
18622             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18623         }
18624     }
18625     else if (   DEPENDS_SEMANTICS
18626              && (    upper_latin1_only_utf8_matches
18627                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18628     {
18629         RExC_seen_d_op = TRUE;
18630         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18631     }
18632
18633     /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18634      * compile time. */
18635     if (     cp_list
18636         &&   invert
18637         && ! has_runtime_dependency)
18638     {
18639         _invlist_invert(cp_list);
18640
18641         /* Clear the invert flag since have just done it here */
18642         invert = FALSE;
18643     }
18644
18645     if (ret_invlist) {
18646         *ret_invlist = cp_list;
18647
18648         return RExC_emit;
18649     }
18650
18651     /* All possible optimizations below still have these characteristics.
18652      * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18653      * routine) */
18654     *flagp |= HASWIDTH|SIMPLE;
18655
18656     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18657         RExC_contains_locale = 1;
18658     }
18659
18660     /* Some character classes are equivalent to other nodes.  Such nodes take
18661      * up less room, and some nodes require fewer operations to execute, than
18662      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
18663      * improve efficiency. */
18664
18665     if (optimizable) {
18666         PERL_UINT_FAST8_T i;
18667         UV partial_cp_count = 0;
18668         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
18669         UV   end[MAX_FOLD_FROMS+1] = { 0 };
18670         bool single_range = FALSE;
18671
18672         if (cp_list) { /* Count the code points in enough ranges that we would
18673                           see all the ones possible in any fold in this version
18674                           of Unicode */
18675
18676             invlist_iterinit(cp_list);
18677             for (i = 0; i <= MAX_FOLD_FROMS; i++) {
18678                 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
18679                     break;
18680                 }
18681                 partial_cp_count += end[i] - start[i] + 1;
18682             }
18683
18684             if (i == 1) {
18685                 single_range = TRUE;
18686             }
18687             invlist_iterfinish(cp_list);
18688         }
18689
18690         /* If we know at compile time that this matches every possible code
18691          * point, any run-time dependencies don't matter */
18692         if (start[0] == 0 && end[0] == UV_MAX) {
18693             if (invert) {
18694                 ret = reganode(pRExC_state, OPFAIL, 0);
18695             }
18696             else {
18697                 ret = reg_node(pRExC_state, SANY);
18698                 MARK_NAUGHTY(1);
18699             }
18700             goto not_anyof;
18701         }
18702
18703         /* Similarly, for /l posix classes, if both a class and its
18704          * complement match, any run-time dependencies don't matter */
18705         if (posixl) {
18706             for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
18707                                                         namedclass += 2)
18708             {
18709                 if (   POSIXL_TEST(posixl, namedclass)      /* class */
18710                     && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
18711                 {
18712                     if (invert) {
18713                         ret = reganode(pRExC_state, OPFAIL, 0);
18714                     }
18715                     else {
18716                         ret = reg_node(pRExC_state, SANY);
18717                         MARK_NAUGHTY(1);
18718                     }
18719                     goto not_anyof;
18720                 }
18721             }
18722
18723             /* For well-behaved locales, some classes are subsets of others,
18724              * so complementing the subset and including the non-complemented
18725              * superset should match everything, like [\D[:alnum:]], and
18726              * [[:^alpha:][:alnum:]], but some implementations of locales are
18727              * buggy, and khw thinks its a bad idea to have optimization change
18728              * behavior, even if it avoids an OS bug in a given case */
18729
18730 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
18731
18732             /* If is a single posix /l class, can optimize to just that op.
18733              * Such a node will not match anything in the Latin1 range, as that
18734              * is not determinable until runtime, but will match whatever the
18735              * class does outside that range.  (Note that some classes won't
18736              * match anything outside the range, like [:ascii:]) */
18737             if (    isSINGLE_BIT_SET(posixl)
18738                 && (partial_cp_count == 0 || start[0] > 255))
18739             {
18740                 U8 classnum;
18741                 SV * class_above_latin1 = NULL;
18742                 bool already_inverted;
18743                 bool are_equivalent;
18744
18745                 /* Compute which bit is set, which is the same thing as, e.g.,
18746                  * ANYOF_CNTRL.  From
18747                  * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
18748                  * */
18749                 static const int MultiplyDeBruijnBitPosition2[32] =
18750                     {
18751                     0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
18752                     31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
18753                     };
18754
18755                 namedclass = MultiplyDeBruijnBitPosition2[(posixl
18756                                                           * 0x077CB531U) >> 27];
18757                 classnum = namedclass_to_classnum(namedclass);
18758
18759                 /* The named classes are such that the inverted number is one
18760                  * larger than the non-inverted one */
18761                 already_inverted = namedclass
18762                                  - classnum_to_namedclass(classnum);
18763
18764                 /* Create an inversion list of the official property, inverted
18765                  * if the constructed node list is inverted, and restricted to
18766                  * only the above latin1 code points, which are the only ones
18767                  * known at compile time */
18768                 _invlist_intersection_maybe_complement_2nd(
18769                                                     PL_AboveLatin1,
18770                                                     PL_XPosix_ptrs[classnum],
18771                                                     already_inverted,
18772                                                     &class_above_latin1);
18773                 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
18774                                                                         FALSE);
18775                 SvREFCNT_dec_NN(class_above_latin1);
18776
18777                 if (are_equivalent) {
18778
18779                     /* Resolve the run-time inversion flag with this possibly
18780                      * inverted class */
18781                     invert = invert ^ already_inverted;
18782
18783                     ret = reg_node(pRExC_state,
18784                                    POSIXL + invert * (NPOSIXL - POSIXL));
18785                     FLAGS(REGNODE_p(ret)) = classnum;
18786                     goto not_anyof;
18787                 }
18788             }
18789         }
18790
18791         /* khw can't think of any other possible transformation involving
18792          * these. */
18793         if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
18794             goto is_anyof;
18795         }
18796
18797         if (! has_runtime_dependency) {
18798
18799             /* If the list is empty, nothing matches.  This happens, for
18800              * example, when a Unicode property that doesn't match anything is
18801              * the only element in the character class (perluniprops.pod notes
18802              * such properties). */
18803             if (partial_cp_count == 0) {
18804                 if (invert) {
18805                     ret = reg_node(pRExC_state, SANY);
18806                 }
18807                 else {
18808                     ret = reganode(pRExC_state, OPFAIL, 0);
18809                 }
18810
18811                 goto not_anyof;
18812             }
18813
18814             /* If matches everything but \n */
18815             if (   start[0] == 0 && end[0] == '\n' - 1
18816                 && start[1] == '\n' + 1 && end[1] == UV_MAX)
18817             {
18818                 assert (! invert);
18819                 ret = reg_node(pRExC_state, REG_ANY);
18820                 MARK_NAUGHTY(1);
18821                 goto not_anyof;
18822             }
18823         }
18824
18825         /* Next see if can optimize classes that contain just a few code points
18826          * into an EXACTish node.  The reason to do this is to let the
18827          * optimizer join this node with adjacent EXACTish ones, and ANYOF
18828          * nodes require conversion to code point from UTF-8.
18829          *
18830          * An EXACTFish node can be generated even if not under /i, and vice
18831          * versa.  But care must be taken.  An EXACTFish node has to be such
18832          * that it only matches precisely the code points in the class, but we
18833          * want to generate the least restrictive one that does that, to
18834          * increase the odds of being able to join with an adjacent node.  For
18835          * example, if the class contains [kK], we have to make it an EXACTFAA
18836          * node to prevent the KELVIN SIGN from matching.  Whether we are under
18837          * /i or not is irrelevant in this case.  Less obvious is the pattern
18838          * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
18839          * supposed to match the single character U+0149 LATIN SMALL LETTER N
18840          * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
18841          * that includes \X{02BC}, there is a multi-char fold that does, and so
18842          * the node generated for it must be an EXACTFish one.  On the other
18843          * hand qr/:/i should generate a plain EXACT node since the colon
18844          * participates in no fold whatsoever, and having it EXACT tells the
18845          * optimizer the target string cannot match unless it has a colon in
18846          * it.
18847          */
18848         if (   ! posixl
18849             && ! invert
18850
18851                 /* Only try if there are no more code points in the class than
18852                  * in the max possible fold */
18853             &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
18854         {
18855             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
18856             {
18857                 /* We can always make a single code point class into an
18858                  * EXACTish node. */
18859
18860                 if (LOC) {
18861
18862                     /* Here is /l:  Use EXACTL, except if there is a fold not
18863                      * known until runtime so shows as only a single code point
18864                      * here.  For code points above 255, we know which can
18865                      * cause problems by having a potential fold to the Latin1
18866                      * range. */
18867                     if (  ! FOLD
18868                         || (     start[0] > 255
18869                             && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
18870                     {
18871                         op = EXACTL;
18872                     }
18873                     else {
18874                         op = EXACTFL;
18875                     }
18876                 }
18877                 else if (! FOLD) { /* Not /l and not /i */
18878                     op = (start[0] < 256) ? EXACT : EXACT_REQ8;
18879                 }
18880                 else if (start[0] < 256) { /* /i, not /l, and the code point is
18881                                               small */
18882
18883                     /* Under /i, it gets a little tricky.  A code point that
18884                      * doesn't participate in a fold should be an EXACT node.
18885                      * We know this one isn't the result of a simple fold, or
18886                      * there'd be more than one code point in the list, but it
18887                      * could be part of a multi- character fold.  In that case
18888                      * we better not create an EXACT node, as we would wrongly
18889                      * be telling the optimizer that this code point must be in
18890                      * the target string, and that is wrong.  This is because
18891                      * if the sequence around this code point forms a
18892                      * multi-char fold, what needs to be in the string could be
18893                      * the code point that folds to the sequence.
18894                      *
18895                      * This handles the case of below-255 code points, as we
18896                      * have an easy look up for those.  The next clause handles
18897                      * the above-256 one */
18898                     op = IS_IN_SOME_FOLD_L1(start[0])
18899                          ? EXACTFU
18900                          : EXACT;
18901                 }
18902                 else {  /* /i, larger code point.  Since we are under /i, and
18903                            have just this code point, we know that it can't
18904                            fold to something else, so PL_InMultiCharFold
18905                            applies to it */
18906                     op = _invlist_contains_cp(PL_InMultiCharFold,
18907                                               start[0])
18908                          ? EXACTFU_REQ8
18909                          : EXACT_REQ8;
18910                 }
18911
18912                 value = start[0];
18913             }
18914             else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
18915                      && _invlist_contains_cp(PL_in_some_fold, start[0]))
18916             {
18917                 /* Here, the only runtime dependency, if any, is from /d, and
18918                  * the class matches more than one code point, and the lowest
18919                  * code point participates in some fold.  It might be that the
18920                  * other code points are /i equivalent to this one, and hence
18921                  * they would representable by an EXACTFish node.  Above, we
18922                  * eliminated classes that contain too many code points to be
18923                  * EXACTFish, with the test for MAX_FOLD_FROMS
18924                  *
18925                  * First, special case the ASCII fold pairs, like 'B' and 'b'.
18926                  * We do this because we have EXACTFAA at our disposal for the
18927                  * ASCII range */
18928                 if (partial_cp_count == 2 && isASCII(start[0])) {
18929
18930                     /* The only ASCII characters that participate in folds are
18931                      * alphabetics */
18932                     assert(isALPHA(start[0]));
18933                     if (   end[0] == start[0]   /* First range is a single
18934                                                    character, so 2nd exists */
18935                         && isALPHA_FOLD_EQ(start[0], start[1]))
18936                     {
18937
18938                         /* Here, is part of an ASCII fold pair */
18939
18940                         if (   ASCII_FOLD_RESTRICTED
18941                             || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
18942                         {
18943                             /* If the second clause just above was true, it
18944                              * means we can't be under /i, or else the list
18945                              * would have included more than this fold pair.
18946                              * Therefore we have to exclude the possibility of
18947                              * whatever else it is that folds to these, by
18948                              * using EXACTFAA */
18949                             op = EXACTFAA;
18950                         }
18951                         else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
18952
18953                             /* Here, there's no simple fold that start[0] is part
18954                              * of, but there is a multi-character one.  If we
18955                              * are not under /i, we want to exclude that
18956                              * possibility; if under /i, we want to include it
18957                              * */
18958                             op = (FOLD) ? EXACTFU : EXACTFAA;
18959                         }
18960                         else {
18961
18962                             /* Here, the only possible fold start[0] particpates in
18963                              * is with start[1].  /i or not isn't relevant */
18964                             op = EXACTFU;
18965                         }
18966
18967                         value = toFOLD(start[0]);
18968                     }
18969                 }
18970                 else if (  ! upper_latin1_only_utf8_matches
18971                          || (   _invlist_len(upper_latin1_only_utf8_matches)
18972                                                                           == 2
18973                              && PL_fold_latin1[
18974                                invlist_highest(upper_latin1_only_utf8_matches)]
18975                              == start[0]))
18976                 {
18977                     /* Here, the smallest character is non-ascii or there are
18978                      * more than 2 code points matched by this node.  Also, we
18979                      * either don't have /d UTF-8 dependent matches, or if we
18980                      * do, they look like they could be a single character that
18981                      * is the fold of the lowest one in the always-match list.
18982                      * This test quickly excludes most of the false positives
18983                      * when there are /d UTF-8 depdendent matches.  These are
18984                      * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
18985                      * SMALL LETTER A WITH GRAVE iff the target string is
18986                      * UTF-8.  (We don't have to worry above about exceeding
18987                      * the array bounds of PL_fold_latin1[] because any code
18988                      * point in 'upper_latin1_only_utf8_matches' is below 256.)
18989                      *
18990                      * EXACTFAA would apply only to pairs (hence exactly 2 code
18991                      * points) in the ASCII range, so we can't use it here to
18992                      * artificially restrict the fold domain, so we check if
18993                      * the class does or does not match some EXACTFish node.
18994                      * Further, if we aren't under /i, and and the folded-to
18995                      * character is part of a multi-character fold, we can't do
18996                      * this optimization, as the sequence around it could be
18997                      * that multi-character fold, and we don't here know the
18998                      * context, so we have to assume it is that multi-char
18999                      * fold, to prevent potential bugs.
19000                      *
19001                      * To do the general case, we first find the fold of the
19002                      * lowest code point (which may be higher than the lowest
19003                      * one), then find everything that folds to it.  (The data
19004                      * structure we have only maps from the folded code points,
19005                      * so we have to do the earlier step.) */
19006
19007                     Size_t foldlen;
19008                     U8 foldbuf[UTF8_MAXBYTES_CASE];
19009                     UV folded = _to_uni_fold_flags(start[0],
19010                                                         foldbuf, &foldlen, 0);
19011                     unsigned int first_fold;
19012                     const unsigned int * remaining_folds;
19013                     Size_t folds_to_this_cp_count = _inverse_folds(
19014                                                             folded,
19015                                                             &first_fold,
19016                                                             &remaining_folds);
19017                     Size_t folds_count = folds_to_this_cp_count + 1;
19018                     SV * fold_list = _new_invlist(folds_count);
19019                     unsigned int i;
19020
19021                     /* If there are UTF-8 dependent matches, create a temporary
19022                      * list of what this node matches, including them. */
19023                     SV * all_cp_list = NULL;
19024                     SV ** use_this_list = &cp_list;
19025
19026                     if (upper_latin1_only_utf8_matches) {
19027                         all_cp_list = _new_invlist(0);
19028                         use_this_list = &all_cp_list;
19029                         _invlist_union(cp_list,
19030                                        upper_latin1_only_utf8_matches,
19031                                        use_this_list);
19032                     }
19033
19034                     /* Having gotten everything that participates in the fold
19035                      * containing the lowest code point, we turn that into an
19036                      * inversion list, making sure everything is included. */
19037                     fold_list = add_cp_to_invlist(fold_list, start[0]);
19038                     fold_list = add_cp_to_invlist(fold_list, folded);
19039                     if (folds_to_this_cp_count > 0) {
19040                         fold_list = add_cp_to_invlist(fold_list, first_fold);
19041                         for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19042                             fold_list = add_cp_to_invlist(fold_list,
19043                                                         remaining_folds[i]);
19044                         }
19045                     }
19046
19047                     /* If the fold list is identical to what's in this ANYOF
19048                      * node, the node can be represented by an EXACTFish one
19049                      * instead */
19050                     if (_invlistEQ(*use_this_list, fold_list,
19051                                    0 /* Don't complement */ )
19052                     ) {
19053
19054                         /* But, we have to be careful, as mentioned above.
19055                          * Just the right sequence of characters could match
19056                          * this if it is part of a multi-character fold.  That
19057                          * IS what we want if we are under /i.  But it ISN'T
19058                          * what we want if not under /i, as it could match when
19059                          * it shouldn't.  So, when we aren't under /i and this
19060                          * character participates in a multi-char fold, we
19061                          * don't optimize into an EXACTFish node.  So, for each
19062                          * case below we have to check if we are folding
19063                          * and if not, if it is not part of a multi-char fold.
19064                          * */
19065                         if (start[0] > 255) {    /* Highish code point */
19066                             if (FOLD || ! _invlist_contains_cp(
19067                                             PL_InMultiCharFold, folded))
19068                             {
19069                                 op = (LOC)
19070                                      ? EXACTFLU8
19071                                      : (ASCII_FOLD_RESTRICTED)
19072                                        ? EXACTFAA
19073                                        : EXACTFU_REQ8;
19074                                 value = folded;
19075                             }
19076                         }   /* Below, the lowest code point < 256 */
19077                         else if (    FOLD
19078                                  &&  folded == 's'
19079                                  &&  DEPENDS_SEMANTICS)
19080                         {   /* An EXACTF node containing a single character
19081                                 's', can be an EXACTFU if it doesn't get
19082                                 joined with an adjacent 's' */
19083                             op = EXACTFU_S_EDGE;
19084                             value = folded;
19085                         }
19086                         else if (    FOLD
19087                                 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19088                         {
19089                             if (upper_latin1_only_utf8_matches) {
19090                                 op = EXACTF;
19091
19092                                 /* We can't use the fold, as that only matches
19093                                  * under UTF-8 */
19094                                 value = start[0];
19095                             }
19096                             else if (     UNLIKELY(start[0] == MICRO_SIGN)
19097                                      && ! UTF)
19098                             {   /* EXACTFUP is a special node for this
19099                                    character */
19100                                 op = (ASCII_FOLD_RESTRICTED)
19101                                      ? EXACTFAA
19102                                      : EXACTFUP;
19103                                 value = MICRO_SIGN;
19104                             }
19105                             else if (     ASCII_FOLD_RESTRICTED
19106                                      && ! isASCII(start[0]))
19107                             {   /* For ASCII under /iaa, we can use EXACTFU
19108                                    below */
19109                                 op = EXACTFAA;
19110                                 value = folded;
19111                             }
19112                             else {
19113                                 op = EXACTFU;
19114                                 value = folded;
19115                             }
19116                         }
19117                     }
19118
19119                     SvREFCNT_dec_NN(fold_list);
19120                     SvREFCNT_dec(all_cp_list);
19121                 }
19122             }
19123
19124             if (op != END) {
19125                 U8 len;
19126
19127                 /* Here, we have calculated what EXACTish node to use.  Have to
19128                  * convert to UTF-8 if not already there */
19129                 if (value > 255) {
19130                     if (! UTF) {
19131                         SvREFCNT_dec(cp_list);;
19132                         REQUIRE_UTF8(flagp);
19133                     }
19134
19135                     /* This is a kludge to the special casing issues with this
19136                      * ligature under /aa.  FB05 should fold to FB06, but the
19137                      * call above to _to_uni_fold_flags() didn't find this, as
19138                      * it didn't use the /aa restriction in order to not miss
19139                      * other folds that would be affected.  This is the only
19140                      * instance likely to ever be a problem in all of Unicode.
19141                      * So special case it. */
19142                     if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
19143                         && ASCII_FOLD_RESTRICTED)
19144                     {
19145                         value = LATIN_SMALL_LIGATURE_ST;
19146                     }
19147                 }
19148
19149                 len = (UTF) ? UVCHR_SKIP(value) : 1;
19150
19151                 ret = regnode_guts(pRExC_state, op, len, "exact");
19152                 FILL_NODE(ret, op);
19153                 RExC_emit += 1 + STR_SZ(len);
19154                 setSTR_LEN(REGNODE_p(ret), len);
19155                 if (len == 1) {
19156                     *STRINGs(REGNODE_p(ret)) = (U8) value;
19157                 }
19158                 else {
19159                     uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19160                 }
19161                 goto not_anyof;
19162             }
19163         }
19164
19165         if (! has_runtime_dependency) {
19166
19167             /* See if this can be turned into an ANYOFM node.  Think about the
19168              * bit patterns in two different bytes.  In some positions, the
19169              * bits in each will be 1; and in other positions both will be 0;
19170              * and in some positions the bit will be 1 in one byte, and 0 in
19171              * the other.  Let 'n' be the number of positions where the bits
19172              * differ.  We create a mask which has exactly 'n' 0 bits, each in
19173              * a position where the two bytes differ.  Now take the set of all
19174              * bytes that when ANDed with the mask yield the same result.  That
19175              * set has 2**n elements, and is representable by just two 8 bit
19176              * numbers: the result and the mask.  Importantly, matching the set
19177              * can be vectorized by creating a word full of the result bytes,
19178              * and a word full of the mask bytes, yielding a significant speed
19179              * up.  Here, see if this node matches such a set.  As a concrete
19180              * example consider [01], and the byte representing '0' which is
19181              * 0x30 on ASCII machines.  It has the bits 0011 0000.  Take the
19182              * mask 1111 1110.  If we AND 0x31 and 0x30 with that mask we get
19183              * 0x30.  Any other bytes ANDed yield something else.  So [01],
19184              * which is a common usage, is optimizable into ANYOFM, and can
19185              * benefit from the speed up.  We can only do this on UTF-8
19186              * invariant bytes, because they have the same bit patterns under
19187              * UTF-8 as not. */
19188             PERL_UINT_FAST8_T inverted = 0;
19189 #ifdef EBCDIC
19190             const PERL_UINT_FAST8_T max_permissible = 0xFF;
19191 #else
19192             const PERL_UINT_FAST8_T max_permissible = 0x7F;
19193 #endif
19194             /* If doesn't fit the criteria for ANYOFM, invert and try again.
19195              * If that works we will instead later generate an NANYOFM, and
19196              * invert back when through */
19197             if (invlist_highest(cp_list) > max_permissible) {
19198                 _invlist_invert(cp_list);
19199                 inverted = 1;
19200             }
19201
19202             if (invlist_highest(cp_list) <= max_permissible) {
19203                 UV this_start, this_end;
19204                 UV lowest_cp = UV_MAX;  /* init'ed to suppress compiler warn */
19205                 U8 bits_differing = 0;
19206                 Size_t full_cp_count = 0;
19207                 bool first_time = TRUE;
19208
19209                 /* Go through the bytes and find the bit positions that differ
19210                  * */
19211                 invlist_iterinit(cp_list);
19212                 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19213                     unsigned int i = this_start;
19214
19215                     if (first_time) {
19216                         if (! UVCHR_IS_INVARIANT(i)) {
19217                             goto done_anyofm;
19218                         }
19219
19220                         first_time = FALSE;
19221                         lowest_cp = this_start;
19222
19223                         /* We have set up the code point to compare with.
19224                          * Don't compare it with itself */
19225                         i++;
19226                     }
19227
19228                     /* Find the bit positions that differ from the lowest code
19229                      * point in the node.  Keep track of all such positions by
19230                      * OR'ing */
19231                     for (; i <= this_end; i++) {
19232                         if (! UVCHR_IS_INVARIANT(i)) {
19233                             goto done_anyofm;
19234                         }
19235
19236                         bits_differing  |= i ^ lowest_cp;
19237                     }
19238
19239                     full_cp_count += this_end - this_start + 1;
19240                 }
19241
19242                 /* At the end of the loop, we count how many bits differ from
19243                  * the bits in lowest code point, call the count 'd'.  If the
19244                  * set we found contains 2**d elements, it is the closure of
19245                  * all code points that differ only in those bit positions.  To
19246                  * convince yourself of that, first note that the number in the
19247                  * closure must be a power of 2, which we test for.  The only
19248                  * way we could have that count and it be some differing set,
19249                  * is if we got some code points that don't differ from the
19250                  * lowest code point in any position, but do differ from each
19251                  * other in some other position.  That means one code point has
19252                  * a 1 in that position, and another has a 0.  But that would
19253                  * mean that one of them differs from the lowest code point in
19254                  * that position, which possibility we've already excluded.  */
19255                 if (  (inverted || full_cp_count > 1)
19256                     && full_cp_count == 1U << PL_bitcount[bits_differing])
19257                 {
19258                     U8 ANYOFM_mask;
19259
19260                     op = ANYOFM + inverted;;
19261
19262                     /* We need to make the bits that differ be 0's */
19263                     ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19264
19265                     /* The argument is the lowest code point */
19266                     ret = reganode(pRExC_state, op, lowest_cp);
19267                     FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19268                 }
19269
19270               done_anyofm:
19271                 invlist_iterfinish(cp_list);
19272             }
19273
19274             if (inverted) {
19275                 _invlist_invert(cp_list);
19276             }
19277
19278             if (op != END) {
19279                 goto not_anyof;
19280             }
19281
19282             /* XXX We could create an ANYOFR_LOW node here if we saved above if
19283              * all were invariants, it wasn't inverted, and there is a single
19284              * range.  This would be faster than some of the posix nodes we
19285              * create below like /\d/a, but would be twice the size.  Without
19286              * having actually measured the gain, khw doesn't think the
19287              * tradeoff is really worth it */
19288         }
19289
19290         if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19291             PERL_UINT_FAST8_T type;
19292             SV * intersection = NULL;
19293             SV* d_invlist = NULL;
19294
19295             /* See if this matches any of the POSIX classes.  The POSIXA and
19296              * POSIXD ones are about the same speed as ANYOF ops, but take less
19297              * room; the ones that have above-Latin1 code point matches are
19298              * somewhat faster than ANYOF.  */
19299
19300             for (type = POSIXA; type >= POSIXD; type--) {
19301                 int posix_class;
19302
19303                 if (type == POSIXL) {   /* But not /l posix classes */
19304                     continue;
19305                 }
19306
19307                 for (posix_class = 0;
19308                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19309                      posix_class++)
19310                 {
19311                     SV** our_code_points = &cp_list;
19312                     SV** official_code_points;
19313                     int try_inverted;
19314
19315                     if (type == POSIXA) {
19316                         official_code_points = &PL_Posix_ptrs[posix_class];
19317                     }
19318                     else {
19319                         official_code_points = &PL_XPosix_ptrs[posix_class];
19320                     }
19321
19322                     /* Skip non-existent classes of this type.  e.g. \v only
19323                      * has an entry in PL_XPosix_ptrs */
19324                     if (! *official_code_points) {
19325                         continue;
19326                     }
19327
19328                     /* Try both the regular class, and its inversion */
19329                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19330                         bool this_inverted = invert ^ try_inverted;
19331
19332                         if (type != POSIXD) {
19333
19334                             /* This class that isn't /d can't match if we have
19335                              * /d dependencies */
19336                             if (has_runtime_dependency
19337                                                     & HAS_D_RUNTIME_DEPENDENCY)
19338                             {
19339                                 continue;
19340                             }
19341                         }
19342                         else /* is /d */ if (! this_inverted) {
19343
19344                             /* /d classes don't match anything non-ASCII below
19345                              * 256 unconditionally (which cp_list contains) */
19346                             _invlist_intersection(cp_list, PL_UpperLatin1,
19347                                                            &intersection);
19348                             if (_invlist_len(intersection) != 0) {
19349                                 continue;
19350                             }
19351
19352                             SvREFCNT_dec(d_invlist);
19353                             d_invlist = invlist_clone(cp_list, NULL);
19354
19355                             /* But under UTF-8 it turns into using /u rules.
19356                              * Add the things it matches under these conditions
19357                              * so that we check below that these are identical
19358                              * to what the tested class should match */
19359                             if (upper_latin1_only_utf8_matches) {
19360                                 _invlist_union(
19361                                             d_invlist,
19362                                             upper_latin1_only_utf8_matches,
19363                                             &d_invlist);
19364                             }
19365                             our_code_points = &d_invlist;
19366                         }
19367                         else {  /* POSIXD, inverted.  If this doesn't have this
19368                                    flag set, it isn't /d. */
19369                             if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19370                             {
19371                                 continue;
19372                             }
19373                             our_code_points = &cp_list;
19374                         }
19375
19376                         /* Here, have weeded out some things.  We want to see
19377                          * if the list of characters this node contains
19378                          * ('*our_code_points') precisely matches those of the
19379                          * class we are currently checking against
19380                          * ('*official_code_points'). */
19381                         if (_invlistEQ(*our_code_points,
19382                                        *official_code_points,
19383                                        try_inverted))
19384                         {
19385                             /* Here, they precisely match.  Optimize this ANYOF
19386                              * node into its equivalent POSIX one of the
19387                              * correct type, possibly inverted */
19388                             ret = reg_node(pRExC_state, (try_inverted)
19389                                                         ? type + NPOSIXA
19390                                                                 - POSIXA
19391                                                         : type);
19392                             FLAGS(REGNODE_p(ret)) = posix_class;
19393                             SvREFCNT_dec(d_invlist);
19394                             SvREFCNT_dec(intersection);
19395                             goto not_anyof;
19396                         }
19397                     }
19398                 }
19399             }
19400             SvREFCNT_dec(d_invlist);
19401             SvREFCNT_dec(intersection);
19402         }
19403
19404         /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19405          * both in size and speed.  Currently, a 20 bit range base (smallest
19406          * code point in the range), and a 12 bit maximum delta are packed into
19407          * a 32 bit word.  This allows for using it on all of the Unicode code
19408          * points except for the highest plane, which is only for private use
19409          * code points.  khw doubts that a bigger delta is likely in real world
19410          * applications */
19411         if (     single_range
19412             && ! has_runtime_dependency
19413             &&   anyof_flags == 0
19414             &&   start[0] < (1 << ANYOFR_BASE_BITS)
19415             &&   end[0] - start[0]
19416                     < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19417                                    * CHARBITS - ANYOFR_BASE_BITS))))
19418
19419         {
19420             U8 low_utf8[UTF8_MAXBYTES+1];
19421             U8 high_utf8[UTF8_MAXBYTES+1];
19422
19423             ret = reganode(pRExC_state, ANYOFR,
19424                         (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19425
19426             /* Place the lowest UTF-8 start byte in the flags field, so as to
19427              * allow efficient ruling out at run time of many possible inputs.
19428              * */
19429             (void) uvchr_to_utf8(low_utf8, start[0]);
19430             (void) uvchr_to_utf8(high_utf8, end[0]);
19431
19432             /* If all code points share the same first byte, this can be an
19433              * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
19434              * quickly rule out many inputs at run-time without having to
19435              * compute the code point from UTF-8.  For EBCDIC, we use I8, as
19436              * not doing that transformation would not rule out nearly so many
19437              * things */
19438             if (low_utf8[0] == high_utf8[0]) {
19439                 OP(REGNODE_p(ret)) = ANYOFRb;
19440                 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19441             }
19442             else {
19443                 ANYOF_FLAGS(REGNODE_p(ret))
19444                                     = NATIVE_UTF8_TO_I8(low_utf8[0]);
19445             }
19446
19447             goto not_anyof;
19448         }
19449
19450         /* If didn't find an optimization and there is no need for a bitmap,
19451          * optimize to indicate that */
19452         if (     start[0] >= NUM_ANYOF_CODE_POINTS
19453             && ! LOC
19454             && ! upper_latin1_only_utf8_matches
19455             &&   anyof_flags == 0)
19456         {
19457             U8 low_utf8[UTF8_MAXBYTES+1];
19458             UV highest_cp = invlist_highest(cp_list);
19459
19460             /* Currently the maximum allowed code point by the system is
19461              * IV_MAX.  Higher ones are reserved for future internal use.  This
19462              * particular regnode can be used for higher ones, but we can't
19463              * calculate the code point of those.  IV_MAX suffices though, as
19464              * it will be a large first byte */
19465             Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19466                            - low_utf8;
19467
19468             /* We store the lowest possible first byte of the UTF-8
19469              * representation, using the flags field.  This allows for quick
19470              * ruling out of some inputs without having to convert from UTF-8
19471              * to code point.  For EBCDIC, we use I8, as not doing that
19472              * transformation would not rule out nearly so many things */
19473             anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19474
19475             op = ANYOFH;
19476
19477             /* If the first UTF-8 start byte for the highest code point in the
19478              * range is suitably small, we may be able to get an upper bound as
19479              * well */
19480             if (highest_cp <= IV_MAX) {
19481                 U8 high_utf8[UTF8_MAXBYTES+1];
19482                 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19483                                 - high_utf8;
19484
19485                 /* If the lowest and highest are the same, we can get an exact
19486                  * first byte instead of a just minimum or even a sequence of
19487                  * exact leading bytes.  We signal these with different
19488                  * regnodes */
19489                 if (low_utf8[0] == high_utf8[0]) {
19490                     Size_t len = find_first_differing_byte_pos(low_utf8,
19491                                                                high_utf8,
19492                                                        MIN(low_len, high_len));
19493
19494                     if (len == 1) {
19495
19496                         /* No need to convert to I8 for EBCDIC as this is an
19497                          * exact match */
19498                         anyof_flags = low_utf8[0];
19499                         op = ANYOFHb;
19500                     }
19501                     else {
19502                         op = ANYOFHs;
19503                         ret = regnode_guts(pRExC_state, op,
19504                                            regarglen[op] + STR_SZ(len),
19505                                            "anyofhs");
19506                         FILL_NODE(ret, op);
19507                         ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
19508                                                                         = len;
19509                         Copy(low_utf8,  /* Add the common bytes */
19510                            ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19511                            len, U8);
19512                         RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
19513                         set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19514                                                   NULL, only_utf8_locale_list);
19515                         goto not_anyof;
19516                     }
19517                 }
19518                 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19519                 {
19520
19521                     /* Here, the high byte is not the same as the low, but is
19522                      * small enough that its reasonable to have a loose upper
19523                      * bound, which is packed in with the strict lower bound.
19524                      * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19525                      * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
19526                      * is the same thing as UTF-8 */
19527
19528                     U8 bits = 0;
19529                     U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19530                     U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19531                                   - anyof_flags;
19532
19533                     if (range_diff <= max_range_diff / 8) {
19534                         bits = 3;
19535                     }
19536                     else if (range_diff <= max_range_diff / 4) {
19537                         bits = 2;
19538                     }
19539                     else if (range_diff <= max_range_diff / 2) {
19540                         bits = 1;
19541                     }
19542                     anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19543                     op = ANYOFHr;
19544                 }
19545             }
19546
19547             goto done_finding_op;
19548         }
19549     }   /* End of seeing if can optimize it into a different node */
19550
19551   is_anyof: /* It's going to be an ANYOF node. */
19552     op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19553          ? ANYOFD
19554          : ((posixl)
19555             ? ANYOFPOSIXL
19556             : ((LOC)
19557                ? ANYOFL
19558                : ANYOF));
19559
19560   done_finding_op:
19561
19562     ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19563     FILL_NODE(ret, op);        /* We set the argument later */
19564     RExC_emit += 1 + regarglen[op];
19565     ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19566
19567     /* Here, <cp_list> contains all the code points we can determine at
19568      * compile time that match under all conditions.  Go through it, and
19569      * for things that belong in the bitmap, put them there, and delete from
19570      * <cp_list>.  While we are at it, see if everything above 255 is in the
19571      * list, and if so, set a flag to speed up execution */
19572
19573     populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19574
19575     if (posixl) {
19576         ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19577     }
19578
19579     if (invert) {
19580         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19581     }
19582
19583     /* Here, the bitmap has been populated with all the Latin1 code points that
19584      * always match.  Can now add to the overall list those that match only
19585      * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19586      * */
19587     if (upper_latin1_only_utf8_matches) {
19588         if (cp_list) {
19589             _invlist_union(cp_list,
19590                            upper_latin1_only_utf8_matches,
19591                            &cp_list);
19592             SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19593         }
19594         else {
19595             cp_list = upper_latin1_only_utf8_matches;
19596         }
19597         ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19598     }
19599
19600     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19601                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19602                    ? listsv
19603                    : NULL,
19604                   only_utf8_locale_list);
19605     SvREFCNT_dec(cp_list);;
19606     SvREFCNT_dec(only_utf8_locale_list);
19607     return ret;
19608
19609   not_anyof:
19610
19611     /* Here, the node is getting optimized into something that's not an ANYOF
19612      * one.  Finish up. */
19613
19614     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19615                                            RExC_parse - orig_parse);;
19616     SvREFCNT_dec(cp_list);;
19617     SvREFCNT_dec(only_utf8_locale_list);
19618     return ret;
19619 }
19620
19621 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19622
19623 STATIC void
19624 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19625                 regnode* const node,
19626                 SV* const cp_list,
19627                 SV* const runtime_defns,
19628                 SV* const only_utf8_locale_list)
19629 {
19630     /* Sets the arg field of an ANYOF-type node 'node', using information about
19631      * the node passed-in.  If there is nothing outside the node's bitmap, the
19632      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
19633      * the count returned by add_data(), having allocated and stored an array,
19634      * av, as follows:
19635      *
19636      *  av[0] stores the inversion list defining this class as far as known at
19637      *        this time, or PL_sv_undef if nothing definite is now known.
19638      *  av[1] stores the inversion list of code points that match only if the
19639      *        current locale is UTF-8, or if none, PL_sv_undef if there is an
19640      *        av[2], or no entry otherwise.
19641      *  av[2] stores the list of user-defined properties whose subroutine
19642      *        definitions aren't known at this time, or no entry if none. */
19643
19644     UV n;
19645
19646     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19647
19648     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19649         assert(! (ANYOF_FLAGS(node)
19650                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19651         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19652     }
19653     else {
19654         AV * const av = newAV();
19655         SV *rv;
19656
19657         if (cp_list) {
19658             av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
19659         }
19660
19661         if (only_utf8_locale_list) {
19662             av_store(av, ONLY_LOCALE_MATCHES_INDEX,
19663                                      SvREFCNT_inc_NN(only_utf8_locale_list));
19664         }
19665
19666         if (runtime_defns) {
19667             av_store(av, DEFERRED_USER_DEFINED_INDEX,
19668                          SvREFCNT_inc_NN(runtime_defns));
19669         }
19670
19671         rv = newRV_noinc(MUTABLE_SV(av));
19672         n = add_data(pRExC_state, STR_WITH_LEN("s"));
19673         RExC_rxi->data->data[n] = (void*)rv;
19674         ARG_SET(node, n);
19675     }
19676 }
19677
19678 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
19679 SV *
19680 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
19681                                         const regnode* node,
19682                                         bool doinit,
19683                                         SV** listsvp,
19684                                         SV** only_utf8_locale_ptr,
19685                                         SV** output_invlist)
19686
19687 {
19688     /* For internal core use only.
19689      * Returns the inversion list for the input 'node' in the regex 'prog'.
19690      * If <doinit> is 'true', will attempt to create the inversion list if not
19691      *    already done.
19692      * If <listsvp> is non-null, will return the printable contents of the
19693      *    property definition.  This can be used to get debugging information
19694      *    even before the inversion list exists, by calling this function with
19695      *    'doinit' set to false, in which case the components that will be used
19696      *    to eventually create the inversion list are returned  (in a printable
19697      *    form).
19698      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
19699      *    store an inversion list of code points that should match only if the
19700      *    execution-time locale is a UTF-8 one.
19701      * If <output_invlist> is not NULL, it is where this routine is to store an
19702      *    inversion list of the code points that would be instead returned in
19703      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
19704      *    when this parameter is used, is just the non-code point data that
19705      *    will go into creating the inversion list.  This currently should be just
19706      *    user-defined properties whose definitions were not known at compile
19707      *    time.  Using this parameter allows for easier manipulation of the
19708      *    inversion list's data by the caller.  It is illegal to call this
19709      *    function with this parameter set, but not <listsvp>
19710      *
19711      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
19712      * that, in spite of this function's name, the inversion list it returns
19713      * may include the bitmap data as well */
19714
19715     SV *si  = NULL;         /* Input initialization string */
19716     SV* invlist = NULL;
19717
19718     RXi_GET_DECL(prog, progi);
19719     const struct reg_data * const data = prog ? progi->data : NULL;
19720
19721     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
19722     assert(! output_invlist || listsvp);
19723
19724     if (data && data->count) {
19725         const U32 n = ARG(node);
19726
19727         if (data->what[n] == 's') {
19728             SV * const rv = MUTABLE_SV(data->data[n]);
19729             AV * const av = MUTABLE_AV(SvRV(rv));
19730             SV **const ary = AvARRAY(av);
19731
19732             invlist = ary[INVLIST_INDEX];
19733
19734             if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
19735                 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
19736             }
19737
19738             if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
19739                 si = ary[DEFERRED_USER_DEFINED_INDEX];
19740             }
19741
19742             if (doinit && (si || invlist)) {
19743                 if (si) {
19744                     bool user_defined;
19745                     SV * msg = newSVpvs_flags("", SVs_TEMP);
19746
19747                     SV * prop_definition = handle_user_defined_property(
19748                             "", 0, FALSE,   /* There is no \p{}, \P{} */
19749                             SvPVX_const(si)[1] - '0',   /* /i or not has been
19750                                                            stored here for just
19751                                                            this occasion */
19752                             TRUE,           /* run time */
19753                             FALSE,          /* This call must find the defn */
19754                             si,             /* The property definition  */
19755                             &user_defined,
19756                             msg,
19757                             0               /* base level call */
19758                            );
19759
19760                     if (SvCUR(msg)) {
19761                         assert(prop_definition == NULL);
19762
19763                         Perl_croak(aTHX_ "%" UTF8f,
19764                                 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
19765                     }
19766
19767                     if (invlist) {
19768                         _invlist_union(invlist, prop_definition, &invlist);
19769                         SvREFCNT_dec_NN(prop_definition);
19770                     }
19771                     else {
19772                         invlist = prop_definition;
19773                     }
19774
19775                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
19776                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
19777
19778                     ary[INVLIST_INDEX] = invlist;
19779                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
19780                                  ? ONLY_LOCALE_MATCHES_INDEX
19781                                  : INVLIST_INDEX);
19782                     si = NULL;
19783                 }
19784             }
19785         }
19786     }
19787
19788     /* If requested, return a printable version of what this ANYOF node matches
19789      * */
19790     if (listsvp) {
19791         SV* matches_string = NULL;
19792
19793         /* This function can be called at compile-time, before everything gets
19794          * resolved, in which case we return the currently best available
19795          * information, which is the string that will eventually be used to do
19796          * that resolving, 'si' */
19797         if (si) {
19798             /* Here, we only have 'si' (and possibly some passed-in data in
19799              * 'invlist', which is handled below)  If the caller only wants
19800              * 'si', use that.  */
19801             if (! output_invlist) {
19802                 matches_string = newSVsv(si);
19803             }
19804             else {
19805                 /* But if the caller wants an inversion list of the node, we
19806                  * need to parse 'si' and place as much as possible in the
19807                  * desired output inversion list, making 'matches_string' only
19808                  * contain the currently unresolvable things */
19809                 const char *si_string = SvPVX(si);
19810                 STRLEN remaining = SvCUR(si);
19811                 UV prev_cp = 0;
19812                 U8 count = 0;
19813
19814                 /* Ignore everything before and including the first new-line */
19815                 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
19816                 assert (si_string != NULL);
19817                 si_string++;
19818                 remaining = SvPVX(si) + SvCUR(si) - si_string;
19819
19820                 while (remaining > 0) {
19821
19822                     /* The data consists of just strings defining user-defined
19823                      * property names, but in prior incarnations, and perhaps
19824                      * somehow from pluggable regex engines, it could still
19825                      * hold hex code point definitions.  Each component of a
19826                      * range would be separated by a tab, and each range by a
19827                      * new-line.  If these are found, instead add them to the
19828                      * inversion list */
19829                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
19830                                      |PERL_SCAN_SILENT_NON_PORTABLE;
19831                     STRLEN len = remaining;
19832                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
19833
19834                     /* If the hex decode routine found something, it should go
19835                      * up to the next \n */
19836                     if (   *(si_string + len) == '\n') {
19837                         if (count) {    /* 2nd code point on line */
19838                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
19839                         }
19840                         else {
19841                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
19842                         }
19843                         count = 0;
19844                         goto prepare_for_next_iteration;
19845                     }
19846
19847                     /* If the hex decode was instead for the lower range limit,
19848                      * save it, and go parse the upper range limit */
19849                     if (*(si_string + len) == '\t') {
19850                         assert(count == 0);
19851
19852                         prev_cp = cp;
19853                         count = 1;
19854                       prepare_for_next_iteration:
19855                         si_string += len + 1;
19856                         remaining -= len + 1;
19857                         continue;
19858                     }
19859
19860                     /* Here, didn't find a legal hex number.  Just add the text
19861                      * from here up to the next \n, omitting any trailing
19862                      * markers. */
19863
19864                     remaining -= len;
19865                     len = strcspn(si_string,
19866                                         DEFERRED_PROP_EXPANSION_MARKERs "\n");
19867                     remaining -= len;
19868                     if (matches_string) {
19869                         sv_catpvn(matches_string, si_string, len);
19870                     }
19871                     else {
19872                         matches_string = newSVpvn(si_string, len);
19873                     }
19874                     sv_catpvs(matches_string, " ");
19875
19876                     si_string += len;
19877                     if (   remaining
19878                         && UCHARAT(si_string)
19879                                             == DEFERRED_PROP_EXPANSION_MARKERc)
19880                     {
19881                         si_string++;
19882                         remaining--;
19883                     }
19884                     if (remaining && UCHARAT(si_string) == '\n') {
19885                         si_string++;
19886                         remaining--;
19887                     }
19888                 } /* end of loop through the text */
19889
19890                 assert(matches_string);
19891                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
19892                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
19893                 }
19894             } /* end of has an 'si' */
19895         }
19896
19897         /* Add the stuff that's already known */
19898         if (invlist) {
19899
19900             /* Again, if the caller doesn't want the output inversion list, put
19901              * everything in 'matches-string' */
19902             if (! output_invlist) {
19903                 if ( ! matches_string) {
19904                     matches_string = newSVpvs("\n");
19905                 }
19906                 sv_catsv(matches_string, invlist_contents(invlist,
19907                                                   TRUE /* traditional style */
19908                                                   ));
19909             }
19910             else if (! *output_invlist) {
19911                 *output_invlist = invlist_clone(invlist, NULL);
19912             }
19913             else {
19914                 _invlist_union(*output_invlist, invlist, output_invlist);
19915             }
19916         }
19917
19918         *listsvp = matches_string;
19919     }
19920
19921     return invlist;
19922 }
19923 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
19924
19925 /* reg_skipcomment()
19926
19927    Absorbs an /x style # comment from the input stream,
19928    returning a pointer to the first character beyond the comment, or if the
19929    comment terminates the pattern without anything following it, this returns
19930    one past the final character of the pattern (in other words, RExC_end) and
19931    sets the REG_RUN_ON_COMMENT_SEEN flag.
19932
19933    Note it's the callers responsibility to ensure that we are
19934    actually in /x mode
19935
19936 */
19937
19938 PERL_STATIC_INLINE char*
19939 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
19940 {
19941     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
19942
19943     assert(*p == '#');
19944
19945     while (p < RExC_end) {
19946         if (*(++p) == '\n') {
19947             return p+1;
19948         }
19949     }
19950
19951     /* we ran off the end of the pattern without ending the comment, so we have
19952      * to add an \n when wrapping */
19953     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
19954     return p;
19955 }
19956
19957 STATIC void
19958 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
19959                                 char ** p,
19960                                 const bool force_to_xmod
19961                          )
19962 {
19963     /* If the text at the current parse position '*p' is a '(?#...)' comment,
19964      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
19965      * is /x whitespace, advance '*p' so that on exit it points to the first
19966      * byte past all such white space and comments */
19967
19968     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
19969
19970     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
19971
19972     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
19973
19974     for (;;) {
19975         if (RExC_end - (*p) >= 3
19976             && *(*p)     == '('
19977             && *(*p + 1) == '?'
19978             && *(*p + 2) == '#')
19979         {
19980             while (*(*p) != ')') {
19981                 if ((*p) == RExC_end)
19982                     FAIL("Sequence (?#... not terminated");
19983                 (*p)++;
19984             }
19985             (*p)++;
19986             continue;
19987         }
19988
19989         if (use_xmod) {
19990             const char * save_p = *p;
19991             while ((*p) < RExC_end) {
19992                 STRLEN len;
19993                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
19994                     (*p) += len;
19995                 }
19996                 else if (*(*p) == '#') {
19997                     (*p) = reg_skipcomment(pRExC_state, (*p));
19998                 }
19999                 else {
20000                     break;
20001                 }
20002             }
20003             if (*p != save_p) {
20004                 continue;
20005             }
20006         }
20007
20008         break;
20009     }
20010
20011     return;
20012 }
20013
20014 /* nextchar()
20015
20016    Advances the parse position by one byte, unless that byte is the beginning
20017    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
20018    those two cases, the parse position is advanced beyond all such comments and
20019    white space.
20020
20021    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20022 */
20023
20024 STATIC void
20025 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20026 {
20027     PERL_ARGS_ASSERT_NEXTCHAR;
20028
20029     if (RExC_parse < RExC_end) {
20030         assert(   ! UTF
20031                || UTF8_IS_INVARIANT(*RExC_parse)
20032                || UTF8_IS_START(*RExC_parse));
20033
20034         RExC_parse += (UTF)
20035                       ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20036                       : 1;
20037
20038         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20039                                 FALSE /* Don't force /x */ );
20040     }
20041 }
20042
20043 STATIC void
20044 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20045 {
20046     /* 'size' is the delta number of smallest regnode equivalents to add or
20047      * subtract from the current memory allocated to the regex engine being
20048      * constructed. */
20049
20050     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20051
20052     RExC_size += size;
20053
20054     Renewc(RExC_rxi,
20055            sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20056                                                 /* +1 for REG_MAGIC */
20057            char,
20058            regexp_internal);
20059     if ( RExC_rxi == NULL )
20060         FAIL("Regexp out of space");
20061     RXi_SET(RExC_rx, RExC_rxi);
20062
20063     RExC_emit_start = RExC_rxi->program;
20064     if (size > 0) {
20065         Zero(REGNODE_p(RExC_emit), size, regnode);
20066     }
20067
20068 #ifdef RE_TRACK_PATTERN_OFFSETS
20069     Renew(RExC_offsets, 2*RExC_size+1, U32);
20070     if (size > 0) {
20071         Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20072     }
20073     RExC_offsets[0] = RExC_size;
20074 #endif
20075 }
20076
20077 STATIC regnode_offset
20078 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20079 {
20080     /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20081      * equivalents space.  It aligns and increments RExC_size
20082      *
20083      * It returns the regnode's offset into the regex engine program */
20084
20085     const regnode_offset ret = RExC_emit;
20086
20087     GET_RE_DEBUG_FLAGS_DECL;
20088
20089     PERL_ARGS_ASSERT_REGNODE_GUTS;
20090
20091     SIZE_ALIGN(RExC_size);
20092     change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20093     NODE_ALIGN_FILL(REGNODE_p(ret));
20094 #ifndef RE_TRACK_PATTERN_OFFSETS
20095     PERL_UNUSED_ARG(name);
20096     PERL_UNUSED_ARG(op);
20097 #else
20098     assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20099
20100     if (RExC_offsets) {         /* MJD */
20101         MJD_OFFSET_DEBUG(
20102               ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20103               name, __LINE__,
20104               PL_reg_name[op],
20105               (UV)(RExC_emit) > RExC_offsets[0]
20106                 ? "Overwriting end of array!\n" : "OK",
20107               (UV)(RExC_emit),
20108               (UV)(RExC_parse - RExC_start),
20109               (UV)RExC_offsets[0]));
20110         Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20111     }
20112 #endif
20113     return(ret);
20114 }
20115
20116 /*
20117 - reg_node - emit a node
20118 */
20119 STATIC regnode_offset /* Location. */
20120 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20121 {
20122     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20123     regnode_offset ptr = ret;
20124
20125     PERL_ARGS_ASSERT_REG_NODE;
20126
20127     assert(regarglen[op] == 0);
20128
20129     FILL_ADVANCE_NODE(ptr, op);
20130     RExC_emit = ptr;
20131     return(ret);
20132 }
20133
20134 /*
20135 - reganode - emit a node with an argument
20136 */
20137 STATIC regnode_offset /* Location. */
20138 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20139 {
20140     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20141     regnode_offset ptr = ret;
20142
20143     PERL_ARGS_ASSERT_REGANODE;
20144
20145     /* ANYOF are special cased to allow non-length 1 args */
20146     assert(regarglen[op] == 1);
20147
20148     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20149     RExC_emit = ptr;
20150     return(ret);
20151 }
20152
20153 STATIC regnode_offset
20154 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20155 {
20156     /* emit a node with U32 and I32 arguments */
20157
20158     const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20159     regnode_offset ptr = ret;
20160
20161     PERL_ARGS_ASSERT_REG2LANODE;
20162
20163     assert(regarglen[op] == 2);
20164
20165     FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20166     RExC_emit = ptr;
20167     return(ret);
20168 }
20169
20170 /*
20171 - reginsert - insert an operator in front of already-emitted operand
20172 *
20173 * That means that on exit 'operand' is the offset of the newly inserted
20174 * operator, and the original operand has been relocated.
20175 *
20176 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20177 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20178 *
20179 *   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20180 *   NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20181 *
20182 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20183 */
20184 STATIC void
20185 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20186                   const regnode_offset operand, const U32 depth)
20187 {
20188     regnode *src;
20189     regnode *dst;
20190     regnode *place;
20191     const int offset = regarglen[(U8)op];
20192     const int size = NODE_STEP_REGNODE + offset;
20193     GET_RE_DEBUG_FLAGS_DECL;
20194
20195     PERL_ARGS_ASSERT_REGINSERT;
20196     PERL_UNUSED_CONTEXT;
20197     PERL_UNUSED_ARG(depth);
20198 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20199     DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20200     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20201                                     studying. If this is wrong then we need to adjust RExC_recurse
20202                                     below like we do with RExC_open_parens/RExC_close_parens. */
20203     change_engine_size(pRExC_state, (Ptrdiff_t) size);
20204     src = REGNODE_p(RExC_emit);
20205     RExC_emit += size;
20206     dst = REGNODE_p(RExC_emit);
20207
20208     /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20209      * and [perl #133871] shows this can lead to problems, so skip this
20210      * realignment of parens until a later pass when they are reliable */
20211     if (! IN_PARENS_PASS && RExC_open_parens) {
20212         int paren;
20213         /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20214         /* remember that RExC_npar is rex->nparens + 1,
20215          * iow it is 1 more than the number of parens seen in
20216          * the pattern so far. */
20217         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20218             /* note, RExC_open_parens[0] is the start of the
20219              * regex, it can't move. RExC_close_parens[0] is the end
20220              * of the regex, it *can* move. */
20221             if ( paren && RExC_open_parens[paren] >= operand ) {
20222                 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20223                 RExC_open_parens[paren] += size;
20224             } else {
20225                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20226             }
20227             if ( RExC_close_parens[paren] >= operand ) {
20228                 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20229                 RExC_close_parens[paren] += size;
20230             } else {
20231                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20232             }
20233         }
20234     }
20235     if (RExC_end_op)
20236         RExC_end_op += size;
20237
20238     while (src > REGNODE_p(operand)) {
20239         StructCopy(--src, --dst, regnode);
20240 #ifdef RE_TRACK_PATTERN_OFFSETS
20241         if (RExC_offsets) {     /* MJD 20010112 */
20242             MJD_OFFSET_DEBUG(
20243                  ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20244                   "reginsert",
20245                   __LINE__,
20246                   PL_reg_name[op],
20247                   (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20248                     ? "Overwriting end of array!\n" : "OK",
20249                   (UV)REGNODE_OFFSET(src),
20250                   (UV)REGNODE_OFFSET(dst),
20251                   (UV)RExC_offsets[0]));
20252             Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20253             Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20254         }
20255 #endif
20256     }
20257
20258     place = REGNODE_p(operand); /* Op node, where operand used to be. */
20259 #ifdef RE_TRACK_PATTERN_OFFSETS
20260     if (RExC_offsets) {         /* MJD */
20261         MJD_OFFSET_DEBUG(
20262               ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20263               "reginsert",
20264               __LINE__,
20265               PL_reg_name[op],
20266               (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20267               ? "Overwriting end of array!\n" : "OK",
20268               (UV)REGNODE_OFFSET(place),
20269               (UV)(RExC_parse - RExC_start),
20270               (UV)RExC_offsets[0]));
20271         Set_Node_Offset(place, RExC_parse);
20272         Set_Node_Length(place, 1);
20273     }
20274 #endif
20275     src = NEXTOPER(place);
20276     FLAGS(place) = 0;
20277     FILL_NODE(operand, op);
20278
20279     /* Zero out any arguments in the new node */
20280     Zero(src, offset, regnode);
20281 }
20282
20283 /*
20284 - regtail - set the next-pointer at the end of a node chain of p to val.  If
20285             that value won't fit in the space available, instead returns FALSE.
20286             (Except asserts if we can't fit in the largest space the regex
20287             engine is designed for.)
20288 - SEE ALSO: regtail_study
20289 */
20290 STATIC bool
20291 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20292                 const regnode_offset p,
20293                 const regnode_offset val,
20294                 const U32 depth)
20295 {
20296     regnode_offset scan;
20297     GET_RE_DEBUG_FLAGS_DECL;
20298
20299     PERL_ARGS_ASSERT_REGTAIL;
20300 #ifndef DEBUGGING
20301     PERL_UNUSED_ARG(depth);
20302 #endif
20303
20304     /* Find last node. */
20305     scan = (regnode_offset) p;
20306     for (;;) {
20307         regnode * const temp = regnext(REGNODE_p(scan));
20308         DEBUG_PARSE_r({
20309             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20310             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20311             Perl_re_printf( aTHX_  "~ %s (%d) %s %s\n",
20312                 SvPV_nolen_const(RExC_mysv), scan,
20313                     (temp == NULL ? "->" : ""),
20314                     (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20315             );
20316         });
20317         if (temp == NULL)
20318             break;
20319         scan = REGNODE_OFFSET(temp);
20320     }
20321
20322     assert(val >= scan);
20323     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20324         assert((UV) (val - scan) <= U32_MAX);
20325         ARG_SET(REGNODE_p(scan), val - scan);
20326     }
20327     else {
20328         if (val - scan > U16_MAX) {
20329             /* Populate this with something that won't loop and will likely
20330              * lead to a crash if the caller ignores the failure return, and
20331              * execution continues */
20332             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20333             return FALSE;
20334         }
20335         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20336     }
20337
20338     return TRUE;
20339 }
20340
20341 #ifdef DEBUGGING
20342 /*
20343 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20344 - Look for optimizable sequences at the same time.
20345 - currently only looks for EXACT chains.
20346
20347 This is experimental code. The idea is to use this routine to perform
20348 in place optimizations on branches and groups as they are constructed,
20349 with the long term intention of removing optimization from study_chunk so
20350 that it is purely analytical.
20351
20352 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20353 to control which is which.
20354
20355 This used to return a value that was ignored.  It was a problem that it is
20356 #ifdef'd to be another function that didn't return a value.  khw has changed it
20357 so both currently return a pass/fail return.
20358
20359 */
20360 /* TODO: All four parms should be const */
20361
20362 STATIC bool
20363 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20364                       const regnode_offset val, U32 depth)
20365 {
20366     regnode_offset scan;
20367     U8 exact = PSEUDO;
20368 #ifdef EXPERIMENTAL_INPLACESCAN
20369     I32 min = 0;
20370 #endif
20371     GET_RE_DEBUG_FLAGS_DECL;
20372
20373     PERL_ARGS_ASSERT_REGTAIL_STUDY;
20374
20375
20376     /* Find last node. */
20377
20378     scan = p;
20379     for (;;) {
20380         regnode * const temp = regnext(REGNODE_p(scan));
20381 #ifdef EXPERIMENTAL_INPLACESCAN
20382         if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20383             bool unfolded_multi_char;   /* Unexamined in this routine */
20384             if (join_exact(pRExC_state, scan, &min,
20385                            &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20386                 return TRUE; /* Was return EXACT */
20387         }
20388 #endif
20389         if ( exact ) {
20390             switch (OP(REGNODE_p(scan))) {
20391                 case LEXACT:
20392                 case EXACT:
20393                 case LEXACT_REQ8:
20394                 case EXACT_REQ8:
20395                 case EXACTL:
20396                 case EXACTF:
20397                 case EXACTFU_S_EDGE:
20398                 case EXACTFAA_NO_TRIE:
20399                 case EXACTFAA:
20400                 case EXACTFU:
20401                 case EXACTFU_REQ8:
20402                 case EXACTFLU8:
20403                 case EXACTFUP:
20404                 case EXACTFL:
20405                         if( exact == PSEUDO )
20406                             exact= OP(REGNODE_p(scan));
20407                         else if ( exact != OP(REGNODE_p(scan)) )
20408                             exact= 0;
20409                 case NOTHING:
20410                     break;
20411                 default:
20412                     exact= 0;
20413             }
20414         }
20415         DEBUG_PARSE_r({
20416             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20417             regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20418             Perl_re_printf( aTHX_  "~ %s (%d) -> %s\n",
20419                 SvPV_nolen_const(RExC_mysv),
20420                 scan,
20421                 PL_reg_name[exact]);
20422         });
20423         if (temp == NULL)
20424             break;
20425         scan = REGNODE_OFFSET(temp);
20426     }
20427     DEBUG_PARSE_r({
20428         DEBUG_PARSE_MSG("");
20429         regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20430         Perl_re_printf( aTHX_
20431                       "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20432                       SvPV_nolen_const(RExC_mysv),
20433                       (IV)val,
20434                       (IV)(val - scan)
20435         );
20436     });
20437     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20438         assert((UV) (val - scan) <= U32_MAX);
20439         ARG_SET(REGNODE_p(scan), val - scan);
20440     }
20441     else {
20442         if (val - scan > U16_MAX) {
20443             /* Populate this with something that won't loop and will likely
20444              * lead to a crash if the caller ignores the failure return, and
20445              * execution continues */
20446             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20447             return FALSE;
20448         }
20449         NEXT_OFF(REGNODE_p(scan)) = val - scan;
20450     }
20451
20452     return TRUE; /* Was 'return exact' */
20453 }
20454 #endif
20455
20456 STATIC SV*
20457 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20458
20459     /* Returns an inversion list of all the code points matched by the
20460      * ANYOFM/NANYOFM node 'n' */
20461
20462     SV * cp_list = _new_invlist(-1);
20463     const U8 lowest = (U8) ARG(n);
20464     unsigned int i;
20465     U8 count = 0;
20466     U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20467
20468     PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20469
20470     /* Starting with the lowest code point, any code point that ANDed with the
20471      * mask yields the lowest code point is in the set */
20472     for (i = lowest; i <= 0xFF; i++) {
20473         if ((i & FLAGS(n)) == ARG(n)) {
20474             cp_list = add_cp_to_invlist(cp_list, i);
20475             count++;
20476
20477             /* We know how many code points (a power of two) that are in the
20478              * set.  No use looking once we've got that number */
20479             if (count >= needed) break;
20480         }
20481     }
20482
20483     if (OP(n) == NANYOFM) {
20484         _invlist_invert(cp_list);
20485     }
20486     return cp_list;
20487 }
20488
20489 /*
20490  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20491  */
20492 #ifdef DEBUGGING
20493
20494 static void
20495 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20496 {
20497     int bit;
20498     int set=0;
20499
20500     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20501
20502     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20503         if (flags & (1<<bit)) {
20504             if (!set++ && lead)
20505                 Perl_re_printf( aTHX_  "%s", lead);
20506             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
20507         }
20508     }
20509     if (lead)  {
20510         if (set)
20511             Perl_re_printf( aTHX_  "\n");
20512         else
20513             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20514     }
20515 }
20516
20517 static void
20518 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20519 {
20520     int bit;
20521     int set=0;
20522     regex_charset cs;
20523
20524     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20525
20526     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20527         if (flags & (1<<bit)) {
20528             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
20529                 continue;
20530             }
20531             if (!set++ && lead)
20532                 Perl_re_printf( aTHX_  "%s", lead);
20533             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
20534         }
20535     }
20536     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20537             if (!set++ && lead) {
20538                 Perl_re_printf( aTHX_  "%s", lead);
20539             }
20540             switch (cs) {
20541                 case REGEX_UNICODE_CHARSET:
20542                     Perl_re_printf( aTHX_  "UNICODE");
20543                     break;
20544                 case REGEX_LOCALE_CHARSET:
20545                     Perl_re_printf( aTHX_  "LOCALE");
20546                     break;
20547                 case REGEX_ASCII_RESTRICTED_CHARSET:
20548                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
20549                     break;
20550                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20551                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
20552                     break;
20553                 default:
20554                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
20555                     break;
20556             }
20557     }
20558     if (lead)  {
20559         if (set)
20560             Perl_re_printf( aTHX_  "\n");
20561         else
20562             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
20563     }
20564 }
20565 #endif
20566
20567 void
20568 Perl_regdump(pTHX_ const regexp *r)
20569 {
20570 #ifdef DEBUGGING
20571     int i;
20572     SV * const sv = sv_newmortal();
20573     SV *dsv= sv_newmortal();
20574     RXi_GET_DECL(r, ri);
20575     GET_RE_DEBUG_FLAGS_DECL;
20576
20577     PERL_ARGS_ASSERT_REGDUMP;
20578
20579     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20580
20581     /* Header fields of interest. */
20582     for (i = 0; i < 2; i++) {
20583         if (r->substrs->data[i].substr) {
20584             RE_PV_QUOTED_DECL(s, 0, dsv,
20585                             SvPVX_const(r->substrs->data[i].substr),
20586                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
20587                             PL_dump_re_max_len);
20588             Perl_re_printf( aTHX_
20589                           "%s %s%s at %" IVdf "..%" UVuf " ",
20590                           i ? "floating" : "anchored",
20591                           s,
20592                           RE_SV_TAIL(r->substrs->data[i].substr),
20593                           (IV)r->substrs->data[i].min_offset,
20594                           (UV)r->substrs->data[i].max_offset);
20595         }
20596         else if (r->substrs->data[i].utf8_substr) {
20597             RE_PV_QUOTED_DECL(s, 1, dsv,
20598                             SvPVX_const(r->substrs->data[i].utf8_substr),
20599                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20600                             30);
20601             Perl_re_printf( aTHX_
20602                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20603                           i ? "floating" : "anchored",
20604                           s,
20605                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20606                           (IV)r->substrs->data[i].min_offset,
20607                           (UV)r->substrs->data[i].max_offset);
20608         }
20609     }
20610
20611     if (r->check_substr || r->check_utf8)
20612         Perl_re_printf( aTHX_
20613                       (const char *)
20614                       (   r->check_substr == r->substrs->data[1].substr
20615                        && r->check_utf8   == r->substrs->data[1].utf8_substr
20616                        ? "(checking floating" : "(checking anchored"));
20617     if (r->intflags & PREGf_NOSCAN)
20618         Perl_re_printf( aTHX_  " noscan");
20619     if (r->extflags & RXf_CHECK_ALL)
20620         Perl_re_printf( aTHX_  " isall");
20621     if (r->check_substr || r->check_utf8)
20622         Perl_re_printf( aTHX_  ") ");
20623
20624     if (ri->regstclass) {
20625         regprop(r, sv, ri->regstclass, NULL, NULL);
20626         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
20627     }
20628     if (r->intflags & PREGf_ANCH) {
20629         Perl_re_printf( aTHX_  "anchored");
20630         if (r->intflags & PREGf_ANCH_MBOL)
20631             Perl_re_printf( aTHX_  "(MBOL)");
20632         if (r->intflags & PREGf_ANCH_SBOL)
20633             Perl_re_printf( aTHX_  "(SBOL)");
20634         if (r->intflags & PREGf_ANCH_GPOS)
20635             Perl_re_printf( aTHX_  "(GPOS)");
20636         Perl_re_printf( aTHX_ " ");
20637     }
20638     if (r->intflags & PREGf_GPOS_SEEN)
20639         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
20640     if (r->intflags & PREGf_SKIP)
20641         Perl_re_printf( aTHX_  "plus ");
20642     if (r->intflags & PREGf_IMPLICIT)
20643         Perl_re_printf( aTHX_  "implicit ");
20644     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
20645     if (r->extflags & RXf_EVAL_SEEN)
20646         Perl_re_printf( aTHX_  "with eval ");
20647     Perl_re_printf( aTHX_  "\n");
20648     DEBUG_FLAGS_r({
20649         regdump_extflags("r->extflags: ", r->extflags);
20650         regdump_intflags("r->intflags: ", r->intflags);
20651     });
20652 #else
20653     PERL_ARGS_ASSERT_REGDUMP;
20654     PERL_UNUSED_CONTEXT;
20655     PERL_UNUSED_ARG(r);
20656 #endif  /* DEBUGGING */
20657 }
20658
20659 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
20660 #ifdef DEBUGGING
20661
20662 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
20663      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
20664      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
20665      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
20666      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
20667      || _CC_VERTSPACE != 15
20668 #   error Need to adjust order of anyofs[]
20669 #  endif
20670 static const char * const anyofs[] = {
20671     "\\w",
20672     "\\W",
20673     "\\d",
20674     "\\D",
20675     "[:alpha:]",
20676     "[:^alpha:]",
20677     "[:lower:]",
20678     "[:^lower:]",
20679     "[:upper:]",
20680     "[:^upper:]",
20681     "[:punct:]",
20682     "[:^punct:]",
20683     "[:print:]",
20684     "[:^print:]",
20685     "[:alnum:]",
20686     "[:^alnum:]",
20687     "[:graph:]",
20688     "[:^graph:]",
20689     "[:cased:]",
20690     "[:^cased:]",
20691     "\\s",
20692     "\\S",
20693     "[:blank:]",
20694     "[:^blank:]",
20695     "[:xdigit:]",
20696     "[:^xdigit:]",
20697     "[:cntrl:]",
20698     "[:^cntrl:]",
20699     "[:ascii:]",
20700     "[:^ascii:]",
20701     "\\v",
20702     "\\V"
20703 };
20704 #endif
20705
20706 /*
20707 - regprop - printable representation of opcode, with run time support
20708 */
20709
20710 void
20711 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
20712 {
20713 #ifdef DEBUGGING
20714     dVAR;
20715     int k;
20716     RXi_GET_DECL(prog, progi);
20717     GET_RE_DEBUG_FLAGS_DECL;
20718
20719     PERL_ARGS_ASSERT_REGPROP;
20720
20721     SvPVCLEAR(sv);
20722
20723     if (OP(o) > REGNODE_MAX) {          /* regnode.type is unsigned */
20724         if (pRExC_state) {  /* This gives more info, if we have it */
20725             FAIL3("panic: corrupted regexp opcode %d > %d",
20726                   (int)OP(o), (int)REGNODE_MAX);
20727         }
20728         else {
20729             Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
20730                              (int)OP(o), (int)REGNODE_MAX);
20731         }
20732     }
20733     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
20734
20735     k = PL_regkind[OP(o)];
20736
20737     if (k == EXACT) {
20738         sv_catpvs(sv, " ");
20739         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
20740          * is a crude hack but it may be the best for now since
20741          * we have no flag "this EXACTish node was UTF-8"
20742          * --jhi */
20743         pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
20744                   PL_colors[0], PL_colors[1],
20745                   PERL_PV_ESCAPE_UNI_DETECT |
20746                   PERL_PV_ESCAPE_NONASCII   |
20747                   PERL_PV_PRETTY_ELLIPSES   |
20748                   PERL_PV_PRETTY_LTGT       |
20749                   PERL_PV_PRETTY_NOCLEAR
20750                   );
20751     } else if (k == TRIE) {
20752         /* print the details of the trie in dumpuntil instead, as
20753          * progi->data isn't available here */
20754         const char op = OP(o);
20755         const U32 n = ARG(o);
20756         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
20757                (reg_ac_data *)progi->data->data[n] :
20758                NULL;
20759         const reg_trie_data * const trie
20760             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
20761
20762         Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
20763         DEBUG_TRIE_COMPILE_r({
20764           if (trie->jump)
20765             sv_catpvs(sv, "(JUMP)");
20766           Perl_sv_catpvf(aTHX_ sv,
20767             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
20768             (UV)trie->startstate,
20769             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
20770             (UV)trie->wordcount,
20771             (UV)trie->minlen,
20772             (UV)trie->maxlen,
20773             (UV)TRIE_CHARCOUNT(trie),
20774             (UV)trie->uniquecharcount
20775           );
20776         });
20777         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
20778             sv_catpvs(sv, "[");
20779             (void) put_charclass_bitmap_innards(sv,
20780                                                 ((IS_ANYOF_TRIE(op))
20781                                                  ? ANYOF_BITMAP(o)
20782                                                  : TRIE_BITMAP(trie)),
20783                                                 NULL,
20784                                                 NULL,
20785                                                 NULL,
20786                                                 0,
20787                                                 FALSE
20788                                                );
20789             sv_catpvs(sv, "]");
20790         }
20791     } else if (k == CURLY) {
20792         U32 lo = ARG1(o), hi = ARG2(o);
20793         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
20794             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
20795         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
20796         if (hi == REG_INFTY)
20797             sv_catpvs(sv, "INFTY");
20798         else
20799             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
20800         sv_catpvs(sv, "}");
20801     }
20802     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
20803         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
20804     else if (k == REF || k == OPEN || k == CLOSE
20805              || k == GROUPP || OP(o)==ACCEPT)
20806     {
20807         AV *name_list= NULL;
20808         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
20809         Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno);        /* Parenth number */
20810         if ( RXp_PAREN_NAMES(prog) ) {
20811             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20812         } else if ( pRExC_state ) {
20813             name_list= RExC_paren_name_list;
20814         }
20815         if (name_list) {
20816             if ( k != REF || (OP(o) < REFN)) {
20817                 SV **name= av_fetch(name_list, parno, 0 );
20818                 if (name)
20819                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20820             }
20821             else {
20822                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
20823                 I32 *nums=(I32*)SvPVX(sv_dat);
20824                 SV **name= av_fetch(name_list, nums[0], 0 );
20825                 I32 n;
20826                 if (name) {
20827                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
20828                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
20829                                     (n ? "," : ""), (IV)nums[n]);
20830                     }
20831                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20832                 }
20833             }
20834         }
20835         if ( k == REF && reginfo) {
20836             U32 n = ARG(o);  /* which paren pair */
20837             I32 ln = prog->offs[n].start;
20838             if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
20839                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
20840             else if (ln == prog->offs[n].end)
20841                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
20842             else {
20843                 const char *s = reginfo->strbeg + ln;
20844                 Perl_sv_catpvf(aTHX_ sv, ": ");
20845                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
20846                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
20847             }
20848         }
20849     } else if (k == GOSUB) {
20850         AV *name_list= NULL;
20851         if ( RXp_PAREN_NAMES(prog) ) {
20852             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20853         } else if ( pRExC_state ) {
20854             name_list= RExC_paren_name_list;
20855         }
20856
20857         /* Paren and offset */
20858         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
20859                 (int)((o + (int)ARG2L(o)) - progi->program) );
20860         if (name_list) {
20861             SV **name= av_fetch(name_list, ARG(o), 0 );
20862             if (name)
20863                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20864         }
20865     }
20866     else if (k == LOGICAL)
20867         /* 2: embedded, otherwise 1 */
20868         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
20869     else if (k == ANYOF || k == ANYOFR) {
20870         U8 flags;
20871         char * bitmap;
20872         U32 arg;
20873         bool do_sep = FALSE;    /* Do we need to separate various components of
20874                                    the output? */
20875         /* Set if there is still an unresolved user-defined property */
20876         SV *unresolved                = NULL;
20877
20878         /* Things that are ignored except when the runtime locale is UTF-8 */
20879         SV *only_utf8_locale_invlist = NULL;
20880
20881         /* Code points that don't fit in the bitmap */
20882         SV *nonbitmap_invlist = NULL;
20883
20884         /* And things that aren't in the bitmap, but are small enough to be */
20885         SV* bitmap_range_not_in_bitmap = NULL;
20886
20887         bool inverted;
20888
20889         if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
20890             flags = 0;
20891             bitmap = NULL;
20892             arg = 0;
20893         }
20894         else {
20895             flags = ANYOF_FLAGS(o);
20896             bitmap = ANYOF_BITMAP(o);
20897             arg = ARG(o);
20898         }
20899
20900         if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
20901             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
20902                 sv_catpvs(sv, "{utf8-locale-reqd}");
20903             }
20904             if (flags & ANYOFL_FOLD) {
20905                 sv_catpvs(sv, "{i}");
20906             }
20907         }
20908
20909         inverted = flags & ANYOF_INVERT;
20910
20911         /* If there is stuff outside the bitmap, get it */
20912         if (arg != ANYOF_ONLY_HAS_BITMAP) {
20913             if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
20914                 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20915                                             ANYOFRbase(o),
20916                                             ANYOFRbase(o) + ANYOFRdelta(o));
20917             }
20918             else {
20919                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
20920                                                 &unresolved,
20921                                                 &only_utf8_locale_invlist,
20922                                                 &nonbitmap_invlist);
20923             }
20924
20925             /* The non-bitmap data may contain stuff that could fit in the
20926              * bitmap.  This could come from a user-defined property being
20927              * finally resolved when this call was done; or much more likely
20928              * because there are matches that require UTF-8 to be valid, and so
20929              * aren't in the bitmap (or ANYOFR).  This is teased apart later */
20930             _invlist_intersection(nonbitmap_invlist,
20931                                   PL_InBitmap,
20932                                   &bitmap_range_not_in_bitmap);
20933             /* Leave just the things that don't fit into the bitmap */
20934             _invlist_subtract(nonbitmap_invlist,
20935                               PL_InBitmap,
20936                               &nonbitmap_invlist);
20937         }
20938
20939         /* Obey this flag to add all above-the-bitmap code points */
20940         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
20941             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20942                                                       NUM_ANYOF_CODE_POINTS,
20943                                                       UV_MAX);
20944         }
20945
20946         /* Ready to start outputting.  First, the initial left bracket */
20947         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20948
20949         /* ANYOFH by definition doesn't have anything that will fit inside the
20950          * bitmap;  ANYOFR may or may not. */
20951         if (  ! inRANGE(OP(o), ANYOFH, ANYOFHr)
20952             && (   ! inRANGE(OP(o), ANYOFR, ANYOFRb)
20953                 ||   ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
20954         {
20955             /* Then all the things that could fit in the bitmap */
20956             do_sep = put_charclass_bitmap_innards(sv,
20957                                                   bitmap,
20958                                                   bitmap_range_not_in_bitmap,
20959                                                   only_utf8_locale_invlist,
20960                                                   o,
20961                                                   flags,
20962
20963                                                   /* Can't try inverting for a
20964                                                    * better display if there
20965                                                    * are things that haven't
20966                                                    * been resolved */
20967                                                   unresolved != NULL
20968                                             || inRANGE(OP(o), ANYOFR, ANYOFRb));
20969             SvREFCNT_dec(bitmap_range_not_in_bitmap);
20970
20971             /* If there are user-defined properties which haven't been defined
20972              * yet, output them.  If the result is not to be inverted, it is
20973              * clearest to output them in a separate [] from the bitmap range
20974              * stuff.  If the result is to be complemented, we have to show
20975              * everything in one [], as the inversion applies to the whole
20976              * thing.  Use {braces} to separate them from anything in the
20977              * bitmap and anything above the bitmap. */
20978             if (unresolved) {
20979                 if (inverted) {
20980                     if (! do_sep) { /* If didn't output anything in the bitmap
20981                                      */
20982                         sv_catpvs(sv, "^");
20983                     }
20984                     sv_catpvs(sv, "{");
20985                 }
20986                 else if (do_sep) {
20987                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
20988                                                       PL_colors[0]);
20989                 }
20990                 sv_catsv(sv, unresolved);
20991                 if (inverted) {
20992                     sv_catpvs(sv, "}");
20993                 }
20994                 do_sep = ! inverted;
20995             }
20996         }
20997
20998         /* And, finally, add the above-the-bitmap stuff */
20999         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21000             SV* contents;
21001
21002             /* See if truncation size is overridden */
21003             const STRLEN dump_len = (PL_dump_re_max_len > 256)
21004                                     ? PL_dump_re_max_len
21005                                     : 256;
21006
21007             /* This is output in a separate [] */
21008             if (do_sep) {
21009                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21010             }
21011
21012             /* And, for easy of understanding, it is shown in the
21013              * uncomplemented form if possible.  The one exception being if
21014              * there are unresolved items, where the inversion has to be
21015              * delayed until runtime */
21016             if (inverted && ! unresolved) {
21017                 _invlist_invert(nonbitmap_invlist);
21018                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21019             }
21020
21021             contents = invlist_contents(nonbitmap_invlist,
21022                                         FALSE /* output suitable for catsv */
21023                                        );
21024
21025             /* If the output is shorter than the permissible maximum, just do it. */
21026             if (SvCUR(contents) <= dump_len) {
21027                 sv_catsv(sv, contents);
21028             }
21029             else {
21030                 const char * contents_string = SvPVX(contents);
21031                 STRLEN i = dump_len;
21032
21033                 /* Otherwise, start at the permissible max and work back to the
21034                  * first break possibility */
21035                 while (i > 0 && contents_string[i] != ' ') {
21036                     i--;
21037                 }
21038                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
21039                                        find a legal break */
21040                     i = dump_len;
21041                 }
21042
21043                 sv_catpvn(sv, contents_string, i);
21044                 sv_catpvs(sv, "...");
21045             }
21046
21047             SvREFCNT_dec_NN(contents);
21048             SvREFCNT_dec_NN(nonbitmap_invlist);
21049         }
21050
21051         /* And finally the matching, closing ']' */
21052         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21053
21054         if (OP(o) == ANYOFHs) {
21055             Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21056         }
21057         else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21058             U8 lowest = (OP(o) != ANYOFHr)
21059                          ? FLAGS(o)
21060                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21061             U8 highest = (OP(o) == ANYOFHr)
21062                          ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21063                          : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21064                            ? 0xFF
21065                            : lowest;
21066             Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21067             if (lowest != highest) {
21068                 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21069             }
21070             Perl_sv_catpvf(aTHX_ sv, ")");
21071         }
21072
21073         SvREFCNT_dec(unresolved);
21074     }
21075     else if (k == ANYOFM) {
21076         SV * cp_list = get_ANYOFM_contents(o);
21077
21078         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21079         if (OP(o) == NANYOFM) {
21080             _invlist_invert(cp_list);
21081         }
21082
21083         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21084         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21085
21086         SvREFCNT_dec(cp_list);
21087     }
21088     else if (k == POSIXD || k == NPOSIXD) {
21089         U8 index = FLAGS(o) * 2;
21090         if (index < C_ARRAY_LENGTH(anyofs)) {
21091             if (*anyofs[index] != '[')  {
21092                 sv_catpvs(sv, "[");
21093             }
21094             sv_catpv(sv, anyofs[index]);
21095             if (*anyofs[index] != '[')  {
21096                 sv_catpvs(sv, "]");
21097             }
21098         }
21099         else {
21100             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21101         }
21102     }
21103     else if (k == BOUND || k == NBOUND) {
21104         /* Must be synced with order of 'bound_type' in regcomp.h */
21105         const char * const bounds[] = {
21106             "",      /* Traditional */
21107             "{gcb}",
21108             "{lb}",
21109             "{sb}",
21110             "{wb}"
21111         };
21112         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21113         sv_catpv(sv, bounds[FLAGS(o)]);
21114     }
21115     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21116         Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21117         if (o->next_off) {
21118             Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21119         }
21120         Perl_sv_catpvf(aTHX_ sv, "]");
21121     }
21122     else if (OP(o) == SBOL)
21123         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21124
21125     /* add on the verb argument if there is one */
21126     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21127         if ( ARG(o) )
21128             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21129                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21130         else
21131             sv_catpvs(sv, ":NULL");
21132     }
21133 #else
21134     PERL_UNUSED_CONTEXT;
21135     PERL_UNUSED_ARG(sv);
21136     PERL_UNUSED_ARG(o);
21137     PERL_UNUSED_ARG(prog);
21138     PERL_UNUSED_ARG(reginfo);
21139     PERL_UNUSED_ARG(pRExC_state);
21140 #endif  /* DEBUGGING */
21141 }
21142
21143
21144
21145 SV *
21146 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21147 {                               /* Assume that RE_INTUIT is set */
21148     struct regexp *const prog = ReANY(r);
21149     GET_RE_DEBUG_FLAGS_DECL;
21150
21151     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21152     PERL_UNUSED_CONTEXT;
21153
21154     DEBUG_COMPILE_r(
21155         {
21156             const char * const s = SvPV_nolen_const(RX_UTF8(r)
21157                       ? prog->check_utf8 : prog->check_substr);
21158
21159             if (!PL_colorset) reginitcolors();
21160             Perl_re_printf( aTHX_
21161                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21162                       PL_colors[4],
21163                       RX_UTF8(r) ? "utf8 " : "",
21164                       PL_colors[5], PL_colors[0],
21165                       s,
21166                       PL_colors[1],
21167                       (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21168         } );
21169
21170     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21171     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21172 }
21173
21174 /*
21175    pregfree()
21176
21177    handles refcounting and freeing the perl core regexp structure. When
21178    it is necessary to actually free the structure the first thing it
21179    does is call the 'free' method of the regexp_engine associated to
21180    the regexp, allowing the handling of the void *pprivate; member
21181    first. (This routine is not overridable by extensions, which is why
21182    the extensions free is called first.)
21183
21184    See regdupe and regdupe_internal if you change anything here.
21185 */
21186 #ifndef PERL_IN_XSUB_RE
21187 void
21188 Perl_pregfree(pTHX_ REGEXP *r)
21189 {
21190     SvREFCNT_dec(r);
21191 }
21192
21193 void
21194 Perl_pregfree2(pTHX_ REGEXP *rx)
21195 {
21196     struct regexp *const r = ReANY(rx);
21197     GET_RE_DEBUG_FLAGS_DECL;
21198
21199     PERL_ARGS_ASSERT_PREGFREE2;
21200
21201     if (! r)
21202         return;
21203
21204     if (r->mother_re) {
21205         ReREFCNT_dec(r->mother_re);
21206     } else {
21207         CALLREGFREE_PVT(rx); /* free the private data */
21208         SvREFCNT_dec(RXp_PAREN_NAMES(r));
21209     }
21210     if (r->substrs) {
21211         int i;
21212         for (i = 0; i < 2; i++) {
21213             SvREFCNT_dec(r->substrs->data[i].substr);
21214             SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21215         }
21216         Safefree(r->substrs);
21217     }
21218     RX_MATCH_COPY_FREE(rx);
21219 #ifdef PERL_ANY_COW
21220     SvREFCNT_dec(r->saved_copy);
21221 #endif
21222     Safefree(r->offs);
21223     SvREFCNT_dec(r->qr_anoncv);
21224     if (r->recurse_locinput)
21225         Safefree(r->recurse_locinput);
21226 }
21227
21228
21229 /*  reg_temp_copy()
21230
21231     Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21232     except that dsv will be created if NULL.
21233
21234     This function is used in two main ways. First to implement
21235         $r = qr/....; $s = $$r;
21236
21237     Secondly, it is used as a hacky workaround to the structural issue of
21238     match results
21239     being stored in the regexp structure which is in turn stored in
21240     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21241     could be PL_curpm in multiple contexts, and could require multiple
21242     result sets being associated with the pattern simultaneously, such
21243     as when doing a recursive match with (??{$qr})
21244
21245     The solution is to make a lightweight copy of the regexp structure
21246     when a qr// is returned from the code executed by (??{$qr}) this
21247     lightweight copy doesn't actually own any of its data except for
21248     the starp/end and the actual regexp structure itself.
21249
21250 */
21251
21252
21253 REGEXP *
21254 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21255 {
21256     struct regexp *drx;
21257     struct regexp *const srx = ReANY(ssv);
21258     const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21259
21260     PERL_ARGS_ASSERT_REG_TEMP_COPY;
21261
21262     if (!dsv)
21263         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21264     else {
21265         assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21266
21267         /* our only valid caller, sv_setsv_flags(), should have done
21268          * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21269         assert(!SvOOK(dsv));
21270         assert(!SvIsCOW(dsv));
21271         assert(!SvROK(dsv));
21272
21273         if (SvPVX_const(dsv)) {
21274             if (SvLEN(dsv))
21275                 Safefree(SvPVX(dsv));
21276             SvPVX(dsv) = NULL;
21277         }
21278         SvLEN_set(dsv, 0);
21279         SvCUR_set(dsv, 0);
21280         SvOK_off((SV *)dsv);
21281
21282         if (islv) {
21283             /* For PVLVs, the head (sv_any) points to an XPVLV, while
21284              * the LV's xpvlenu_rx will point to a regexp body, which
21285              * we allocate here */
21286             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21287             assert(!SvPVX(dsv));
21288             ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21289             temp->sv_any = NULL;
21290             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21291             SvREFCNT_dec_NN(temp);
21292             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21293                ing below will not set it. */
21294             SvCUR_set(dsv, SvCUR(ssv));
21295         }
21296     }
21297     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21298        sv_force_normal(sv) is called.  */
21299     SvFAKE_on(dsv);
21300     drx = ReANY(dsv);
21301
21302     SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21303     SvPV_set(dsv, RX_WRAPPED(ssv));
21304     /* We share the same string buffer as the original regexp, on which we
21305        hold a reference count, incremented when mother_re is set below.
21306        The string pointer is copied here, being part of the regexp struct.
21307      */
21308     memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21309            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21310     if (!islv)
21311         SvLEN_set(dsv, 0);
21312     if (srx->offs) {
21313         const I32 npar = srx->nparens+1;
21314         Newx(drx->offs, npar, regexp_paren_pair);
21315         Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21316     }
21317     if (srx->substrs) {
21318         int i;
21319         Newx(drx->substrs, 1, struct reg_substr_data);
21320         StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21321
21322         for (i = 0; i < 2; i++) {
21323             SvREFCNT_inc_void(drx->substrs->data[i].substr);
21324             SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21325         }
21326
21327         /* check_substr and check_utf8, if non-NULL, point to either their
21328            anchored or float namesakes, and don't hold a second reference.  */
21329     }
21330     RX_MATCH_COPIED_off(dsv);
21331 #ifdef PERL_ANY_COW
21332     drx->saved_copy = NULL;
21333 #endif
21334     drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21335     SvREFCNT_inc_void(drx->qr_anoncv);
21336     if (srx->recurse_locinput)
21337         Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21338
21339     return dsv;
21340 }
21341 #endif
21342
21343
21344 /* regfree_internal()
21345
21346    Free the private data in a regexp. This is overloadable by
21347    extensions. Perl takes care of the regexp structure in pregfree(),
21348    this covers the *pprivate pointer which technically perl doesn't
21349    know about, however of course we have to handle the
21350    regexp_internal structure when no extension is in use.
21351
21352    Note this is called before freeing anything in the regexp
21353    structure.
21354  */
21355
21356 void
21357 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21358 {
21359     struct regexp *const r = ReANY(rx);
21360     RXi_GET_DECL(r, ri);
21361     GET_RE_DEBUG_FLAGS_DECL;
21362
21363     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21364
21365     if (! ri) {
21366         return;
21367     }
21368
21369     DEBUG_COMPILE_r({
21370         if (!PL_colorset)
21371             reginitcolors();
21372         {
21373             SV *dsv= sv_newmortal();
21374             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21375                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21376             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21377                 PL_colors[4], PL_colors[5], s);
21378         }
21379     });
21380
21381 #ifdef RE_TRACK_PATTERN_OFFSETS
21382     if (ri->u.offsets)
21383         Safefree(ri->u.offsets);             /* 20010421 MJD */
21384 #endif
21385     if (ri->code_blocks)
21386         S_free_codeblocks(aTHX_ ri->code_blocks);
21387
21388     if (ri->data) {
21389         int n = ri->data->count;
21390
21391         while (--n >= 0) {
21392           /* If you add a ->what type here, update the comment in regcomp.h */
21393             switch (ri->data->what[n]) {
21394             case 'a':
21395             case 'r':
21396             case 's':
21397             case 'S':
21398             case 'u':
21399                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21400                 break;
21401             case 'f':
21402                 Safefree(ri->data->data[n]);
21403                 break;
21404             case 'l':
21405             case 'L':
21406                 break;
21407             case 'T':
21408                 { /* Aho Corasick add-on structure for a trie node.
21409                      Used in stclass optimization only */
21410                     U32 refcount;
21411                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21412 #ifdef USE_ITHREADS
21413                     dVAR;
21414 #endif
21415                     OP_REFCNT_LOCK;
21416                     refcount = --aho->refcount;
21417                     OP_REFCNT_UNLOCK;
21418                     if ( !refcount ) {
21419                         PerlMemShared_free(aho->states);
21420                         PerlMemShared_free(aho->fail);
21421                          /* do this last!!!! */
21422                         PerlMemShared_free(ri->data->data[n]);
21423                         /* we should only ever get called once, so
21424                          * assert as much, and also guard the free
21425                          * which /might/ happen twice. At the least
21426                          * it will make code anlyzers happy and it
21427                          * doesn't cost much. - Yves */
21428                         assert(ri->regstclass);
21429                         if (ri->regstclass) {
21430                             PerlMemShared_free(ri->regstclass);
21431                             ri->regstclass = 0;
21432                         }
21433                     }
21434                 }
21435                 break;
21436             case 't':
21437                 {
21438                     /* trie structure. */
21439                     U32 refcount;
21440                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21441 #ifdef USE_ITHREADS
21442                     dVAR;
21443 #endif
21444                     OP_REFCNT_LOCK;
21445                     refcount = --trie->refcount;
21446                     OP_REFCNT_UNLOCK;
21447                     if ( !refcount ) {
21448                         PerlMemShared_free(trie->charmap);
21449                         PerlMemShared_free(trie->states);
21450                         PerlMemShared_free(trie->trans);
21451                         if (trie->bitmap)
21452                             PerlMemShared_free(trie->bitmap);
21453                         if (trie->jump)
21454                             PerlMemShared_free(trie->jump);
21455                         PerlMemShared_free(trie->wordinfo);
21456                         /* do this last!!!! */
21457                         PerlMemShared_free(ri->data->data[n]);
21458                     }
21459                 }
21460                 break;
21461             default:
21462                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21463                                                     ri->data->what[n]);
21464             }
21465         }
21466         Safefree(ri->data->what);
21467         Safefree(ri->data);
21468     }
21469
21470     Safefree(ri);
21471 }
21472
21473 #define av_dup_inc(s, t)        MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21474 #define hv_dup_inc(s, t)        MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21475 #define SAVEPVN(p, n)   ((p) ? savepvn(p, n) : NULL)
21476
21477 /*
21478    re_dup_guts - duplicate a regexp.
21479
21480    This routine is expected to clone a given regexp structure. It is only
21481    compiled under USE_ITHREADS.
21482
21483    After all of the core data stored in struct regexp is duplicated
21484    the regexp_engine.dupe method is used to copy any private data
21485    stored in the *pprivate pointer. This allows extensions to handle
21486    any duplication it needs to do.
21487
21488    See pregfree() and regfree_internal() if you change anything here.
21489 */
21490 #if defined(USE_ITHREADS)
21491 #ifndef PERL_IN_XSUB_RE
21492 void
21493 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21494 {
21495     dVAR;
21496     I32 npar;
21497     const struct regexp *r = ReANY(sstr);
21498     struct regexp *ret = ReANY(dstr);
21499
21500     PERL_ARGS_ASSERT_RE_DUP_GUTS;
21501
21502     npar = r->nparens+1;
21503     Newx(ret->offs, npar, regexp_paren_pair);
21504     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21505
21506     if (ret->substrs) {
21507         /* Do it this way to avoid reading from *r after the StructCopy().
21508            That way, if any of the sv_dup_inc()s dislodge *r from the L1
21509            cache, it doesn't matter.  */
21510         int i;
21511         const bool anchored = r->check_substr
21512             ? r->check_substr == r->substrs->data[0].substr
21513             : r->check_utf8   == r->substrs->data[0].utf8_substr;
21514         Newx(ret->substrs, 1, struct reg_substr_data);
21515         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21516
21517         for (i = 0; i < 2; i++) {
21518             ret->substrs->data[i].substr =
21519                         sv_dup_inc(ret->substrs->data[i].substr, param);
21520             ret->substrs->data[i].utf8_substr =
21521                         sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21522         }
21523
21524         /* check_substr and check_utf8, if non-NULL, point to either their
21525            anchored or float namesakes, and don't hold a second reference.  */
21526
21527         if (ret->check_substr) {
21528             if (anchored) {
21529                 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21530
21531                 ret->check_substr = ret->substrs->data[0].substr;
21532                 ret->check_utf8   = ret->substrs->data[0].utf8_substr;
21533             } else {
21534                 assert(r->check_substr == r->substrs->data[1].substr);
21535                 assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
21536
21537                 ret->check_substr = ret->substrs->data[1].substr;
21538                 ret->check_utf8   = ret->substrs->data[1].utf8_substr;
21539             }
21540         } else if (ret->check_utf8) {
21541             if (anchored) {
21542                 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21543             } else {
21544                 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21545             }
21546         }
21547     }
21548
21549     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21550     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21551     if (r->recurse_locinput)
21552         Newx(ret->recurse_locinput, r->nparens + 1, char *);
21553
21554     if (ret->pprivate)
21555         RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21556
21557     if (RX_MATCH_COPIED(dstr))
21558         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
21559     else
21560         ret->subbeg = NULL;
21561 #ifdef PERL_ANY_COW
21562     ret->saved_copy = NULL;
21563 #endif
21564
21565     /* Whether mother_re be set or no, we need to copy the string.  We
21566        cannot refrain from copying it when the storage points directly to
21567        our mother regexp, because that's
21568                1: a buffer in a different thread
21569                2: something we no longer hold a reference on
21570                so we need to copy it locally.  */
21571     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21572     /* set malloced length to a non-zero value so it will be freed
21573      * (otherwise in combination with SVf_FAKE it looks like an alien
21574      * buffer). It doesn't have to be the actual malloced size, since it
21575      * should never be grown */
21576     SvLEN_set(dstr, SvCUR(sstr)+1);
21577     ret->mother_re   = NULL;
21578 }
21579 #endif /* PERL_IN_XSUB_RE */
21580
21581 /*
21582    regdupe_internal()
21583
21584    This is the internal complement to regdupe() which is used to copy
21585    the structure pointed to by the *pprivate pointer in the regexp.
21586    This is the core version of the extension overridable cloning hook.
21587    The regexp structure being duplicated will be copied by perl prior
21588    to this and will be provided as the regexp *r argument, however
21589    with the /old/ structures pprivate pointer value. Thus this routine
21590    may override any copying normally done by perl.
21591
21592    It returns a pointer to the new regexp_internal structure.
21593 */
21594
21595 void *
21596 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21597 {
21598     dVAR;
21599     struct regexp *const r = ReANY(rx);
21600     regexp_internal *reti;
21601     int len;
21602     RXi_GET_DECL(r, ri);
21603
21604     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21605
21606     len = ProgLen(ri);
21607
21608     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21609           char, regexp_internal);
21610     Copy(ri->program, reti->program, len+1, regnode);
21611
21612
21613     if (ri->code_blocks) {
21614         int n;
21615         Newx(reti->code_blocks, 1, struct reg_code_blocks);
21616         Newx(reti->code_blocks->cb, ri->code_blocks->count,
21617                     struct reg_code_block);
21618         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
21619              ri->code_blocks->count, struct reg_code_block);
21620         for (n = 0; n < ri->code_blocks->count; n++)
21621              reti->code_blocks->cb[n].src_regex = (REGEXP*)
21622                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
21623         reti->code_blocks->count = ri->code_blocks->count;
21624         reti->code_blocks->refcnt = 1;
21625     }
21626     else
21627         reti->code_blocks = NULL;
21628
21629     reti->regstclass = NULL;
21630
21631     if (ri->data) {
21632         struct reg_data *d;
21633         const int count = ri->data->count;
21634         int i;
21635
21636         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
21637                 char, struct reg_data);
21638         Newx(d->what, count, U8);
21639
21640         d->count = count;
21641         for (i = 0; i < count; i++) {
21642             d->what[i] = ri->data->what[i];
21643             switch (d->what[i]) {
21644                 /* see also regcomp.h and regfree_internal() */
21645             case 'a': /* actually an AV, but the dup function is identical.
21646                          values seem to be "plain sv's" generally. */
21647             case 'r': /* a compiled regex (but still just another SV) */
21648             case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
21649                          this use case should go away, the code could have used
21650                          'a' instead - see S_set_ANYOF_arg() for array contents. */
21651             case 'S': /* actually an SV, but the dup function is identical.  */
21652             case 'u': /* actually an HV, but the dup function is identical.
21653                          values are "plain sv's" */
21654                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
21655                 break;
21656             case 'f':
21657                 /* Synthetic Start Class - "Fake" charclass we generate to optimize
21658                  * patterns which could start with several different things. Pre-TRIE
21659                  * this was more important than it is now, however this still helps
21660                  * in some places, for instance /x?a+/ might produce a SSC equivalent
21661                  * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
21662                  * in regexec.c
21663                  */
21664                 /* This is cheating. */
21665                 Newx(d->data[i], 1, regnode_ssc);
21666                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
21667                 reti->regstclass = (regnode*)d->data[i];
21668                 break;
21669             case 'T':
21670                 /* AHO-CORASICK fail table */
21671                 /* Trie stclasses are readonly and can thus be shared
21672                  * without duplication. We free the stclass in pregfree
21673                  * when the corresponding reg_ac_data struct is freed.
21674                  */
21675                 reti->regstclass= ri->regstclass;
21676                 /* FALLTHROUGH */
21677             case 't':
21678                 /* TRIE transition table */
21679                 OP_REFCNT_LOCK;
21680                 ((reg_trie_data*)ri->data->data[i])->refcount++;
21681                 OP_REFCNT_UNLOCK;
21682                 /* FALLTHROUGH */
21683             case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
21684             case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
21685                          is not from another regexp */
21686                 d->data[i] = ri->data->data[i];
21687                 break;
21688             default:
21689                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
21690                                                            ri->data->what[i]);
21691             }
21692         }
21693
21694         reti->data = d;
21695     }
21696     else
21697         reti->data = NULL;
21698
21699     reti->name_list_idx = ri->name_list_idx;
21700
21701 #ifdef RE_TRACK_PATTERN_OFFSETS
21702     if (ri->u.offsets) {
21703         Newx(reti->u.offsets, 2*len+1, U32);
21704         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
21705     }
21706 #else
21707     SetProgLen(reti, len);
21708 #endif
21709
21710     return (void*)reti;
21711 }
21712
21713 #endif    /* USE_ITHREADS */
21714
21715 #ifndef PERL_IN_XSUB_RE
21716
21717 /*
21718  - regnext - dig the "next" pointer out of a node
21719  */
21720 regnode *
21721 Perl_regnext(pTHX_ regnode *p)
21722 {
21723     I32 offset;
21724
21725     if (!p)
21726         return(NULL);
21727
21728     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
21729         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
21730                                                 (int)OP(p), (int)REGNODE_MAX);
21731     }
21732
21733     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
21734     if (offset == 0)
21735         return(NULL);
21736
21737     return(p+offset);
21738 }
21739
21740 #endif
21741
21742 STATIC void
21743 S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
21744 {
21745     va_list args;
21746     STRLEN l1 = strlen(pat1);
21747     STRLEN l2 = strlen(pat2);
21748     char buf[512];
21749     SV *msv;
21750     const char *message;
21751
21752     PERL_ARGS_ASSERT_RE_CROAK2;
21753
21754     if (l1 > 510)
21755         l1 = 510;
21756     if (l1 + l2 > 510)
21757         l2 = 510 - l1;
21758     Copy(pat1, buf, l1 , char);
21759     Copy(pat2, buf + l1, l2 , char);
21760     buf[l1 + l2] = '\n';
21761     buf[l1 + l2 + 1] = '\0';
21762     va_start(args, pat2);
21763     msv = vmess(buf, &args);
21764     va_end(args);
21765     message = SvPV_const(msv, l1);
21766     if (l1 > 512)
21767         l1 = 512;
21768     Copy(message, buf, l1 , char);
21769     /* l1-1 to avoid \n */
21770     Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
21771 }
21772
21773 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
21774
21775 #ifndef PERL_IN_XSUB_RE
21776 void
21777 Perl_save_re_context(pTHX)
21778 {
21779     I32 nparens = -1;
21780     I32 i;
21781
21782     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
21783
21784     if (PL_curpm) {
21785         const REGEXP * const rx = PM_GETRE(PL_curpm);
21786         if (rx)
21787             nparens = RX_NPARENS(rx);
21788     }
21789
21790     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
21791      * that PL_curpm will be null, but that utf8.pm and the modules it
21792      * loads will only use $1..$3.
21793      * The t/porting/re_context.t test file checks this assumption.
21794      */
21795     if (nparens == -1)
21796         nparens = 3;
21797
21798     for (i = 1; i <= nparens; i++) {
21799         char digits[TYPE_CHARS(long)];
21800         const STRLEN len = my_snprintf(digits, sizeof(digits),
21801                                        "%lu", (long)i);
21802         GV *const *const gvp
21803             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
21804
21805         if (gvp) {
21806             GV * const gv = *gvp;
21807             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
21808                 save_scalar(gv);
21809         }
21810     }
21811 }
21812 #endif
21813
21814 #ifdef DEBUGGING
21815
21816 STATIC void
21817 S_put_code_point(pTHX_ SV *sv, UV c)
21818 {
21819     PERL_ARGS_ASSERT_PUT_CODE_POINT;
21820
21821     if (c > 255) {
21822         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
21823     }
21824     else if (isPRINT(c)) {
21825         const char string = (char) c;
21826
21827         /* We use {phrase} as metanotation in the class, so also escape literal
21828          * braces */
21829         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
21830             sv_catpvs(sv, "\\");
21831         sv_catpvn(sv, &string, 1);
21832     }
21833     else if (isMNEMONIC_CNTRL(c)) {
21834         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
21835     }
21836     else {
21837         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
21838     }
21839 }
21840
21841 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
21842
21843 STATIC void
21844 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
21845 {
21846     /* Appends to 'sv' a displayable version of the range of code points from
21847      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
21848      * that have them, when they occur at the beginning or end of the range.
21849      * It uses hex to output the remaining code points, unless 'allow_literals'
21850      * is true, in which case the printable ASCII ones are output as-is (though
21851      * some of these will be escaped by put_code_point()).
21852      *
21853      * NOTE:  This is designed only for printing ranges of code points that fit
21854      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
21855      */
21856
21857     const unsigned int min_range_count = 3;
21858
21859     assert(start <= end);
21860
21861     PERL_ARGS_ASSERT_PUT_RANGE;
21862
21863     while (start <= end) {
21864         UV this_end;
21865         const char * format;
21866
21867         if (end - start < min_range_count) {
21868
21869             /* Output chars individually when they occur in short ranges */
21870             for (; start <= end; start++) {
21871                 put_code_point(sv, start);
21872             }
21873             break;
21874         }
21875
21876         /* If permitted by the input options, and there is a possibility that
21877          * this range contains a printable literal, look to see if there is
21878          * one. */
21879         if (allow_literals && start <= MAX_PRINT_A) {
21880
21881             /* If the character at the beginning of the range isn't an ASCII
21882              * printable, effectively split the range into two parts:
21883              *  1) the portion before the first such printable,
21884              *  2) the rest
21885              * and output them separately. */
21886             if (! isPRINT_A(start)) {
21887                 UV temp_end = start + 1;
21888
21889                 /* There is no point looking beyond the final possible
21890                  * printable, in MAX_PRINT_A */
21891                 UV max = MIN(end, MAX_PRINT_A);
21892
21893                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
21894                     temp_end++;
21895                 }
21896
21897                 /* Here, temp_end points to one beyond the first printable if
21898                  * found, or to one beyond 'max' if not.  If none found, make
21899                  * sure that we use the entire range */
21900                 if (temp_end > MAX_PRINT_A) {
21901                     temp_end = end + 1;
21902                 }
21903
21904                 /* Output the first part of the split range: the part that
21905                  * doesn't have printables, with the parameter set to not look
21906                  * for literals (otherwise we would infinitely recurse) */
21907                 put_range(sv, start, temp_end - 1, FALSE);
21908
21909                 /* The 2nd part of the range (if any) starts here. */
21910                 start = temp_end;
21911
21912                 /* We do a continue, instead of dropping down, because even if
21913                  * the 2nd part is non-empty, it could be so short that we want
21914                  * to output it as individual characters, as tested for at the
21915                  * top of this loop.  */
21916                 continue;
21917             }
21918
21919             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
21920              * output a sub-range of just the digits or letters, then process
21921              * the remaining portion as usual. */
21922             if (isALPHANUMERIC_A(start)) {
21923                 UV mask = (isDIGIT_A(start))
21924                            ? _CC_DIGIT
21925                              : isUPPER_A(start)
21926                                ? _CC_UPPER
21927                                : _CC_LOWER;
21928                 UV temp_end = start + 1;
21929
21930                 /* Find the end of the sub-range that includes just the
21931                  * characters in the same class as the first character in it */
21932                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
21933                     temp_end++;
21934                 }
21935                 temp_end--;
21936
21937                 /* For short ranges, don't duplicate the code above to output
21938                  * them; just call recursively */
21939                 if (temp_end - start < min_range_count) {
21940                     put_range(sv, start, temp_end, FALSE);
21941                 }
21942                 else {  /* Output as a range */
21943                     put_code_point(sv, start);
21944                     sv_catpvs(sv, "-");
21945                     put_code_point(sv, temp_end);
21946                 }
21947                 start = temp_end + 1;
21948                 continue;
21949             }
21950
21951             /* We output any other printables as individual characters */
21952             if (isPUNCT_A(start) || isSPACE_A(start)) {
21953                 while (start <= end && (isPUNCT_A(start)
21954                                         || isSPACE_A(start)))
21955                 {
21956                     put_code_point(sv, start);
21957                     start++;
21958                 }
21959                 continue;
21960             }
21961         } /* End of looking for literals */
21962
21963         /* Here is not to output as a literal.  Some control characters have
21964          * mnemonic names.  Split off any of those at the beginning and end of
21965          * the range to print mnemonically.  It isn't possible for many of
21966          * these to be in a row, so this won't overwhelm with output */
21967         if (   start <= end
21968             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
21969         {
21970             while (isMNEMONIC_CNTRL(start) && start <= end) {
21971                 put_code_point(sv, start);
21972                 start++;
21973             }
21974
21975             /* If this didn't take care of the whole range ... */
21976             if (start <= end) {
21977
21978                 /* Look backwards from the end to find the final non-mnemonic
21979                  * */
21980                 UV temp_end = end;
21981                 while (isMNEMONIC_CNTRL(temp_end)) {
21982                     temp_end--;
21983                 }
21984
21985                 /* And separately output the interior range that doesn't start
21986                  * or end with mnemonics */
21987                 put_range(sv, start, temp_end, FALSE);
21988
21989                 /* Then output the mnemonic trailing controls */
21990                 start = temp_end + 1;
21991                 while (start <= end) {
21992                     put_code_point(sv, start);
21993                     start++;
21994                 }
21995                 break;
21996             }
21997         }
21998
21999         /* As a final resort, output the range or subrange as hex. */
22000
22001         if (start >= NUM_ANYOF_CODE_POINTS) {
22002             this_end = end;
22003         }
22004         else {  /* Have to split range at the bitmap boundary */
22005             this_end = (end < NUM_ANYOF_CODE_POINTS)
22006                         ? end
22007                         : NUM_ANYOF_CODE_POINTS - 1;
22008         }
22009 #if NUM_ANYOF_CODE_POINTS > 256
22010         format = (this_end < 256)
22011                  ? "\\x%02" UVXf "-\\x%02" UVXf
22012                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22013 #else
22014         format = "\\x%02" UVXf "-\\x%02" UVXf;
22015 #endif
22016         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22017         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22018         GCC_DIAG_RESTORE_STMT;
22019         break;
22020     }
22021 }
22022
22023 STATIC void
22024 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22025 {
22026     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22027      * 'invlist' */
22028
22029     UV start, end;
22030     bool allow_literals = TRUE;
22031
22032     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22033
22034     /* Generally, it is more readable if printable characters are output as
22035      * literals, but if a range (nearly) spans all of them, it's best to output
22036      * it as a single range.  This code will use a single range if all but 2
22037      * ASCII printables are in it */
22038     invlist_iterinit(invlist);
22039     while (invlist_iternext(invlist, &start, &end)) {
22040
22041         /* If the range starts beyond the final printable, it doesn't have any
22042          * in it */
22043         if (start > MAX_PRINT_A) {
22044             break;
22045         }
22046
22047         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
22048          * all but two, the range must start and end no later than 2 from
22049          * either end */
22050         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22051             if (end > MAX_PRINT_A) {
22052                 end = MAX_PRINT_A;
22053             }
22054             if (start < ' ') {
22055                 start = ' ';
22056             }
22057             if (end - start >= MAX_PRINT_A - ' ' - 2) {
22058                 allow_literals = FALSE;
22059             }
22060             break;
22061         }
22062     }
22063     invlist_iterfinish(invlist);
22064
22065     /* Here we have figured things out.  Output each range */
22066     invlist_iterinit(invlist);
22067     while (invlist_iternext(invlist, &start, &end)) {
22068         if (start >= NUM_ANYOF_CODE_POINTS) {
22069             break;
22070         }
22071         put_range(sv, start, end, allow_literals);
22072     }
22073     invlist_iterfinish(invlist);
22074
22075     return;
22076 }
22077
22078 STATIC SV*
22079 S_put_charclass_bitmap_innards_common(pTHX_
22080         SV* invlist,            /* The bitmap */
22081         SV* posixes,            /* Under /l, things like [:word:], \S */
22082         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
22083         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
22084         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
22085         const bool invert       /* Is the result to be inverted? */
22086 )
22087 {
22088     /* Create and return an SV containing a displayable version of the bitmap
22089      * and associated information determined by the input parameters.  If the
22090      * output would have been only the inversion indicator '^', NULL is instead
22091      * returned. */
22092
22093     dVAR;
22094     SV * output;
22095
22096     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22097
22098     if (invert) {
22099         output = newSVpvs("^");
22100     }
22101     else {
22102         output = newSVpvs("");
22103     }
22104
22105     /* First, the code points in the bitmap that are unconditionally there */
22106     put_charclass_bitmap_innards_invlist(output, invlist);
22107
22108     /* Traditionally, these have been placed after the main code points */
22109     if (posixes) {
22110         sv_catsv(output, posixes);
22111     }
22112
22113     if (only_utf8 && _invlist_len(only_utf8)) {
22114         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22115         put_charclass_bitmap_innards_invlist(output, only_utf8);
22116     }
22117
22118     if (not_utf8 && _invlist_len(not_utf8)) {
22119         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22120         put_charclass_bitmap_innards_invlist(output, not_utf8);
22121     }
22122
22123     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22124         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22125         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22126
22127         /* This is the only list in this routine that can legally contain code
22128          * points outside the bitmap range.  The call just above to
22129          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22130          * output them here.  There's about a half-dozen possible, and none in
22131          * contiguous ranges longer than 2 */
22132         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22133             UV start, end;
22134             SV* above_bitmap = NULL;
22135
22136             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22137
22138             invlist_iterinit(above_bitmap);
22139             while (invlist_iternext(above_bitmap, &start, &end)) {
22140                 UV i;
22141
22142                 for (i = start; i <= end; i++) {
22143                     put_code_point(output, i);
22144                 }
22145             }
22146             invlist_iterfinish(above_bitmap);
22147             SvREFCNT_dec_NN(above_bitmap);
22148         }
22149     }
22150
22151     if (invert && SvCUR(output) == 1) {
22152         return NULL;
22153     }
22154
22155     return output;
22156 }
22157
22158 STATIC bool
22159 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22160                                      char *bitmap,
22161                                      SV *nonbitmap_invlist,
22162                                      SV *only_utf8_locale_invlist,
22163                                      const regnode * const node,
22164                                      const U8 flags,
22165                                      const bool force_as_is_display)
22166 {
22167     /* Appends to 'sv' a displayable version of the innards of the bracketed
22168      * character class defined by the other arguments:
22169      *  'bitmap' points to the bitmap, or NULL if to ignore that.
22170      *  'nonbitmap_invlist' is an inversion list of the code points that are in
22171      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
22172      *      none.  The reasons for this could be that they require some
22173      *      condition such as the target string being or not being in UTF-8
22174      *      (under /d), or because they came from a user-defined property that
22175      *      was not resolved at the time of the regex compilation (under /u)
22176      *  'only_utf8_locale_invlist' is an inversion list of the code points that
22177      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
22178      *  'node' is the regex pattern ANYOF node.  It is needed only when the
22179      *      above two parameters are not null, and is passed so that this
22180      *      routine can tease apart the various reasons for them.
22181      *  'flags' is the flags field of 'node'
22182      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
22183      *      to invert things to see if that leads to a cleaner display.  If
22184      *      FALSE, this routine is free to use its judgment about doing this.
22185      *
22186      * It returns TRUE if there was actually something output.  (It may be that
22187      * the bitmap, etc is empty.)
22188      *
22189      * When called for outputting the bitmap of a non-ANYOF node, just pass the
22190      * bitmap, with the succeeding parameters set to NULL, and the final one to
22191      * FALSE.
22192      */
22193
22194     /* In general, it tries to display the 'cleanest' representation of the
22195      * innards, choosing whether to display them inverted or not, regardless of
22196      * whether the class itself is to be inverted.  However,  there are some
22197      * cases where it can't try inverting, as what actually matches isn't known
22198      * until runtime, and hence the inversion isn't either. */
22199
22200     dVAR;
22201     bool inverting_allowed = ! force_as_is_display;
22202
22203     int i;
22204     STRLEN orig_sv_cur = SvCUR(sv);
22205
22206     SV* invlist;            /* Inversion list we accumulate of code points that
22207                                are unconditionally matched */
22208     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
22209                                UTF-8 */
22210     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
22211                              */
22212     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
22213     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
22214                                        is UTF-8 */
22215
22216     SV* as_is_display;      /* The output string when we take the inputs
22217                                literally */
22218     SV* inverted_display;   /* The output string when we invert the inputs */
22219
22220     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
22221                                                    to match? */
22222     /* We are biased in favor of displaying things without them being inverted,
22223      * as that is generally easier to understand */
22224     const int bias = 5;
22225
22226     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22227
22228     /* Start off with whatever code points are passed in.  (We clone, so we
22229      * don't change the caller's list) */
22230     if (nonbitmap_invlist) {
22231         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22232         invlist = invlist_clone(nonbitmap_invlist, NULL);
22233     }
22234     else {  /* Worst case size is every other code point is matched */
22235         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22236     }
22237
22238     if (flags) {
22239         if (OP(node) == ANYOFD) {
22240
22241             /* This flag indicates that the code points below 0x100 in the
22242              * nonbitmap list are precisely the ones that match only when the
22243              * target is UTF-8 (they should all be non-ASCII). */
22244             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22245             {
22246                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22247                 _invlist_subtract(invlist, only_utf8, &invlist);
22248             }
22249
22250             /* And this flag for matching all non-ASCII 0xFF and below */
22251             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22252             {
22253                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22254             }
22255         }
22256         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22257
22258             /* If either of these flags are set, what matches isn't
22259              * determinable except during execution, so don't know enough here
22260              * to invert */
22261             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22262                 inverting_allowed = FALSE;
22263             }
22264
22265             /* What the posix classes match also varies at runtime, so these
22266              * will be output symbolically. */
22267             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22268                 int i;
22269
22270                 posixes = newSVpvs("");
22271                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22272                     if (ANYOF_POSIXL_TEST(node, i)) {
22273                         sv_catpv(posixes, anyofs[i]);
22274                     }
22275                 }
22276             }
22277         }
22278     }
22279
22280     /* Accumulate the bit map into the unconditional match list */
22281     if (bitmap) {
22282         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22283             if (BITMAP_TEST(bitmap, i)) {
22284                 int start = i++;
22285                 for (;
22286                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22287                      i++)
22288                 { /* empty */ }
22289                 invlist = _add_range_to_invlist(invlist, start, i-1);
22290             }
22291         }
22292     }
22293
22294     /* Make sure that the conditional match lists don't have anything in them
22295      * that match unconditionally; otherwise the output is quite confusing.
22296      * This could happen if the code that populates these misses some
22297      * duplication. */
22298     if (only_utf8) {
22299         _invlist_subtract(only_utf8, invlist, &only_utf8);
22300     }
22301     if (not_utf8) {
22302         _invlist_subtract(not_utf8, invlist, &not_utf8);
22303     }
22304
22305     if (only_utf8_locale_invlist) {
22306
22307         /* Since this list is passed in, we have to make a copy before
22308          * modifying it */
22309         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22310
22311         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22312
22313         /* And, it can get really weird for us to try outputting an inverted
22314          * form of this list when it has things above the bitmap, so don't even
22315          * try */
22316         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22317             inverting_allowed = FALSE;
22318         }
22319     }
22320
22321     /* Calculate what the output would be if we take the input as-is */
22322     as_is_display = put_charclass_bitmap_innards_common(invlist,
22323                                                     posixes,
22324                                                     only_utf8,
22325                                                     not_utf8,
22326                                                     only_utf8_locale,
22327                                                     invert);
22328
22329     /* If have to take the output as-is, just do that */
22330     if (! inverting_allowed) {
22331         if (as_is_display) {
22332             sv_catsv(sv, as_is_display);
22333             SvREFCNT_dec_NN(as_is_display);
22334         }
22335     }
22336     else { /* But otherwise, create the output again on the inverted input, and
22337               use whichever version is shorter */
22338
22339         int inverted_bias, as_is_bias;
22340
22341         /* We will apply our bias to whichever of the the results doesn't have
22342          * the '^' */
22343         if (invert) {
22344             invert = FALSE;
22345             as_is_bias = bias;
22346             inverted_bias = 0;
22347         }
22348         else {
22349             invert = TRUE;
22350             as_is_bias = 0;
22351             inverted_bias = bias;
22352         }
22353
22354         /* Now invert each of the lists that contribute to the output,
22355          * excluding from the result things outside the possible range */
22356
22357         /* For the unconditional inversion list, we have to add in all the
22358          * conditional code points, so that when inverted, they will be gone
22359          * from it */
22360         _invlist_union(only_utf8, invlist, &invlist);
22361         _invlist_union(not_utf8, invlist, &invlist);
22362         _invlist_union(only_utf8_locale, invlist, &invlist);
22363         _invlist_invert(invlist);
22364         _invlist_intersection(invlist, PL_InBitmap, &invlist);
22365
22366         if (only_utf8) {
22367             _invlist_invert(only_utf8);
22368             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22369         }
22370         else if (not_utf8) {
22371
22372             /* If a code point matches iff the target string is not in UTF-8,
22373              * then complementing the result has it not match iff not in UTF-8,
22374              * which is the same thing as matching iff it is UTF-8. */
22375             only_utf8 = not_utf8;
22376             not_utf8 = NULL;
22377         }
22378
22379         if (only_utf8_locale) {
22380             _invlist_invert(only_utf8_locale);
22381             _invlist_intersection(only_utf8_locale,
22382                                   PL_InBitmap,
22383                                   &only_utf8_locale);
22384         }
22385
22386         inverted_display = put_charclass_bitmap_innards_common(
22387                                             invlist,
22388                                             posixes,
22389                                             only_utf8,
22390                                             not_utf8,
22391                                             only_utf8_locale, invert);
22392
22393         /* Use the shortest representation, taking into account our bias
22394          * against showing it inverted */
22395         if (   inverted_display
22396             && (   ! as_is_display
22397                 || (  SvCUR(inverted_display) + inverted_bias
22398                     < SvCUR(as_is_display)    + as_is_bias)))
22399         {
22400             sv_catsv(sv, inverted_display);
22401         }
22402         else if (as_is_display) {
22403             sv_catsv(sv, as_is_display);
22404         }
22405
22406         SvREFCNT_dec(as_is_display);
22407         SvREFCNT_dec(inverted_display);
22408     }
22409
22410     SvREFCNT_dec_NN(invlist);
22411     SvREFCNT_dec(only_utf8);
22412     SvREFCNT_dec(not_utf8);
22413     SvREFCNT_dec(posixes);
22414     SvREFCNT_dec(only_utf8_locale);
22415
22416     return SvCUR(sv) > orig_sv_cur;
22417 }
22418
22419 #define CLEAR_OPTSTART                                                       \
22420     if (optstart) STMT_START {                                               \
22421         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
22422                               " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22423         optstart=NULL;                                                       \
22424     } STMT_END
22425
22426 #define DUMPUNTIL(b,e)                                                       \
22427                     CLEAR_OPTSTART;                                          \
22428                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22429
22430 STATIC const regnode *
22431 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22432             const regnode *last, const regnode *plast,
22433             SV* sv, I32 indent, U32 depth)
22434 {
22435     U8 op = PSEUDO;     /* Arbitrary non-END op. */
22436     const regnode *next;
22437     const regnode *optstart= NULL;
22438
22439     RXi_GET_DECL(r, ri);
22440     GET_RE_DEBUG_FLAGS_DECL;
22441
22442     PERL_ARGS_ASSERT_DUMPUNTIL;
22443
22444 #ifdef DEBUG_DUMPUNTIL
22445     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
22446         last ? last-start : 0, plast ? plast-start : 0);
22447 #endif
22448
22449     if (plast && plast < last)
22450         last= plast;
22451
22452     while (PL_regkind[op] != END && (!last || node < last)) {
22453         assert(node);
22454         /* While that wasn't END last time... */
22455         NODE_ALIGN(node);
22456         op = OP(node);
22457         if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22458             indent--;
22459         next = regnext((regnode *)node);
22460
22461         /* Where, what. */
22462         if (OP(node) == OPTIMIZED) {
22463             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22464                 optstart = node;
22465             else
22466                 goto after_print;
22467         } else
22468             CLEAR_OPTSTART;
22469
22470         regprop(r, sv, node, NULL, NULL);
22471         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
22472                       (int)(2*indent + 1), "", SvPVX_const(sv));
22473
22474         if (OP(node) != OPTIMIZED) {
22475             if (next == NULL)           /* Next ptr. */
22476                 Perl_re_printf( aTHX_  " (0)");
22477             else if (PL_regkind[(U8)op] == BRANCH
22478                      && PL_regkind[OP(next)] != BRANCH )
22479                 Perl_re_printf( aTHX_  " (FAIL)");
22480             else
22481                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
22482             Perl_re_printf( aTHX_ "\n");
22483         }
22484
22485       after_print:
22486         if (PL_regkind[(U8)op] == BRANCHJ) {
22487             assert(next);
22488             {
22489                 const regnode *nnode = (OP(next) == LONGJMP
22490                                        ? regnext((regnode *)next)
22491                                        : next);
22492                 if (last && nnode > last)
22493                     nnode = last;
22494                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22495             }
22496         }
22497         else if (PL_regkind[(U8)op] == BRANCH) {
22498             assert(next);
22499             DUMPUNTIL(NEXTOPER(node), next);
22500         }
22501         else if ( PL_regkind[(U8)op]  == TRIE ) {
22502             const regnode *this_trie = node;
22503             const char op = OP(node);
22504             const U32 n = ARG(node);
22505             const reg_ac_data * const ac = op>=AHOCORASICK ?
22506                (reg_ac_data *)ri->data->data[n] :
22507                NULL;
22508             const reg_trie_data * const trie =
22509                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22510 #ifdef DEBUGGING
22511             AV *const trie_words
22512                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22513 #endif
22514             const regnode *nextbranch= NULL;
22515             I32 word_idx;
22516             SvPVCLEAR(sv);
22517             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22518                 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22519
22520                 Perl_re_indentf( aTHX_  "%s ",
22521                     indent+3,
22522                     elem_ptr
22523                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22524                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
22525                                 PL_colors[0], PL_colors[1],
22526                                 (SvUTF8(*elem_ptr)
22527                                  ? PERL_PV_ESCAPE_UNI
22528                                  : 0)
22529                                 | PERL_PV_PRETTY_ELLIPSES
22530                                 | PERL_PV_PRETTY_LTGT
22531                             )
22532                     : "???"
22533                 );
22534                 if (trie->jump) {
22535                     U16 dist= trie->jump[word_idx+1];
22536                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
22537                                (UV)((dist ? this_trie + dist : next) - start));
22538                     if (dist) {
22539                         if (!nextbranch)
22540                             nextbranch= this_trie + trie->jump[0];
22541                         DUMPUNTIL(this_trie + dist, nextbranch);
22542                     }
22543                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22544                         nextbranch= regnext((regnode *)nextbranch);
22545                 } else {
22546                     Perl_re_printf( aTHX_  "\n");
22547                 }
22548             }
22549             if (last && next > last)
22550                 node= last;
22551             else
22552                 node= next;
22553         }
22554         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
22555             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22556                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22557         }
22558         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22559             assert(next);
22560             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22561         }
22562         else if ( op == PLUS || op == STAR) {
22563             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22564         }
22565         else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
22566             /* Literal string, where present. */
22567             node += NODE_SZ_STR(node) - 1;
22568             node = NEXTOPER(node);
22569         }
22570         else {
22571             node = NEXTOPER(node);
22572             node += regarglen[(U8)op];
22573         }
22574         if (op == CURLYX || op == OPEN || op == SROPEN)
22575             indent++;
22576     }
22577     CLEAR_OPTSTART;
22578 #ifdef DEBUG_DUMPUNTIL
22579     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
22580 #endif
22581     return node;
22582 }
22583
22584 #endif  /* DEBUGGING */
22585
22586 #ifndef PERL_IN_XSUB_RE
22587
22588 #include "uni_keywords.h"
22589
22590 void
22591 Perl_init_uniprops(pTHX)
22592 {
22593     dVAR;
22594
22595 #ifdef DEBUGGING
22596     char * dump_len_string;
22597
22598     dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
22599     if (   ! dump_len_string
22600         || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
22601     {
22602         PL_dump_re_max_len = 60;    /* A reasonable default */
22603     }
22604 #endif
22605
22606     PL_user_def_props = newHV();
22607
22608 #ifdef USE_ITHREADS
22609
22610     HvSHAREKEYS_off(PL_user_def_props);
22611     PL_user_def_props_aTHX = aTHX;
22612
22613 #endif
22614
22615     /* Set up the inversion list interpreter-level variables */
22616
22617     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22618     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
22619     PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
22620     PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
22621     PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
22622     PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
22623     PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
22624     PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
22625     PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
22626     PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
22627     PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
22628     PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
22629     PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
22630     PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
22631     PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
22632     PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
22633
22634     PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22635     PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
22636     PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
22637     PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
22638     PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
22639     PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
22640     PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
22641     PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
22642     PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
22643     PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
22644     PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
22645     PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
22646     PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
22647     PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
22648     PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
22649     PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
22650
22651     PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
22652     PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
22653     PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
22654     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
22655     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
22656
22657     PL_InBitmap = _new_invlist_C_array(_Perl_InBitmap_invlist);
22658     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
22659     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
22660     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
22661
22662     PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
22663
22664     PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
22665     PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
22666
22667     PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
22668     PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
22669
22670     PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
22671     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22672                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
22673     PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22674                                             UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
22675     PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
22676     PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
22677     PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
22678     PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
22679     PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
22680     PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
22681     PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
22682     PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
22683     PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
22684
22685 #ifdef UNI_XIDC
22686     /* The below are used only by deprecated functions.  They could be removed */
22687     PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
22688     PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
22689     PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
22690 #endif
22691 }
22692
22693 #if 0
22694
22695 This code was mainly added for backcompat to give a warning for non-portable
22696 code points in user-defined properties.  But experiments showed that the
22697 warning in earlier perls were only omitted on overflow, which should be an
22698 error, so there really isnt a backcompat issue, and actually adding the
22699 warning when none was present before might cause breakage, for little gain.  So
22700 khw left this code in, but not enabled.  Tests were never added.
22701
22702 embed.fnc entry:
22703 Ei      |const char *|get_extended_utf8_msg|const UV cp
22704
22705 PERL_STATIC_INLINE const char *
22706 S_get_extended_utf8_msg(pTHX_ const UV cp)
22707 {
22708     U8 dummy[UTF8_MAXBYTES + 1];
22709     HV *msgs;
22710     SV **msg;
22711
22712     uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
22713                              &msgs);
22714
22715     msg = hv_fetchs(msgs, "text", 0);
22716     assert(msg);
22717
22718     (void) sv_2mortal((SV *) msgs);
22719
22720     return SvPVX(*msg);
22721 }
22722
22723 #endif
22724
22725 SV *
22726 Perl_handle_user_defined_property(pTHX_
22727
22728     /* Parses the contents of a user-defined property definition; returning the
22729      * expanded definition if possible.  If so, the return is an inversion
22730      * list.
22731      *
22732      * If there are subroutines that are part of the expansion and which aren't
22733      * known at the time of the call to this function, this returns what
22734      * parse_uniprop_string() returned for the first one encountered.
22735      *
22736      * If an error was found, NULL is returned, and 'msg' gets a suitable
22737      * message appended to it.  (Appending allows the back trace of how we got
22738      * to the faulty definition to be displayed through nested calls of
22739      * user-defined subs.)
22740      *
22741      * The caller IS responsible for freeing any returned SV.
22742      *
22743      * The syntax of the contents is pretty much described in perlunicode.pod,
22744      * but we also allow comments on each line */
22745
22746     const char * name,          /* Name of property */
22747     const STRLEN name_len,      /* The name's length in bytes */
22748     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
22749     const bool to_fold,         /* ? Is this under /i */
22750     const bool runtime,         /* ? Are we in compile- or run-time */
22751     const bool deferrable,      /* Is it ok for this property's full definition
22752                                    to be deferred until later? */
22753     SV* contents,               /* The property's definition */
22754     bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
22755                                    getting called unless this is thought to be
22756                                    a user-defined property */
22757     SV * msg,                   /* Any error or warning msg(s) are appended to
22758                                    this */
22759     const STRLEN level)         /* Recursion level of this call */
22760 {
22761     STRLEN len;
22762     const char * string         = SvPV_const(contents, len);
22763     const char * const e        = string + len;
22764     const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
22765     const STRLEN msgs_length_on_entry = SvCUR(msg);
22766
22767     const char * s0 = string;   /* Points to first byte in the current line
22768                                    being parsed in 'string' */
22769     const char overflow_msg[] = "Code point too large in \"";
22770     SV* running_definition = NULL;
22771
22772     PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
22773
22774     *user_defined_ptr = TRUE;
22775
22776     /* Look at each line */
22777     while (s0 < e) {
22778         const char * s;     /* Current byte */
22779         char op = '+';      /* Default operation is 'union' */
22780         IV   min = 0;       /* range begin code point */
22781         IV   max = -1;      /* and range end */
22782         SV* this_definition;
22783
22784         /* Skip comment lines */
22785         if (*s0 == '#') {
22786             s0 = strchr(s0, '\n');
22787             if (s0 == NULL) {
22788                 break;
22789             }
22790             s0++;
22791             continue;
22792         }
22793
22794         /* For backcompat, allow an empty first line */
22795         if (*s0 == '\n') {
22796             s0++;
22797             continue;
22798         }
22799
22800         /* First character in the line may optionally be the operation */
22801         if (   *s0 == '+'
22802             || *s0 == '!'
22803             || *s0 == '-'
22804             || *s0 == '&')
22805         {
22806             op = *s0++;
22807         }
22808
22809         /* If the line is one or two hex digits separated by blank space, its
22810          * a range; otherwise it is either another user-defined property or an
22811          * error */
22812
22813         s = s0;
22814
22815         if (! isXDIGIT(*s)) {
22816             goto check_if_property;
22817         }
22818
22819         do { /* Each new hex digit will add 4 bits. */
22820             if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
22821                 s = strchr(s, '\n');
22822                 if (s == NULL) {
22823                     s = e;
22824                 }
22825                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22826                 sv_catpv(msg, overflow_msg);
22827                 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22828                                      UTF8fARG(is_contents_utf8, s - s0, s0));
22829                 sv_catpvs(msg, "\"");
22830                 goto return_failure;
22831             }
22832
22833             /* Accumulate this digit into the value */
22834             min = (min << 4) + READ_XDIGIT(s);
22835         } while (isXDIGIT(*s));
22836
22837         while (isBLANK(*s)) { s++; }
22838
22839         /* We allow comments at the end of the line */
22840         if (*s == '#') {
22841             s = strchr(s, '\n');
22842             if (s == NULL) {
22843                 s = e;
22844             }
22845             s++;
22846         }
22847         else if (s < e && *s != '\n') {
22848             if (! isXDIGIT(*s)) {
22849                 goto check_if_property;
22850             }
22851
22852             /* Look for the high point of the range */
22853             max = 0;
22854             do {
22855                 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
22856                     s = strchr(s, '\n');
22857                     if (s == NULL) {
22858                         s = e;
22859                     }
22860                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22861                     sv_catpv(msg, overflow_msg);
22862                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22863                                       UTF8fARG(is_contents_utf8, s - s0, s0));
22864                     sv_catpvs(msg, "\"");
22865                     goto return_failure;
22866                 }
22867
22868                 max = (max << 4) + READ_XDIGIT(s);
22869             } while (isXDIGIT(*s));
22870
22871             while (isBLANK(*s)) { s++; }
22872
22873             if (*s == '#') {
22874                 s = strchr(s, '\n');
22875                 if (s == NULL) {
22876                     s = e;
22877                 }
22878             }
22879             else if (s < e && *s != '\n') {
22880                 goto check_if_property;
22881             }
22882         }
22883
22884         if (max == -1) {    /* The line only had one entry */
22885             max = min;
22886         }
22887         else if (max < min) {
22888             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22889             sv_catpvs(msg, "Illegal range in \"");
22890             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22891                                 UTF8fARG(is_contents_utf8, s - s0, s0));
22892             sv_catpvs(msg, "\"");
22893             goto return_failure;
22894         }
22895
22896 #if 0   /* See explanation at definition above of get_extended_utf8_msg() */
22897
22898         if (   UNICODE_IS_PERL_EXTENDED(min)
22899             || UNICODE_IS_PERL_EXTENDED(max))
22900         {
22901             if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22902
22903             /* If both code points are non-portable, warn only on the lower
22904              * one. */
22905             sv_catpv(msg, get_extended_utf8_msg(
22906                                             (UNICODE_IS_PERL_EXTENDED(min))
22907                                             ? min : max));
22908             sv_catpvs(msg, " in \"");
22909             Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22910                                  UTF8fARG(is_contents_utf8, s - s0, s0));
22911             sv_catpvs(msg, "\"");
22912         }
22913
22914 #endif
22915
22916         /* Here, this line contains a legal range */
22917         this_definition = sv_2mortal(_new_invlist(2));
22918         this_definition = _add_range_to_invlist(this_definition, min, max);
22919         goto calculate;
22920
22921       check_if_property:
22922
22923         /* Here it isn't a legal range line.  See if it is a legal property
22924          * line.  First find the end of the meat of the line */
22925         s = strpbrk(s, "#\n");
22926         if (s == NULL) {
22927             s = e;
22928         }
22929
22930         /* Ignore trailing blanks in keeping with the requirements of
22931          * parse_uniprop_string() */
22932         s--;
22933         while (s > s0 && isBLANK_A(*s)) {
22934             s--;
22935         }
22936         s++;
22937
22938         this_definition = parse_uniprop_string(s0, s - s0,
22939                                                is_utf8, to_fold, runtime,
22940                                                deferrable,
22941                                                user_defined_ptr, msg,
22942                                                (name_len == 0)
22943                                                 ? level /* Don't increase level
22944                                                            if input is empty */
22945                                                 : level + 1
22946                                               );
22947         if (this_definition == NULL) {
22948             goto return_failure;    /* 'msg' should have had the reason
22949                                        appended to it by the above call */
22950         }
22951
22952         if (! is_invlist(this_definition)) {    /* Unknown at this time */
22953             return newSVsv(this_definition);
22954         }
22955
22956         if (*s != '\n') {
22957             s = strchr(s, '\n');
22958             if (s == NULL) {
22959                 s = e;
22960             }
22961         }
22962
22963       calculate:
22964
22965         switch (op) {
22966             case '+':
22967                 _invlist_union(running_definition, this_definition,
22968                                                         &running_definition);
22969                 break;
22970             case '-':
22971                 _invlist_subtract(running_definition, this_definition,
22972                                                         &running_definition);
22973                 break;
22974             case '&':
22975                 _invlist_intersection(running_definition, this_definition,
22976                                                         &running_definition);
22977                 break;
22978             case '!':
22979                 _invlist_union_complement_2nd(running_definition,
22980                                         this_definition, &running_definition);
22981                 break;
22982             default:
22983                 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
22984                                  __FILE__, __LINE__, op);
22985                 break;
22986         }
22987
22988         /* Position past the '\n' */
22989         s0 = s + 1;
22990     }   /* End of loop through the lines of 'contents' */
22991
22992     /* Here, we processed all the lines in 'contents' without error.  If we
22993      * didn't add any warnings, simply return success */
22994     if (msgs_length_on_entry == SvCUR(msg)) {
22995
22996         /* If the expansion was empty, the answer isn't nothing: its an empty
22997          * inversion list */
22998         if (running_definition == NULL) {
22999             running_definition = _new_invlist(1);
23000         }
23001
23002         return running_definition;
23003     }
23004
23005     /* Otherwise, add some explanatory text, but we will return success */
23006     goto return_msg;
23007
23008   return_failure:
23009     running_definition = NULL;
23010
23011   return_msg:
23012
23013     if (name_len > 0) {
23014         sv_catpvs(msg, " in expansion of ");
23015         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23016     }
23017
23018     return running_definition;
23019 }
23020
23021 /* As explained below, certain operations need to take place in the first
23022  * thread created.  These macros switch contexts */
23023 #ifdef USE_ITHREADS
23024 #  define DECLARATION_FOR_GLOBAL_CONTEXT                                    \
23025                                         PerlInterpreter * save_aTHX = aTHX;
23026 #  define SWITCH_TO_GLOBAL_CONTEXT                                          \
23027                            PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23028 #  define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
23029 #  define CUR_CONTEXT      aTHX
23030 #  define ORIGINAL_CONTEXT save_aTHX
23031 #else
23032 #  define DECLARATION_FOR_GLOBAL_CONTEXT
23033 #  define SWITCH_TO_GLOBAL_CONTEXT          NOOP
23034 #  define RESTORE_CONTEXT                   NOOP
23035 #  define CUR_CONTEXT                       NULL
23036 #  define ORIGINAL_CONTEXT                  NULL
23037 #endif
23038
23039 STATIC void
23040 S_delete_recursion_entry(pTHX_ void *key)
23041 {
23042     /* Deletes the entry used to detect recursion when expanding user-defined
23043      * properties.  This is a function so it can be set up to be called even if
23044      * the program unexpectedly quits */
23045
23046     dVAR;
23047     SV ** current_entry;
23048     const STRLEN key_len = strlen((const char *) key);
23049     DECLARATION_FOR_GLOBAL_CONTEXT;
23050
23051     SWITCH_TO_GLOBAL_CONTEXT;
23052
23053     /* If the entry is one of these types, it is a permanent entry, and not the
23054      * one used to detect recursions.  This function should delete only the
23055      * recursion entry */
23056     current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23057     if (     current_entry
23058         && ! is_invlist(*current_entry)
23059         && ! SvPOK(*current_entry))
23060     {
23061         (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23062                                                                     G_DISCARD);
23063     }
23064
23065     RESTORE_CONTEXT;
23066 }
23067
23068 STATIC SV *
23069 S_get_fq_name(pTHX_
23070               const char * const name,    /* The first non-blank in the \p{}, \P{} */
23071               const Size_t name_len,      /* Its length in bytes, not including any trailing space */
23072               const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23073               const bool has_colon_colon
23074              )
23075 {
23076     /* Returns a mortal SV containing the fully qualified version of the input
23077      * name */
23078
23079     SV * fq_name;
23080
23081     fq_name = newSVpvs_flags("", SVs_TEMP);
23082
23083     /* Use the current package if it wasn't included in our input */
23084     if (! has_colon_colon) {
23085         const HV * pkg = (IN_PERL_COMPILETIME)
23086                          ? PL_curstash
23087                          : CopSTASH(PL_curcop);
23088         const char* pkgname = HvNAME(pkg);
23089
23090         Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23091                       UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23092         sv_catpvs(fq_name, "::");
23093     }
23094
23095     Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23096                          UTF8fARG(is_utf8, name_len, name));
23097     return fq_name;
23098 }
23099
23100 SV *
23101 Perl_parse_uniprop_string(pTHX_
23102
23103     /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
23104      * now.  If so, the return is an inversion list.
23105      *
23106      * If the property is user-defined, it is a subroutine, which in turn
23107      * may call other subroutines.  This function will call the whole nest of
23108      * them to get the definition they return; if some aren't known at the time
23109      * of the call to this function, the fully qualified name of the highest
23110      * level sub is returned.  It is an error to call this function at runtime
23111      * without every sub defined.
23112      *
23113      * If an error was found, NULL is returned, and 'msg' gets a suitable
23114      * message appended to it.  (Appending allows the back trace of how we got
23115      * to the faulty definition to be displayed through nested calls of
23116      * user-defined subs.)
23117      *
23118      * The caller should NOT try to free any returned inversion list.
23119      *
23120      * Other parameters will be set on return as described below */
23121
23122     const char * const name,    /* The first non-blank in the \p{}, \P{} */
23123     Size_t name_len,            /* Its length in bytes, not including any
23124                                    trailing space */
23125     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
23126     const bool to_fold,         /* ? Is this under /i */
23127     const bool runtime,         /* TRUE if this is being called at run time */
23128     const bool deferrable,      /* TRUE if it's ok for the definition to not be
23129                                    known at this call */
23130     bool *user_defined_ptr,     /* Upon return from this function it will be
23131                                    set to TRUE if any component is a
23132                                    user-defined property */
23133     SV * msg,                   /* Any error or warning msg(s) are appended to
23134                                    this */
23135    const STRLEN level)          /* Recursion level of this call */
23136 {
23137     dVAR;
23138     char* lookup_name;          /* normalized name for lookup in our tables */
23139     unsigned lookup_len;        /* Its length */
23140     bool stricter = FALSE;      /* Some properties have stricter name
23141                                    normalization rules, which we decide upon
23142                                    based on parsing */
23143
23144     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23145      * (though it requires extra effort to download them from Unicode and
23146      * compile perl to know about them) */
23147     bool is_nv_type = FALSE;
23148
23149     unsigned int i, j = 0;
23150     int equals_pos = -1;    /* Where the '=' is found, or negative if none */
23151     int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
23152     int table_index = 0;    /* The entry number for this property in the table
23153                                of all Unicode property names */
23154     bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
23155     Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
23156                                    the normalized name in certain situations */
23157     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
23158                                    part of a package name */
23159     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
23160                                              property rather than a Unicode
23161                                              one. */
23162     SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
23163                                      if an error.  If it is an inversion list,
23164                                      it is the definition.  Otherwise it is a
23165                                      string containing the fully qualified sub
23166                                      name of 'name' */
23167     SV * fq_name = NULL;        /* For user-defined properties, the fully
23168                                    qualified name */
23169     bool invert_return = FALSE; /* ? Do we need to complement the result before
23170                                      returning it */
23171     bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23172                                        explicit utf8:: package that we strip
23173                                        off  */
23174
23175     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23176
23177     /* The input will be normalized into 'lookup_name' */
23178     Newx(lookup_name, name_len, char);
23179     SAVEFREEPV(lookup_name);
23180
23181     /* Parse the input. */
23182     for (i = 0; i < name_len; i++) {
23183         char cur = name[i];
23184
23185         /* Most of the characters in the input will be of this ilk, being parts
23186          * of a name */
23187         if (isIDCONT_A(cur)) {
23188
23189             /* Case differences are ignored.  Our lookup routine assumes
23190              * everything is lowercase, so normalize to that */
23191             if (isUPPER_A(cur)) {
23192                 lookup_name[j++] = toLOWER_A(cur);
23193                 continue;
23194             }
23195
23196             if (cur == '_') { /* Don't include these in the normalized name */
23197                 continue;
23198             }
23199
23200             lookup_name[j++] = cur;
23201
23202             /* The first character in a user-defined name must be of this type.
23203              * */
23204             if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23205                 could_be_user_defined = FALSE;
23206             }
23207
23208             continue;
23209         }
23210
23211         /* Here, the character is not something typically in a name,  But these
23212          * two types of characters (and the '_' above) can be freely ignored in
23213          * most situations.  Later it may turn out we shouldn't have ignored
23214          * them, and we have to reparse, but we don't have enough information
23215          * yet to make that decision */
23216         if (cur == '-' || isSPACE_A(cur)) {
23217             could_be_user_defined = FALSE;
23218             continue;
23219         }
23220
23221         /* An equals sign or single colon mark the end of the first part of
23222          * the property name */
23223         if (    cur == '='
23224             || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23225         {
23226             lookup_name[j++] = '='; /* Treat the colon as an '=' */
23227             equals_pos = j; /* Note where it occurred in the input */
23228             could_be_user_defined = FALSE;
23229             break;
23230         }
23231
23232         /* If this looks like it is a marker we inserted at compile time,
23233          * ignore it; otherwise keep it as it would have been user input. */
23234         if (     UNLIKELY(cur == DEFERRED_PROP_EXPANSION_MARKERc)
23235             && ! deferrable
23236             &&   could_be_user_defined
23237             &&   i == name_len - 1)
23238         {
23239             name_len--;
23240             continue;
23241         }
23242
23243         /* Otherwise, this character is part of the name. */
23244         lookup_name[j++] = cur;
23245
23246         /* Here it isn't a single colon, so if it is a colon, it must be a
23247          * double colon */
23248         if (cur == ':') {
23249
23250             /* A double colon should be a package qualifier.  We note its
23251              * position and continue.  Note that one could have
23252              *      pkg1::pkg2::...::foo
23253              * so that the position at the end of the loop will be just after
23254              * the final qualifier */
23255
23256             i++;
23257             non_pkg_begin = i + 1;
23258             lookup_name[j++] = ':';
23259         }
23260         else { /* Only word chars (and '::') can be in a user-defined name */
23261             could_be_user_defined = FALSE;
23262         }
23263     } /* End of parsing through the lhs of the property name (or all of it if
23264          no rhs) */
23265
23266 #define STRLENs(s)  (sizeof("" s "") - 1)
23267
23268     /* If there is a single package name 'utf8::', it is ambiguous.  It could
23269      * be for a user-defined property, or it could be a Unicode property, as
23270      * all of them are considered to be for that package.  For the purposes of
23271      * parsing the rest of the property, strip it off */
23272     if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23273         lookup_name +=  STRLENs("utf8::");
23274         j -=  STRLENs("utf8::");
23275         equals_pos -=  STRLENs("utf8::");
23276         stripped_utf8_pkg = TRUE;
23277     }
23278
23279     /* Here, we are either done with the whole property name, if it was simple;
23280      * or are positioned just after the '=' if it is compound. */
23281
23282     if (equals_pos >= 0) {
23283         assert(! stricter); /* We shouldn't have set this yet */
23284
23285         /* Space immediately after the '=' is ignored */
23286         i++;
23287         for (; i < name_len; i++) {
23288             if (! isSPACE_A(name[i])) {
23289                 break;
23290             }
23291         }
23292
23293         /* Most punctuation after the equals indicates a subpattern, like
23294          * \p{foo=/bar/} */
23295         if (   isPUNCT_A(name[i])
23296             &&  name[i] != '-'
23297             &&  name[i] != '+'
23298             &&  name[i] != '_'
23299             &&  name[i] != '{'
23300                 /* A backslash means the real delimitter is the next character,
23301                  * but it must be punctuation */
23302             && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
23303         {
23304             /* Find the property.  The table includes the equals sign, so we
23305              * use 'j' as-is */
23306             table_index = match_uniprop((U8 *) lookup_name, j);
23307             if (table_index) {
23308                 const char * const * prop_values
23309                                             = UNI_prop_value_ptrs[table_index];
23310                 SV * subpattern;
23311                 Size_t subpattern_len;
23312                 REGEXP * subpattern_re;
23313                 char open = name[i++];
23314                 char close;
23315                 const char * pos_in_brackets;
23316                 bool escaped = 0;
23317
23318                 /* Backslash => delimitter is the character following.  We
23319                  * already checked that it is punctuation */
23320                 if (open == '\\') {
23321                     open = name[i++];
23322                     escaped = 1;
23323                 }
23324
23325                 /* This data structure is constructed so that the matching
23326                  * closing bracket is 3 past its matching opening.  The second
23327                  * set of closing is so that if the opening is something like
23328                  * ']', the closing will be that as well.  Something similar is
23329                  * done in toke.c */
23330                 pos_in_brackets = strchr("([<)]>)]>", open);
23331                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23332
23333                 if (    i >= name_len
23334                     ||  name[name_len-1] != close
23335                     || (escaped && name[name_len-2] != '\\'))
23336                 {
23337                     sv_catpvs(msg, "Unicode property wildcard not terminated");
23338                     goto append_name_to_msg;
23339                 }
23340
23341                 Perl_ck_warner_d(aTHX_
23342                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23343                     "The Unicode property wildcards feature is experimental");
23344
23345                 /* Now create and compile the wildcard subpattern.  Use /iaa
23346                  * because nothing outside of ASCII will match, and it the
23347                  * property values should all match /i.  Note that when the
23348                  * pattern fails to compile, our added text to the user's
23349                  * pattern will be displayed to the user, which is not so
23350                  * desirable. */
23351                 subpattern_len = name_len - i - 1 - escaped;
23352                 subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
23353                                               (unsigned) subpattern_len,
23354                                               name + i);
23355                 subpattern = sv_2mortal(subpattern);
23356                 subpattern_re = re_compile(subpattern, 0);
23357                 assert(subpattern_re);  /* Should have died if didn't compile
23358                                          successfully */
23359
23360                 /* For each legal property value, see if the supplied pattern
23361                  * matches it. */
23362                 while (*prop_values) {
23363                     const char * const entry = *prop_values;
23364                     const Size_t len = strlen(entry);
23365                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23366
23367                     if (pregexec(subpattern_re,
23368                                  (char *) entry,
23369                                  (char *) entry + len,
23370                                  (char *) entry, 0,
23371                                  entry_sv,
23372                                  0))
23373                     { /* Here, matched.  Add to the returned list */
23374                         Size_t total_len = j + len;
23375                         SV * sub_invlist = NULL;
23376                         char * this_string;
23377
23378                         /* We know this is a legal \p{property=value}.  Call
23379                          * the function to return the list of code points that
23380                          * match it */
23381                         Newxz(this_string, total_len + 1, char);
23382                         Copy(lookup_name, this_string, j, char);
23383                         my_strlcat(this_string, entry, total_len + 1);
23384                         SAVEFREEPV(this_string);
23385                         sub_invlist = parse_uniprop_string(this_string,
23386                                                            total_len,
23387                                                            is_utf8,
23388                                                            to_fold,
23389                                                            runtime,
23390                                                            deferrable,
23391                                                            user_defined_ptr,
23392                                                            msg,
23393                                                            level + 1);
23394                         _invlist_union(prop_definition, sub_invlist,
23395                                        &prop_definition);
23396                     }
23397
23398                     prop_values++;  /* Next iteration, look at next propvalue */
23399                 } /* End of looking through property values; (the data
23400                      structure is terminated by a NULL ptr) */
23401
23402                 SvREFCNT_dec_NN(subpattern_re);
23403
23404                 if (prop_definition) {
23405                     return prop_definition;
23406                 }
23407
23408                 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23409                 goto append_name_to_msg;
23410             }
23411
23412             /* Here's how khw thinks we should proceed to handle the properties
23413              * not yet done:    Bidi Mirroring Glyph
23414                                 Bidi Paired Bracket
23415                                 Case Folding  (both full and simple)
23416                                 Decomposition Mapping
23417                                 Equivalent Unified Ideograph
23418                                 Name
23419                                 Name Alias
23420                                 Lowercase Mapping  (both full and simple)
23421                                 NFKC Case Fold
23422                                 Titlecase Mapping  (both full and simple)
23423                                 Uppercase Mapping  (both full and simple)
23424              * Move the part that looks at the property values into a perl
23425              * script, like utf8_heavy.pl was done.  This makes things somewhat
23426              * easier, but most importantly, it avoids always adding all these
23427              * strings to the memory usage when the feature is little-used.
23428              *
23429              * The property values would all be concatenated into a single
23430              * string per property with each value on a separate line, and the
23431              * code point it's for on alternating lines.  Then we match the
23432              * user's input pattern m//mg, without having to worry about their
23433              * uses of '^' and '$'.  Only the values that aren't the default
23434              * would be in the strings.  Code points would be in UTF-8.  The
23435              * search pattern that we would construct would look like
23436              * (?: \n (code-point_re) \n (?aam: user-re ) \n )
23437              * And so $1 would contain the code point that matched the user-re.
23438              * For properties where the default is the code point itself, such
23439              * as any of the case changing mappings, the string would otherwise
23440              * consist of all Unicode code points in UTF-8 strung together.
23441              * This would be impractical.  So instead, examine their compiled
23442              * pattern, looking at the ssc.  If none, reject the pattern as an
23443              * error.  Otherwise run the pattern against every code point in
23444              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
23445              * And it might be good to create an API to return the ssc.
23446              *
23447              * For the name properties, a new function could be created in
23448              * charnames which essentially does the same thing as above,
23449              * sharing Name.pl with the other charname functions.  Don't know
23450              * about loose name matching, or algorithmically determined names.
23451              * Decomposition.pl similarly.
23452              *
23453              * It might be that a new pattern modifier would have to be
23454              * created, like /t for resTricTed, which changed the behavior of
23455              * some constructs in their subpattern, like \A. */
23456         } /* End of is a wildcard subppattern */
23457
23458
23459         /* Certain properties whose values are numeric need special handling.
23460          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
23461          * purposes of checking if this is one of those properties */
23462         if (memBEGINPs(lookup_name, j, "is")) {
23463             lookup_offset = 2;
23464         }
23465
23466         /* Then check if it is one of these specially-handled properties.  The
23467          * possibilities are hard-coded because easier this way, and the list
23468          * is unlikely to change.
23469          *
23470          * All numeric value type properties are of this ilk, and are also
23471          * special in a different way later on.  So find those first.  There
23472          * are several numeric value type properties in the Unihan DB (which is
23473          * unlikely to be compiled with perl, but we handle it here in case it
23474          * does get compiled).  They all end with 'numeric'.  The interiors
23475          * aren't checked for the precise property.  This would stop working if
23476          * a cjk property were to be created that ended with 'numeric' and
23477          * wasn't a numeric type */
23478         is_nv_type = memEQs(lookup_name + lookup_offset,
23479                        j - 1 - lookup_offset, "numericvalue")
23480                   || memEQs(lookup_name + lookup_offset,
23481                       j - 1 - lookup_offset, "nv")
23482                   || (   memENDPs(lookup_name + lookup_offset,
23483                             j - 1 - lookup_offset, "numeric")
23484                       && (   memBEGINPs(lookup_name + lookup_offset,
23485                                       j - 1 - lookup_offset, "cjk")
23486                           || memBEGINPs(lookup_name + lookup_offset,
23487                                       j - 1 - lookup_offset, "k")));
23488         if (   is_nv_type
23489             || memEQs(lookup_name + lookup_offset,
23490                       j - 1 - lookup_offset, "canonicalcombiningclass")
23491             || memEQs(lookup_name + lookup_offset,
23492                       j - 1 - lookup_offset, "ccc")
23493             || memEQs(lookup_name + lookup_offset,
23494                       j - 1 - lookup_offset, "age")
23495             || memEQs(lookup_name + lookup_offset,
23496                       j - 1 - lookup_offset, "in")
23497             || memEQs(lookup_name + lookup_offset,
23498                       j - 1 - lookup_offset, "presentin"))
23499         {
23500             unsigned int k;
23501
23502             /* Since the stuff after the '=' is a number, we can't throw away
23503              * '-' willy-nilly, as those could be a minus sign.  Other stricter
23504              * rules also apply.  However, these properties all can have the
23505              * rhs not be a number, in which case they contain at least one
23506              * alphabetic.  In those cases, the stricter rules don't apply.
23507              * But the numeric type properties can have the alphas [Ee] to
23508              * signify an exponent, and it is still a number with stricter
23509              * rules.  So look for an alpha that signifies not-strict */
23510             stricter = TRUE;
23511             for (k = i; k < name_len; k++) {
23512                 if (   isALPHA_A(name[k])
23513                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
23514                 {
23515                     stricter = FALSE;
23516                     break;
23517                 }
23518             }
23519         }
23520
23521         if (stricter) {
23522
23523             /* A number may have a leading '+' or '-'.  The latter is retained
23524              * */
23525             if (name[i] == '+') {
23526                 i++;
23527             }
23528             else if (name[i] == '-') {
23529                 lookup_name[j++] = '-';
23530                 i++;
23531             }
23532
23533             /* Skip leading zeros including single underscores separating the
23534              * zeros, or between the final leading zero and the first other
23535              * digit */
23536             for (; i < name_len - 1; i++) {
23537                 if (    name[i] != '0'
23538                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23539                 {
23540                     break;
23541                 }
23542             }
23543         }
23544     }
23545     else {  /* No '=' */
23546
23547        /* Only a few properties without an '=' should be parsed with stricter
23548         * rules.  The list is unlikely to change. */
23549         if (   memBEGINPs(lookup_name, j, "perl")
23550             && memNEs(lookup_name + 4, j - 4, "space")
23551             && memNEs(lookup_name + 4, j - 4, "word"))
23552         {
23553             stricter = TRUE;
23554
23555             /* We set the inputs back to 0 and the code below will reparse,
23556              * using strict */
23557             i = j = 0;
23558         }
23559     }
23560
23561     /* Here, we have either finished the property, or are positioned to parse
23562      * the remainder, and we know if stricter rules apply.  Finish out, if not
23563      * already done */
23564     for (; i < name_len; i++) {
23565         char cur = name[i];
23566
23567         /* In all instances, case differences are ignored, and we normalize to
23568          * lowercase */
23569         if (isUPPER_A(cur)) {
23570             lookup_name[j++] = toLOWER(cur);
23571             continue;
23572         }
23573
23574         /* An underscore is skipped, but not under strict rules unless it
23575          * separates two digits */
23576         if (cur == '_') {
23577             if (    stricter
23578                 && (     i == 0 || (int) i == equals_pos || i == name_len- 1
23579                     || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
23580             {
23581                 lookup_name[j++] = '_';
23582             }
23583             continue;
23584         }
23585
23586         /* Hyphens are skipped except under strict */
23587         if (cur == '-' && ! stricter) {
23588             continue;
23589         }
23590
23591         /* XXX Bug in documentation.  It says white space skipped adjacent to
23592          * non-word char.  Maybe we should, but shouldn't skip it next to a dot
23593          * in a number */
23594         if (isSPACE_A(cur) && ! stricter) {
23595             continue;
23596         }
23597
23598         lookup_name[j++] = cur;
23599
23600         /* Unless this is a non-trailing slash, we are done with it */
23601         if (i >= name_len - 1 || cur != '/') {
23602             continue;
23603         }
23604
23605         slash_pos = j;
23606
23607         /* A slash in the 'numeric value' property indicates that what follows
23608          * is a denominator.  It can have a leading '+' and '0's that should be
23609          * skipped.  But we have never allowed a negative denominator, so treat
23610          * a minus like every other character.  (No need to rule out a second
23611          * '/', as that won't match anything anyway */
23612         if (is_nv_type) {
23613             i++;
23614             if (i < name_len && name[i] == '+') {
23615                 i++;
23616             }
23617
23618             /* Skip leading zeros including underscores separating digits */
23619             for (; i < name_len - 1; i++) {
23620                 if (   name[i] != '0'
23621                     && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23622                 {
23623                     break;
23624                 }
23625             }
23626
23627             /* Store the first real character in the denominator */
23628             if (i < name_len) {
23629                 lookup_name[j++] = name[i];
23630             }
23631         }
23632     }
23633
23634     /* Here are completely done parsing the input 'name', and 'lookup_name'
23635      * contains a copy, normalized.
23636      *
23637      * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
23638      * different from without the underscores.  */
23639     if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
23640            || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
23641         && UNLIKELY(name[name_len-1] == '_'))
23642     {
23643         lookup_name[j++] = '&';
23644     }
23645
23646     /* If the original input began with 'In' or 'Is', it could be a subroutine
23647      * call to a user-defined property instead of a Unicode property name. */
23648     if (    name_len - non_pkg_begin > 2
23649         &&  name[non_pkg_begin+0] == 'I'
23650         && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
23651     {
23652         /* Names that start with In have different characterstics than those
23653          * that start with Is */
23654         if (name[non_pkg_begin+1] == 's') {
23655             starts_with_Is = TRUE;
23656         }
23657     }
23658     else {
23659         could_be_user_defined = FALSE;
23660     }
23661
23662     if (could_be_user_defined) {
23663         CV* user_sub;
23664
23665         /* If the user defined property returns the empty string, it could
23666          * easily be because the pattern is being compiled before the data it
23667          * actually needs to compile is available.  This could be argued to be
23668          * a bug in the perl code, but this is a change of behavior for Perl,
23669          * so we handle it.  This means that intentionally returning nothing
23670          * will not be resolved until runtime */
23671         bool empty_return = FALSE;
23672
23673         /* Here, the name could be for a user defined property, which are
23674          * implemented as subs. */
23675         user_sub = get_cvn_flags(name, name_len, 0);
23676         if (! user_sub) {
23677
23678             /* Here, the property name could be a user-defined one, but there
23679              * is no subroutine to handle it (as of now).   Defer handling it
23680              * until runtime.  Otherwise, a block defined by Unicode in a later
23681              * release would get the synonym InFoo added for it, and existing
23682              * code that used that name would suddenly break if it referred to
23683              * the property before the sub was declared.  See [perl #134146] */
23684             if (deferrable) {
23685                 goto definition_deferred;
23686             }
23687
23688             /* If we haven't already stripped the package name (if one), do so
23689              * now so can look for an official property with the stripped name.
23690              * */
23691             if (! stripped_utf8_pkg) {
23692                 lookup_name += non_pkg_begin;
23693                 j -= non_pkg_begin;
23694             }
23695
23696             /* Drop down to look up in the official properties */
23697         }
23698         else {
23699             const char insecure[] = "Insecure user-defined property";
23700
23701             /* Here, there is a sub by the correct name.  Normally we call it
23702              * to get the property definition */
23703             dSP;
23704             SV * user_sub_sv = MUTABLE_SV(user_sub);
23705             SV * error;     /* Any error returned by calling 'user_sub' */
23706             SV * key;       /* The key into the hash of user defined sub names
23707                              */
23708             SV * placeholder;
23709             SV ** saved_user_prop_ptr;      /* Hash entry for this property */
23710
23711             /* How many times to retry when another thread is in the middle of
23712              * expanding the same definition we want */
23713             PERL_INT_FAST8_T retry_countdown = 10;
23714
23715             DECLARATION_FOR_GLOBAL_CONTEXT;
23716
23717             /* If we get here, we know this property is user-defined */
23718             *user_defined_ptr = TRUE;
23719
23720             /* We refuse to call a potentially tainted subroutine; returning an
23721              * error instead */
23722             if (TAINT_get) {
23723                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23724                 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23725                 goto append_name_to_msg;
23726             }
23727
23728             /* In principal, we only call each subroutine property definition
23729              * once during the life of the program.  This guarantees that the
23730              * property definition never changes.  The results of the single
23731              * sub call are stored in a hash, which is used instead for future
23732              * references to this property.  The property definition is thus
23733              * immutable.  But, to allow the user to have a /i-dependent
23734              * definition, we call the sub once for non-/i, and once for /i,
23735              * should the need arise, passing the /i status as a parameter.
23736              *
23737              * We start by constructing the hash key name, consisting of the
23738              * fully qualified subroutine name, preceded by the /i status, so
23739              * that there is a key for /i and a different key for non-/i */
23740             key = newSVpvn(((to_fold) ? "1" : "0"), 1);
23741             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
23742                                           non_pkg_begin != 0);
23743             sv_catsv(key, fq_name);
23744             sv_2mortal(key);
23745
23746             /* We only call the sub once throughout the life of the program
23747              * (with the /i, non-/i exception noted above).  That means the
23748              * hash must be global and accessible to all threads.  It is
23749              * created at program start-up, before any threads are created, so
23750              * is accessible to all children.  But this creates some
23751              * complications.
23752              *
23753              * 1) The keys can't be shared, or else problems arise; sharing is
23754              *    turned off at hash creation time
23755              * 2) All SVs in it are there for the remainder of the life of the
23756              *    program, and must be created in the same interpreter context
23757              *    as the hash, or else they will be freed from the wrong pool
23758              *    at global destruction time.  This is handled by switching to
23759              *    the hash's context to create each SV going into it, and then
23760              *    immediately switching back
23761              * 3) All accesses to the hash must be controlled by a mutex, to
23762              *    prevent two threads from getting an unstable state should
23763              *    they simultaneously be accessing it.  The code below is
23764              *    crafted so that the mutex is locked whenever there is an
23765              *    access and unlocked only when the next stable state is
23766              *    achieved.
23767              *
23768              * The hash stores either the definition of the property if it was
23769              * valid, or, if invalid, the error message that was raised.  We
23770              * use the type of SV to distinguish.
23771              *
23772              * There's also the need to guard against the definition expansion
23773              * from infinitely recursing.  This is handled by storing the aTHX
23774              * of the expanding thread during the expansion.  Again the SV type
23775              * is used to distinguish this from the other two cases.  If we
23776              * come to here and the hash entry for this property is our aTHX,
23777              * it means we have recursed, and the code assumes that we would
23778              * infinitely recurse, so instead stops and raises an error.
23779              * (Any recursion has always been treated as infinite recursion in
23780              * this feature.)
23781              *
23782              * If instead, the entry is for a different aTHX, it means that
23783              * that thread has gotten here first, and hasn't finished expanding
23784              * the definition yet.  We just have to wait until it is done.  We
23785              * sleep and retry a few times, returning an error if the other
23786              * thread doesn't complete. */
23787
23788           re_fetch:
23789             USER_PROP_MUTEX_LOCK;
23790
23791             /* If we have an entry for this key, the subroutine has already
23792              * been called once with this /i status. */
23793             saved_user_prop_ptr = hv_fetch(PL_user_def_props,
23794                                                    SvPVX(key), SvCUR(key), 0);
23795             if (saved_user_prop_ptr) {
23796
23797                 /* If the saved result is an inversion list, it is the valid
23798                  * definition of this property */
23799                 if (is_invlist(*saved_user_prop_ptr)) {
23800                     prop_definition = *saved_user_prop_ptr;
23801
23802                     /* The SV in the hash won't be removed until global
23803                      * destruction, so it is stable and we can unlock */
23804                     USER_PROP_MUTEX_UNLOCK;
23805
23806                     /* The caller shouldn't try to free this SV */
23807                     return prop_definition;
23808                 }
23809
23810                 /* Otherwise, if it is a string, it is the error message
23811                  * that was returned when we first tried to evaluate this
23812                  * property.  Fail, and append the message */
23813                 if (SvPOK(*saved_user_prop_ptr)) {
23814                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23815                     sv_catsv(msg, *saved_user_prop_ptr);
23816
23817                     /* The SV in the hash won't be removed until global
23818                      * destruction, so it is stable and we can unlock */
23819                     USER_PROP_MUTEX_UNLOCK;
23820
23821                     return NULL;
23822                 }
23823
23824                 assert(SvIOK(*saved_user_prop_ptr));
23825
23826                 /* Here, we have an unstable entry in the hash.  Either another
23827                  * thread is in the middle of expanding the property's
23828                  * definition, or we are ourselves recursing.  We use the aTHX
23829                  * in it to distinguish */
23830                 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
23831
23832                     /* Here, it's another thread doing the expanding.  We've
23833                      * looked as much as we are going to at the contents of the
23834                      * hash entry.  It's safe to unlock. */
23835                     USER_PROP_MUTEX_UNLOCK;
23836
23837                     /* Retry a few times */
23838                     if (retry_countdown-- > 0) {
23839                         PerlProc_sleep(1);
23840                         goto re_fetch;
23841                     }
23842
23843                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23844                     sv_catpvs(msg, "Timeout waiting for another thread to "
23845                                    "define");
23846                     goto append_name_to_msg;
23847                 }
23848
23849                 /* Here, we are recursing; don't dig any deeper */
23850                 USER_PROP_MUTEX_UNLOCK;
23851
23852                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23853                 sv_catpvs(msg,
23854                           "Infinite recursion in user-defined property");
23855                 goto append_name_to_msg;
23856             }
23857
23858             /* Here, this thread has exclusive control, and there is no entry
23859              * for this property in the hash.  So we have the go ahead to
23860              * expand the definition ourselves. */
23861
23862             PUSHSTACKi(PERLSI_MAGIC);
23863             ENTER;
23864
23865             /* Create a temporary placeholder in the hash to detect recursion
23866              * */
23867             SWITCH_TO_GLOBAL_CONTEXT;
23868             placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
23869             (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
23870             RESTORE_CONTEXT;
23871
23872             /* Now that we have a placeholder, we can let other threads
23873              * continue */
23874             USER_PROP_MUTEX_UNLOCK;
23875
23876             /* Make sure the placeholder always gets destroyed */
23877             SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
23878
23879             PUSHMARK(SP);
23880             SAVETMPS;
23881
23882             /* Call the user's function, with the /i status as a parameter.
23883              * Note that we have gone to a lot of trouble to keep this call
23884              * from being within the locked mutex region. */
23885             XPUSHs(boolSV(to_fold));
23886             PUTBACK;
23887
23888             /* The following block was taken from swash_init().  Presumably
23889              * they apply to here as well, though we no longer use a swash --
23890              * khw */
23891             SAVEHINTS();
23892             save_re_context();
23893             /* We might get here via a subroutine signature which uses a utf8
23894              * parameter name, at which point PL_subname will have been set
23895              * but not yet used. */
23896             save_item(PL_subname);
23897
23898             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
23899
23900             SPAGAIN;
23901
23902             error = ERRSV;
23903             if (TAINT_get || SvTRUE(error)) {
23904                 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23905                 if (SvTRUE(error)) {
23906                     sv_catpvs(msg, "Error \"");
23907                     sv_catsv(msg, error);
23908                     sv_catpvs(msg, "\"");
23909                 }
23910                 if (TAINT_get) {
23911                     if (SvTRUE(error)) sv_catpvs(msg, "; ");
23912                     sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23913                 }
23914
23915                 if (name_len > 0) {
23916                     sv_catpvs(msg, " in expansion of ");
23917                     Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
23918                                                                   name_len,
23919                                                                   name));
23920                 }
23921
23922                 (void) POPs;
23923                 prop_definition = NULL;
23924             }
23925             else {  /* G_SCALAR guarantees a single return value */
23926                 SV * contents = POPs;
23927
23928                 /* The contents is supposed to be the expansion of the property
23929                  * definition.  If the definition is deferrable, and we got an
23930                  * empty string back, set a flag to later defer it (after clean
23931                  * up below). */
23932                 if (      deferrable
23933                     && (! SvPOK(contents) || SvCUR(contents) == 0))
23934                 {
23935                         empty_return = TRUE;
23936                 }
23937                 else { /* Otherwise, call a function to check for valid syntax,
23938                           and handle it */
23939
23940                     prop_definition = handle_user_defined_property(
23941                                                     name, name_len,
23942                                                     is_utf8, to_fold, runtime,
23943                                                     deferrable,
23944                                                     contents, user_defined_ptr,
23945                                                     msg,
23946                                                     level);
23947                 }
23948             }
23949
23950             /* Here, we have the results of the expansion.  Delete the
23951              * placeholder, and if the definition is now known, replace it with
23952              * that definition.  We need exclusive access to the hash, and we
23953              * can't let anyone else in, between when we delete the placeholder
23954              * and add the permanent entry */
23955             USER_PROP_MUTEX_LOCK;
23956
23957             S_delete_recursion_entry(aTHX_ SvPVX(key));
23958
23959             if (    ! empty_return
23960                 && (! prop_definition || is_invlist(prop_definition)))
23961             {
23962                 /* If we got success we use the inversion list defining the
23963                  * property; otherwise use the error message */
23964                 SWITCH_TO_GLOBAL_CONTEXT;
23965                 (void) hv_store_ent(PL_user_def_props,
23966                                     key,
23967                                     ((prop_definition)
23968                                      ? newSVsv(prop_definition)
23969                                      : newSVsv(msg)),
23970                                     0);
23971                 RESTORE_CONTEXT;
23972             }
23973
23974             /* All done, and the hash now has a permanent entry for this
23975              * property.  Give up exclusive control */
23976             USER_PROP_MUTEX_UNLOCK;
23977
23978             FREETMPS;
23979             LEAVE;
23980             POPSTACK;
23981
23982             if (empty_return) {
23983                 goto definition_deferred;
23984             }
23985
23986             if (prop_definition) {
23987
23988                 /* If the definition is for something not known at this time,
23989                  * we toss it, and go return the main property name, as that's
23990                  * the one the user will be aware of */
23991                 if (! is_invlist(prop_definition)) {
23992                     SvREFCNT_dec_NN(prop_definition);
23993                     goto definition_deferred;
23994                 }
23995
23996                 sv_2mortal(prop_definition);
23997             }
23998
23999             /* And return */
24000             return prop_definition;
24001
24002         }   /* End of calling the subroutine for the user-defined property */
24003     }       /* End of it could be a user-defined property */
24004
24005     /* Here it wasn't a user-defined property that is known at this time.  See
24006      * if it is a Unicode property */
24007
24008     lookup_len = j;     /* This is a more mnemonic name than 'j' */
24009
24010     /* Get the index into our pointer table of the inversion list corresponding
24011      * to the property */
24012     table_index = match_uniprop((U8 *) lookup_name, lookup_len);
24013
24014     /* If it didn't find the property ... */
24015     if (table_index == 0) {
24016
24017         /* Try again stripping off any initial 'Is'.  This is because we
24018          * promise that an initial Is is optional.  The same isn't true of
24019          * names that start with 'In'.  Those can match only blocks, and the
24020          * lookup table already has those accounted for. */
24021         if (starts_with_Is) {
24022             lookup_name += 2;
24023             lookup_len -= 2;
24024             equals_pos -= 2;
24025             slash_pos -= 2;
24026
24027             table_index = match_uniprop((U8 *) lookup_name, lookup_len);
24028         }
24029
24030         if (table_index == 0) {
24031             char * canonical;
24032
24033             /* Here, we didn't find it.  If not a numeric type property, and
24034              * can't be a user-defined one, it isn't a legal property */
24035             if (! is_nv_type) {
24036                 if (! could_be_user_defined) {
24037                     goto failed;
24038                 }
24039
24040                 /* Here, the property name is legal as a user-defined one.   At
24041                  * compile time, it might just be that the subroutine for that
24042                  * property hasn't been encountered yet, but at runtime, it's
24043                  * an error to try to use an undefined one */
24044                 if (! deferrable) {
24045                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24046                     sv_catpvs(msg, "Unknown user-defined property name");
24047                     goto append_name_to_msg;
24048                 }
24049
24050                 goto definition_deferred;
24051             } /* End of isn't a numeric type property */
24052
24053             /* The numeric type properties need more work to decide.  What we
24054              * do is make sure we have the number in canonical form and look
24055              * that up. */
24056
24057             if (slash_pos < 0) {    /* No slash */
24058
24059                 /* When it isn't a rational, take the input, convert it to a
24060                  * NV, then create a canonical string representation of that
24061                  * NV. */
24062
24063                 NV value;
24064                 SSize_t value_len = lookup_len - equals_pos;
24065
24066                 /* Get the value */
24067                 if (   value_len <= 0
24068                     || my_atof3(lookup_name + equals_pos, &value,
24069                                 value_len)
24070                           != lookup_name + lookup_len)
24071                 {
24072                     goto failed;
24073                 }
24074
24075                 /* If the value is an integer, the canonical value is integral
24076                  * */
24077                 if (Perl_ceil(value) == value) {
24078                     canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24079                                             equals_pos, lookup_name, value);
24080                 }
24081                 else {  /* Otherwise, it is %e with a known precision */
24082                     char * exp_ptr;
24083
24084                     canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24085                                                 equals_pos, lookup_name,
24086                                                 PL_E_FORMAT_PRECISION, value);
24087
24088                     /* The exponent generated is expecting two digits, whereas
24089                      * %e on some systems will generate three.  Remove leading
24090                      * zeros in excess of 2 from the exponent.  We start
24091                      * looking for them after the '=' */
24092                     exp_ptr = strchr(canonical + equals_pos, 'e');
24093                     if (exp_ptr) {
24094                         char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24095                         SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24096
24097                         assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24098
24099                         if (excess_exponent_len > 0) {
24100                             SSize_t leading_zeros = strspn(cur_ptr, "0");
24101                             SSize_t excess_leading_zeros
24102                                     = MIN(leading_zeros, excess_exponent_len);
24103                             if (excess_leading_zeros > 0) {
24104                                 Move(cur_ptr + excess_leading_zeros,
24105                                      cur_ptr,
24106                                      strlen(cur_ptr) - excess_leading_zeros
24107                                        + 1,  /* Copy the NUL as well */
24108                                      char);
24109                             }
24110                         }
24111                     }
24112                 }
24113             }
24114             else {  /* Has a slash.  Create a rational in canonical form  */
24115                 UV numerator, denominator, gcd, trial;
24116                 const char * end_ptr;
24117                 const char * sign = "";
24118
24119                 /* We can't just find the numerator, denominator, and do the
24120                  * division, then use the method above, because that is
24121                  * inexact.  And the input could be a rational that is within
24122                  * epsilon (given our precision) of a valid rational, and would
24123                  * then incorrectly compare valid.
24124                  *
24125                  * We're only interested in the part after the '=' */
24126                 const char * this_lookup_name = lookup_name + equals_pos;
24127                 lookup_len -= equals_pos;
24128                 slash_pos -= equals_pos;
24129
24130                 /* Handle any leading minus */
24131                 if (this_lookup_name[0] == '-') {
24132                     sign = "-";
24133                     this_lookup_name++;
24134                     lookup_len--;
24135                     slash_pos--;
24136                 }
24137
24138                 /* Convert the numerator to numeric */
24139                 end_ptr = this_lookup_name + slash_pos;
24140                 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24141                     goto failed;
24142                 }
24143
24144                 /* It better have included all characters before the slash */
24145                 if (*end_ptr != '/') {
24146                     goto failed;
24147                 }
24148
24149                 /* Set to look at just the denominator */
24150                 this_lookup_name += slash_pos;
24151                 lookup_len -= slash_pos;
24152                 end_ptr = this_lookup_name + lookup_len;
24153
24154                 /* Convert the denominator to numeric */
24155                 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24156                     goto failed;
24157                 }
24158
24159                 /* It better be the rest of the characters, and don't divide by
24160                  * 0 */
24161                 if (   end_ptr != this_lookup_name + lookup_len
24162                     || denominator == 0)
24163                 {
24164                     goto failed;
24165                 }
24166
24167                 /* Get the greatest common denominator using
24168                    http://en.wikipedia.org/wiki/Euclidean_algorithm */
24169                 gcd = numerator;
24170                 trial = denominator;
24171                 while (trial != 0) {
24172                     UV temp = trial;
24173                     trial = gcd % trial;
24174                     gcd = temp;
24175                 }
24176
24177                 /* If already in lowest possible terms, we have already tried
24178                  * looking this up */
24179                 if (gcd == 1) {
24180                     goto failed;
24181                 }
24182
24183                 /* Reduce the rational, which should put it in canonical form
24184                  * */
24185                 numerator /= gcd;
24186                 denominator /= gcd;
24187
24188                 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24189                         equals_pos, lookup_name, sign, numerator, denominator);
24190             }
24191
24192             /* Here, we have the number in canonical form.  Try that */
24193             table_index = match_uniprop((U8 *) canonical, strlen(canonical));
24194             if (table_index == 0) {
24195                 goto failed;
24196             }
24197         }   /* End of still didn't find the property in our table */
24198     }       /* End of       didn't find the property in our table */
24199
24200     /* Here, we have a non-zero return, which is an index into a table of ptrs.
24201      * A negative return signifies that the real index is the absolute value,
24202      * but the result needs to be inverted */
24203     if (table_index < 0) {
24204         invert_return = TRUE;
24205         table_index = -table_index;
24206     }
24207
24208     /* Out-of band indices indicate a deprecated property.  The proper index is
24209      * modulo it with the table size.  And dividing by the table size yields
24210      * an offset into a table constructed by regen/mk_invlists.pl to contain
24211      * the corresponding warning message */
24212     if (table_index > MAX_UNI_KEYWORD_INDEX) {
24213         Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24214         table_index %= MAX_UNI_KEYWORD_INDEX;
24215         Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24216                 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24217                 (int) name_len, name, deprecated_property_msgs[warning_offset]);
24218     }
24219
24220     /* In a few properties, a different property is used under /i.  These are
24221      * unlikely to change, so are hard-coded here. */
24222     if (to_fold) {
24223         if (   table_index == UNI_XPOSIXUPPER
24224             || table_index == UNI_XPOSIXLOWER
24225             || table_index == UNI_TITLE)
24226         {
24227             table_index = UNI_CASED;
24228         }
24229         else if (   table_index == UNI_UPPERCASELETTER
24230                  || table_index == UNI_LOWERCASELETTER
24231 #  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
24232                  || table_index == UNI_TITLECASELETTER
24233 #  endif
24234         ) {
24235             table_index = UNI_CASEDLETTER;
24236         }
24237         else if (  table_index == UNI_POSIXUPPER
24238                 || table_index == UNI_POSIXLOWER)
24239         {
24240             table_index = UNI_POSIXALPHA;
24241         }
24242     }
24243
24244     /* Create and return the inversion list */
24245     prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]);
24246     sv_2mortal(prop_definition);
24247
24248
24249     /* See if there is a private use override to add to this definition */
24250     {
24251         COPHH * hinthash = (IN_PERL_COMPILETIME)
24252                            ? CopHINTHASH_get(&PL_compiling)
24253                            : CopHINTHASH_get(PL_curcop);
24254         SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24255
24256         if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24257
24258             /* See if there is an element in the hints hash for this table */
24259             SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24260             const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24261
24262             if (pos) {
24263                 bool dummy;
24264                 SV * pu_definition;
24265                 SV * pu_invlist;
24266                 SV * expanded_prop_definition =
24267                             sv_2mortal(invlist_clone(prop_definition, NULL));
24268
24269                 /* If so, it's definition is the string from here to the next
24270                  * \a character.  And its format is the same as a user-defined
24271                  * property */
24272                 pos += SvCUR(pu_lookup);
24273                 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24274                 pu_invlist = handle_user_defined_property(lookup_name,
24275                                                           lookup_len,
24276                                                           0, /* Not UTF-8 */
24277                                                           0, /* Not folded */
24278                                                           runtime,
24279                                                           deferrable,
24280                                                           pu_definition,
24281                                                           &dummy,
24282                                                           msg,
24283                                                           level);
24284                 if (TAINT_get) {
24285                     if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24286                     sv_catpvs(msg, "Insecure private-use override");
24287                     goto append_name_to_msg;
24288                 }
24289
24290                 /* For now, as a safety measure, make sure that it doesn't
24291                  * override non-private use code points */
24292                 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24293
24294                 /* Add it to the list to be returned */
24295                 _invlist_union(prop_definition, pu_invlist,
24296                                &expanded_prop_definition);
24297                 prop_definition = expanded_prop_definition;
24298                 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24299             }
24300         }
24301     }
24302
24303     if (invert_return) {
24304         _invlist_invert(prop_definition);
24305     }
24306     return prop_definition;
24307
24308
24309   failed:
24310     if (non_pkg_begin != 0) {
24311         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24312         sv_catpvs(msg, "Illegal user-defined property name");
24313     }
24314     else {
24315         if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24316         sv_catpvs(msg, "Can't find Unicode property definition");
24317     }
24318     /* FALLTHROUGH */
24319
24320   append_name_to_msg:
24321     {
24322         const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
24323         const char * suffix = (runtime && level == 0) ?  "}" : "\"";
24324
24325         sv_catpv(msg, prefix);
24326         Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24327         sv_catpv(msg, suffix);
24328     }
24329
24330     return NULL;
24331
24332   definition_deferred:
24333
24334     {
24335         bool is_qualified = non_pkg_begin != 0;  /* If has "::" */
24336
24337         /* Here it could yet to be defined, so defer evaluation of this until
24338          * its needed at runtime.  We need the fully qualified property name to
24339          * avoid ambiguity */
24340         if (! fq_name) {
24341             fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24342                                                                 is_qualified);
24343         }
24344
24345         /* If it didn't come with a package, or the package is utf8::, this
24346          * actually could be an official Unicode property whose inclusion we
24347          * are deferring until runtime to make sure that it isn't overridden by
24348          * a user-defined property of the same name (which we haven't
24349          * encountered yet).  Add a marker to indicate this possibility, for
24350          * use at such time when we first need the definition during pattern
24351          * matching execution */
24352         if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
24353             sv_catpvs(fq_name, DEFERRED_PROP_EXPANSION_MARKERs);
24354         }
24355
24356         /* We also need a trailing newline */
24357         sv_catpvs(fq_name, "\n");
24358
24359         *user_defined_ptr = TRUE;
24360         return fq_name;
24361     }
24362 }
24363
24364 #endif
24365
24366 /*
24367  * ex: set ts=8 sts=4 sw=4 et:
24368  */